mirror of
https://github.com/richgel999/ufo_data.git
synced 2024-12-18 04:14:31 -05:00
1806 lines
58 KiB
C++
1806 lines
58 KiB
C++
// Copyright (C) 2023 Richard Geldreich, Jr.
|
|
#include "udb.h"
|
|
|
|
#include "udb_tables.h"
|
|
|
|
// Size of one record, in bytes. The packed fields of udb_rec below add up to this value.
constexpr uint32_t UDB_RECORD_SIZE = 112;

// Number of bytes in a record's text field (udb_rec::m_text).
constexpr uint32_t UDB_REC_TEXT_SIZE = 78;
|
|
|
|
// Record flag indices, as accepted by udb_rec::get_flag(). Each group of 8 consecutive
// indices maps to one flag byte (index >> 3), lowest bit first (index & 7).
enum
{
    // loc/obs flags
    cFlagMAP = 0,
    cFlagGND, cFlagCST, cFlagSEA, cFlagAIR, cFlagObsMIL, cFlagObsCIV, cFlagHQO,

    // misc flags
    cFlagSCI = 8,
    cFlagTLP, cFlagNWS, cFlagMID, cFlagHOX, cFlagCNT, cFlagODD, cFlagWAV,

    // type of ufo craft flags
    cFlagSCR = 16,
    cFlagCIG, cFlagDLT, cFlagNLT, cFlagPRB, cFlagFBL, cFlagSUB, cFlagNFO,

    // aliens monsters flags
    cFlagOID = 24,
    cFlagRBT, cFlagPSH, cFlagMIB, cFlagMON, cFlagGNT, cFlagFIG, cFlagNOC,

    // apparent ufo occupant activities flags
    cFlagOBS = 32,
    cFlagRAY, cFlagSMP, cFlagMST, cFlagABD, cFlagOPR, cFlagSIG, cFlagCVS,

    // places visited and things affected flags
    cFlagNUC = 40,
    cFlagDRT, cFlagVEG, cFlagANI, cFlagHUM, cFlagVEH, cFlagBLD, cFlagLND,

    // evidence and special effects flags
    cFlagPHT = 48,
    cFlagRDR, cFlagRDA, cFlagEME, cFlagTRC, cFlagTCH, cFlagHST, cFlagINJ,

    // misc details flags
    cFlagMIL = 56,
    cFlagBBK, cFlagGSA, cFlagOGA, cFlagSND, cFlagODR, cFlagCOV, cFlagCMF,

    cTotalFlags = 64
};
|
|
|
|
#pragma pack(push, 1)
|
|
struct udb_rec
|
|
{
|
|
private:
|
|
int16_t m_year;
|
|
|
|
uint8_t m_unknown_and_locale; // nibbles
|
|
uint8_t m_unknown_and_month; // nibbles
|
|
uint8_t m_ref_index_high_day; // 3 bits ref index high, low 5 bits day
|
|
|
|
uint8_t m_time;
|
|
uint8_t m_ymdt; // 2-bit fields: TDMY accuracy, T lowest, 0=invalid, 1=?, 2=~, 3=accurate
|
|
uint8_t m_duration;
|
|
uint8_t m_unknown1;
|
|
|
|
int16_t m_enc_longtitude;
|
|
int16_t m_enc_latitude;
|
|
|
|
int16_t m_elevation;
|
|
int16_t m_rel_altitude;
|
|
|
|
uint8_t m_unknown2;
|
|
uint8_t m_continent_country; // nibbles
|
|
|
|
uint8_t m_state_or_prov[3];
|
|
|
|
uint8_t m_unknown3;
|
|
|
|
#if 0
|
|
uint8_t m_loc_flags;
|
|
uint8_t m_misc_flags;
|
|
uint8_t m_type_of_ufo_craft_flags;
|
|
uint8_t m_aliens_monsters_flags;
|
|
uint8_t m_apparent_ufo_occupant_activities_flags;
|
|
uint8_t m_places_visited_and_things_affected_flags;
|
|
uint8_t m_evidence_and_special_effects_flags;
|
|
uint8_t m_miscellaneous_details_flags;
|
|
#else
|
|
uint8_t m_flags[8];
|
|
#endif
|
|
|
|
uint8_t m_text[UDB_REC_TEXT_SIZE];
|
|
|
|
uint8_t m_reference;
|
|
uint8_t m_ref_index;
|
|
uint8_t m_strangeness_credibility; // nibbles
|
|
|
|
public:
|
|
const uint8_t* get_text() const { return m_text; }
|
|
|
|
int get_year() const { return m_year; }
|
|
|
|
uint32_t get_month() const { return m_unknown_and_month & 0xF; }
|
|
uint32_t get_day() const { return m_ref_index_high_day & 31; }
|
|
|
|
// meters
|
|
int get_elevation() const { return m_elevation; }
|
|
int get_rel_altitude() const { return m_rel_altitude; }
|
|
|
|
uint32_t get_strangeness() const { return m_strangeness_credibility >> 4; }
|
|
uint32_t get_credibility() const { return m_strangeness_credibility & 0xF; }
|
|
|
|
uint32_t get_reference() const { return m_reference; }
|
|
uint32_t get_reference_index() const { return m_ref_index | ((m_ref_index_high_day >> 5) << 8); }
|
|
|
|
uint32_t get_continent_code() const { return m_continent_country >> 4; }
|
|
uint32_t get_country_code() const { return m_continent_country & 0xF; }
|
|
|
|
uint32_t get_locale() const { return m_unknown_and_locale & 0xF; }
|
|
|
|
std::string get_state_or_prov() const
|
|
{
|
|
const uint32_t c0 = m_state_or_prov[0];
|
|
const uint32_t c1 = m_state_or_prov[1];
|
|
const uint32_t c2 = m_state_or_prov[2];
|
|
|
|
return dos_to_utf8(string_format("%c%c%c", (c0 >= ' ') ? c0 : ' ', (c1 >= ' ') ? c1 : ' ', (c2 >= ' ') ? c2 : ' '));
|
|
}
|
|
|
|
double get_latitude() const { return ((double)m_enc_latitude / 200.0f) * 1.11111111111f; }
|
|
double get_longitude() const { return -((double)m_enc_longtitude / 200.0f) * 1.11111111111f; }
|
|
|
|
std::string get_latitude_dms() const { double lat = get_latitude(); return get_deg_to_dms(lat) + ((lat <= 0) ? " S" : " N"); }
|
|
std::string get_longitude_dms() const { double lon = get_longitude(); return get_deg_to_dms(lon) + ((lon <= 0) ? " W" : " E"); }
|
|
|
|
// minutes
|
|
uint32_t get_duration() const { return m_duration; }
|
|
|
|
enum
|
|
{
|
|
cAccuracyInvalid = 0,
|
|
cAccuracyQuestionable = 1,
|
|
cAccuracyApproximate = 2,
|
|
cAccuracyGood = 3
|
|
};
|
|
|
|
bool get_time(std::string& time) const
|
|
{
|
|
uint32_t time_accuracy = m_ymdt & 3;
|
|
|
|
if (time_accuracy == cAccuracyInvalid)
|
|
return false;
|
|
|
|
uint32_t hour = m_time / 6;
|
|
uint32_t minute = (m_time % 6) * 10;
|
|
|
|
if (hour > 23)
|
|
{
|
|
assert(0);
|
|
return false;
|
|
}
|
|
|
|
time = string_format("%02u:%02u", hour, minute);
|
|
if (time_accuracy == cAccuracyQuestionable)
|
|
time += "?";
|
|
else if (time_accuracy == cAccuracyApproximate)
|
|
time = "~" + time;
|
|
|
|
return true;
|
|
}
|
|
|
|
bool get_date(event_date& date) const
|
|
{
|
|
uint32_t year_accuracy = (m_ymdt >> 6) & 3;
|
|
uint32_t month_accuracy = (m_ymdt >> 4) & 3;
|
|
uint32_t day_accuracy = (m_ymdt >> 2) & 3;
|
|
|
|
int year = year_accuracy ? get_year() : 0;
|
|
uint32_t month = month_accuracy ? get_month() : 0;
|
|
uint32_t day = day_accuracy ? get_day() : 0;
|
|
|
|
if ((day < 1) || (day > 31))
|
|
{
|
|
day = 0;
|
|
day_accuracy = cAccuracyInvalid;
|
|
}
|
|
|
|
if ((month < 1) || (month > 12))
|
|
{
|
|
month = 0;
|
|
month_accuracy = cAccuracyInvalid;
|
|
}
|
|
|
|
if (!year)
|
|
return false;
|
|
|
|
uint32_t min_accuracy = year;
|
|
date.m_year = year;
|
|
|
|
if (month)
|
|
{
|
|
date.m_month = month;
|
|
|
|
if (!day)
|
|
{
|
|
min_accuracy = std::min(year_accuracy, month_accuracy);
|
|
}
|
|
else
|
|
{
|
|
min_accuracy = std::min(std::min(year_accuracy, month_accuracy), day_accuracy);
|
|
|
|
date.m_day = day;
|
|
}
|
|
}
|
|
|
|
if (min_accuracy == cAccuracyApproximate)
|
|
date.m_approx = true;
|
|
else if (min_accuracy == cAccuracyQuestionable)
|
|
date.m_fuzzy = true;
|
|
|
|
return true;
|
|
}
|
|
|
|
enum { cMaxFlags = 64 };
|
|
|
|
// LOC, MISC, TYPE, ALIENS/MONSTERS, ACTIVITIES, VISITED/THINGS, EVIDENCE/SPECIAL, MISC_DETAILS
|
|
bool get_flag(uint32_t index) const
|
|
{
|
|
assert(index < cMaxFlags);
|
|
return (m_flags[index >> 3] & (1 << (index & 7))) != 0;
|
|
}
|
|
|
|
#if 0
|
|
uint8_t get_loc_flags() const { return m_loc_flags; }
|
|
uint8_t get_misc_flags() const { return m_misc_flags; }
|
|
uint8_t get_type_of_ufo_craft_flags() const { return m_type_of_ufo_craft_flags; }
|
|
uint8_t get_aliens_monsters_flags() const { return m_aliens_monsters_flags; }
|
|
uint8_t get_apparent_ufo_occupant_activities_flags() const { return m_apparent_ufo_occupant_activities_flags; }
|
|
uint8_t get_places_visited_and_things_affected_flags() const { return m_places_visited_and_things_affected_flags; }
|
|
uint8_t get_evidence_and_special_effects_flags() const { return m_evidence_and_special_effects_flags; }
|
|
uint8_t get_miscellaneous_details_flags() const { return m_miscellaneous_details_flags; }
|
|
#endif
|
|
|
|
void get_geo(std::string& country_name, std::string& state_or_prov_name) const
|
|
{
|
|
std::string state_or_prov_str(get_state_or_prov());
|
|
string_trim_end(state_or_prov_str);
|
|
|
|
if (state_or_prov_str.back() == '.')
|
|
state_or_prov_str.pop_back();
|
|
|
|
if (state_or_prov_str.back() == '.')
|
|
state_or_prov_str.pop_back();
|
|
|
|
get_hatch_geo(get_continent_code(), get_country_code(), state_or_prov_str, country_name, state_or_prov_name);
|
|
|
|
if (state_or_prov_str == "UNK")
|
|
state_or_prov_name = "Unknown";
|
|
}
|
|
|
|
std::string get_full_refs() const
|
|
{
|
|
std::string ref(g_hatch_refs_tab[get_reference()]);
|
|
|
|
if (g_hatch_refs_tab[get_reference()])
|
|
{
|
|
uint32_t ref_index = get_reference_index();
|
|
|
|
if (get_reference() == 93)
|
|
{
|
|
for (const auto& x : g_hatch_refs_93)
|
|
if (x.m_ref == ref_index)
|
|
{
|
|
ref += x.m_pDesc;
|
|
break;
|
|
}
|
|
}
|
|
else if (get_reference() == 96)
|
|
{
|
|
for (const auto& x : g_hatch_refs_96)
|
|
if (x.m_ref == ref_index)
|
|
{
|
|
ref += x.m_pDesc;
|
|
break;
|
|
}
|
|
}
|
|
else if (get_reference() == 97)
|
|
{
|
|
for (const auto& x : g_hatch_refs_97)
|
|
if (x.m_ref == ref_index)
|
|
{
|
|
ref += x.m_pDesc;
|
|
break;
|
|
}
|
|
}
|
|
else if (get_reference() == 98)
|
|
{
|
|
for (const auto& x : g_hatch_refs_98)
|
|
if (x.m_ref == ref_index)
|
|
{
|
|
ref += x.m_pDesc;
|
|
break;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
ref += string_format(" (Index %u)", ref_index);
|
|
}
|
|
}
|
|
|
|
return ref;
|
|
}
|
|
};
|
|
#pragma pack(pop)
|
|
|
|
// Capitalization dictionary: fix_capitilization() looks up the lowercased form of an
// all-uppercase word here and, when found, substitutes the mapped value.
static std::unordered_map<std::string, std::string> g_dictionary;
|
|
|
|
// A single output token plus the flags fix_capitilization() consults for it.
struct token
{
    // The token's text.
    std::string m_token;

    // When false, fix_capitilization() returns the text without re-casing it.
    bool m_cap_check = false;

    // Set once the text has been replaced from the capitalization exception table;
    // such tokens are returned verbatim and are not matched against the table again.
    bool m_replaced_flag = false;

    token() = default;

    token(const std::string& text, bool cap_check, bool replaced_flag) :
        m_token(text),
        m_cap_check(cap_check),
        m_replaced_flag(replaced_flag)
    {
    }
};
|
|
|
|
// Lowercased all-uppercase words that fix_capitilization() did not find in g_dictionary.
std::unordered_set<std::string> g_unique_tokens;
|
|
// Tokenized form of each g_cap_exceptions entry (same indexing), filled by init_hatch_cap_exception_tokens().
std::vector<string_vec> g_hatch_exception_tokens;
|
|
|
|
static void init_hatch_cap_exception_tokens()
|
|
{
|
|
g_hatch_exception_tokens.resize(std::size(g_cap_exceptions));
|
|
|
|
std::string cur_etoken;
|
|
for (uint32_t e = 0; e < std::size(g_cap_exceptions); e++)
|
|
{
|
|
const std::string exception_str(g_cap_exceptions[e]);
|
|
|
|
string_vec& etokens = g_hatch_exception_tokens[e];
|
|
|
|
for (uint32_t i = 0; i < exception_str.size(); i++)
|
|
{
|
|
uint8_t c = exception_str[i];
|
|
|
|
if (c == ' ')
|
|
{
|
|
if (cur_etoken.size())
|
|
{
|
|
etokens.push_back(cur_etoken);
|
|
cur_etoken.clear();
|
|
}
|
|
}
|
|
else if (c == '-')
|
|
{
|
|
if (cur_etoken.size())
|
|
{
|
|
etokens.push_back(cur_etoken);
|
|
cur_etoken.clear();
|
|
}
|
|
|
|
std::string s;
|
|
s.push_back(c);
|
|
etokens.push_back(s);
|
|
}
|
|
else
|
|
{
|
|
cur_etoken.push_back(c);
|
|
}
|
|
}
|
|
|
|
if (cur_etoken.size())
|
|
{
|
|
etokens.push_back(cur_etoken);
|
|
|
|
cur_etoken.resize(0);
|
|
}
|
|
}
|
|
}
|
|
|
|
static std::string fix_capitilization(std::vector<token>& toks, uint32_t& tok_index)
|
|
{
|
|
if (toks[tok_index].m_replaced_flag)
|
|
return toks[tok_index].m_token;
|
|
|
|
const uint32_t toks_remaining = (uint32_t)toks.size() - tok_index;
|
|
|
|
// Peak ahead on the tokens to see if we need to correct any capitilization using the exception table.
|
|
for (uint32_t e = 0; e < std::size(g_cap_exceptions); e++)
|
|
{
|
|
const string_vec& etokens = g_hatch_exception_tokens[e];
|
|
|
|
if (toks_remaining >= etokens.size())
|
|
{
|
|
uint32_t i;
|
|
for (i = 0; i < etokens.size(); i++)
|
|
if ((string_icompare(etokens[i], toks[tok_index + i].m_token.c_str()) != 0) || toks[tok_index + i].m_replaced_flag)
|
|
break;
|
|
|
|
if (i == etokens.size())
|
|
{
|
|
for (i = 0; i < etokens.size(); i++)
|
|
{
|
|
toks[tok_index + i].m_token = etokens[i];
|
|
toks[tok_index + i].m_replaced_flag = true;
|
|
}
|
|
|
|
std::string res(toks[tok_index].m_token);
|
|
|
|
return res;
|
|
}
|
|
}
|
|
}
|
|
|
|
std::string str(toks[tok_index].m_token);
|
|
|
|
if (!toks[tok_index].m_cap_check)
|
|
return str;
|
|
|
|
string_vec wtokens;
|
|
std::string cur_wtoken;
|
|
|
|
for (uint32_t i = 0; i < str.size(); i++)
|
|
{
|
|
uint8_t c = str[i];
|
|
|
|
if (isalpha(c) || isdigit(c) || ((c == '\'') && (i != 0) && (i != str.size() - 1)))
|
|
{
|
|
cur_wtoken.push_back(c);
|
|
}
|
|
else
|
|
{
|
|
if (cur_wtoken.size())
|
|
{
|
|
wtokens.push_back(cur_wtoken);
|
|
cur_wtoken.clear();
|
|
}
|
|
|
|
std::string s;
|
|
s.push_back(c);
|
|
wtokens.push_back(s);
|
|
}
|
|
}
|
|
|
|
if (cur_wtoken.size())
|
|
{
|
|
wtokens.push_back(cur_wtoken);
|
|
cur_wtoken.clear();
|
|
}
|
|
|
|
for (uint32_t wtoken_index = 0; wtoken_index < wtokens.size(); wtoken_index++)
|
|
{
|
|
std::string& substr = wtokens[wtoken_index];
|
|
|
|
if (substr == "A")
|
|
substr = "a";
|
|
else if (substr.size() >= 2)
|
|
{
|
|
bool is_all_uppercase = true;
|
|
|
|
for (uint8_t c : substr)
|
|
{
|
|
if (!isupper(c) && (c != '\''))
|
|
{
|
|
is_all_uppercase = false;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (is_all_uppercase)
|
|
{
|
|
auto res = g_dictionary.find(string_lower(substr));
|
|
if (res != g_dictionary.end())
|
|
{
|
|
substr = res->second;
|
|
}
|
|
else
|
|
{
|
|
substr = string_lower(substr);
|
|
|
|
g_unique_tokens.insert(substr);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
std::string res;
|
|
for (uint32_t wtoken_index = 0; wtoken_index < wtokens.size(); wtoken_index++)
|
|
res += wtokens[wtoken_index];
|
|
|
|
return res;
|
|
}
|
|
|
|
// Lookup of g_hatch_abbreviations entries keyed by the lowercased abbreviation;
// filled by init_hatch_abbreviations_map().
static std::unordered_map<std::string, hatch_abbrev> g_hatch_abbreviations_map;
|
|
|
|
static void init_hatch_abbreviations_map()
|
|
{
|
|
for (uint32_t abbrev_index = 0; abbrev_index < std::size(g_hatch_abbreviations); abbrev_index++)
|
|
{
|
|
auto res = g_hatch_abbreviations_map.insert(std::make_pair(string_lower(g_hatch_abbreviations[abbrev_index].pAbbrev), g_hatch_abbreviations[abbrev_index]));
|
|
if (!res.second)
|
|
panic("Mutiple Hatch abbreviation: %s", res.first->first.c_str());
|
|
}
|
|
}
|
|
|
|
// Expand abbreviations
|
|
static void expand_abbreviations_internal(bool first_line, std::string orig_token, const string_vec& tokens, uint32_t cur_tokens_index, std::vector<token>& toks)
|
|
{
|
|
const uint32_t MAX_ABBREVS = 5;
|
|
|
|
uint32_t k;
|
|
for (k = 0; k < MAX_ABBREVS; k++)
|
|
{
|
|
std::string new_token(orig_token);
|
|
|
|
auto find_res = g_hatch_abbreviations_map.find(string_lower(orig_token));
|
|
if (find_res != g_hatch_abbreviations_map.end())
|
|
{
|
|
if (!first_line || !find_res->second.m_forbid_firstline)
|
|
{
|
|
new_token = find_res->second.pExpansion;
|
|
|
|
if (new_token.size())
|
|
toks.push_back(token(new_token, !first_line && (new_token == orig_token), false));
|
|
|
|
break;
|
|
}
|
|
}
|
|
|
|
if ((orig_token.size() >= 4) && (uisupper(orig_token[0])))
|
|
{
|
|
std::string month_suffix(orig_token);
|
|
month_suffix.erase(0, 3);
|
|
|
|
if ((month_suffix.size() <= 4) && string_is_digits(month_suffix))
|
|
{
|
|
std::string month_prefix(orig_token);
|
|
month_prefix.erase(3, month_prefix.size() - 3);
|
|
std::string search_prefix(string_upper(month_prefix));
|
|
|
|
static const char* g_hmonths[12] =
|
|
{
|
|
"JAN", "FEB", "MAR", "APR", "MAY", "JUN",
|
|
"JLY", "AUG", "SEP", "OCT", "NOV", "DEC"
|
|
};
|
|
|
|
uint32_t m;
|
|
for (m = 0; m < 12; m++)
|
|
if (search_prefix == g_hmonths[m])
|
|
break;
|
|
|
|
if (m < 12)
|
|
{
|
|
toks.push_back(token(g_months[m], !first_line, false));
|
|
|
|
// TODO: This can be improved by checking the # before the token
|
|
long long val = atoll(month_suffix.c_str());
|
|
if (val > 31)
|
|
month_suffix = '\'' + month_suffix;
|
|
|
|
toks.push_back(token(month_suffix, !first_line, false));
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
size_t p;
|
|
if ((p = orig_token.find_first_of('.')) == std::string::npos)
|
|
{
|
|
// No period(s) - we're done.
|
|
if (new_token.size())
|
|
toks.push_back(token(new_token, !first_line, false));
|
|
|
|
break;
|
|
}
|
|
|
|
// Specifically detect abbrev. first names like "A." etc. and expand them.
|
|
if (!first_line && (orig_token.size() > 4) && (p == 1) && uisupper(orig_token[0]) && uisupper(orig_token[2]))
|
|
{
|
|
std::string first_name(orig_token);
|
|
first_name.erase(2, first_name.size() - 2);
|
|
|
|
toks.push_back(token(first_name, false, false));
|
|
|
|
orig_token.erase(0, p + 1);
|
|
}
|
|
else
|
|
{
|
|
// Detect words starting with an abbreviation ending in "."
|
|
std::string prefix(orig_token);
|
|
|
|
prefix.erase(p + 1, prefix.size() - (p + 1));
|
|
|
|
find_res = g_hatch_abbreviations_map.find(string_lower(prefix));
|
|
|
|
if ((find_res != g_hatch_abbreviations_map.end()) && (!first_line || !find_res->second.m_forbid_firstline))
|
|
{
|
|
new_token = find_res->second.pExpansion;
|
|
|
|
toks.push_back(token(new_token, false, false));
|
|
|
|
orig_token.erase(0, p + 1);
|
|
}
|
|
else
|
|
{
|
|
if (new_token.size())
|
|
toks.push_back(token(new_token, !first_line, false));
|
|
|
|
break;
|
|
}
|
|
}
|
|
|
|
} // k
|
|
|
|
if (k == MAX_ABBREVS)
|
|
{
|
|
if (orig_token.size())
|
|
toks.push_back(token(orig_token, !first_line, false));
|
|
}
|
|
}
|
|
|
|
// Returns true if c is one of the sentence-terminating characters '!', '.' or '?'.
static bool is_sentence_ender(uint8_t c)
{
    switch (c)
    {
    case '!':
    case '.':
    case '?':
        return true;
    default:
        return false;
    }
}
|
|
|
|
static void expand_abbreviations(bool first_line, std::string orig_token, const string_vec& tokens, uint32_t cur_tokens_index, std::vector<token>& toks)
|
|
{
|
|
std::string new_token(orig_token);
|
|
|
|
// Temporarily remove " and ' prefix/suffix chars from the token, before the abbrev checks.
|
|
std::string prefix_char, suffix_char;
|
|
if (orig_token.size() >= 3)
|
|
{
|
|
if ((orig_token[0] == '\'') || (orig_token[0] == '\"'))
|
|
{
|
|
prefix_char.push_back(orig_token[0]);
|
|
orig_token.erase(0, 1);
|
|
new_token = orig_token;
|
|
}
|
|
|
|
if ((orig_token.back() == '\'') || (orig_token.back() == '\"'))
|
|
{
|
|
suffix_char.push_back(orig_token.back());
|
|
orig_token.pop_back();
|
|
new_token = orig_token;
|
|
}
|
|
}
|
|
|
|
const size_t first_tok = toks.size();
|
|
|
|
expand_abbreviations_internal(first_line, orig_token, tokens, cur_tokens_index, toks);
|
|
|
|
const size_t num_toks = toks.size() - first_tok;
|
|
assert(num_toks);
|
|
|
|
const size_t last_tok = first_tok + num_toks - 1;
|
|
|
|
if (prefix_char.size())
|
|
toks[first_tok].m_token = prefix_char + toks[first_tok].m_token;
|
|
|
|
if (suffix_char.size())
|
|
toks[last_tok].m_token = toks[last_tok].m_token + suffix_char;
|
|
|
|
}
|
|
|
|
static std::string decode_hatch(const std::string& str, bool first_line)
|
|
{
|
|
std::string res;
|
|
|
|
string_vec tokens;
|
|
std::string cur_token;
|
|
|
|
bool inside_space = false;
|
|
int prev_c = -1;
|
|
|
|
// Phase 1: Tokenize the input string based off examination of (mostly) individual chars, previous chars and upcoming individual chars.
|
|
for (uint32_t i = 0; i < str.size(); i++)
|
|
{
|
|
uint8_t c = str[i];
|
|
|
|
const bool is_two_dots = (c == '.') && ((i + 1) < str.size()) && (str[i + 1] == '.');
|
|
const bool is_one_equals = (c == '1') && ((i + 1) < str.size()) && (str[i + 1] == '=');
|
|
|
|
const bool prev_is_digit = i && uisdigit(str[i - 1]);
|
|
const bool next_is_plus = ((i + 1) < str.size()) && (str[i + 1] == '+');
|
|
|
|
//const bool has_prev = (i != 0);
|
|
//const bool has_next = (i + 1) < str.size();
|
|
|
|
if (c == ' ')
|
|
{
|
|
if (cur_token.size())
|
|
{
|
|
tokens.push_back(cur_token);
|
|
cur_token.clear();
|
|
}
|
|
|
|
inside_space = true;
|
|
}
|
|
else if (is_one_equals)
|
|
{
|
|
if (cur_token.size())
|
|
{
|
|
tokens.push_back(cur_token);
|
|
cur_token.clear();
|
|
}
|
|
|
|
tokens.push_back("1=");
|
|
i++;
|
|
|
|
inside_space = false;
|
|
}
|
|
else if (
|
|
(c == ';') || ((c >= 0x18) && (c <= 0x1b)) || (c == '<') || (c == '>') ||
|
|
(c == '=') ||
|
|
(c == '/') ||
|
|
(c == ',') ||
|
|
(c == '?') || (c == '!') ||
|
|
((!prev_is_digit || next_is_plus) && (c == '+')) ||
|
|
(c == '@') || (c == '-') ||
|
|
is_two_dots
|
|
)
|
|
{
|
|
if (cur_token.size())
|
|
{
|
|
tokens.push_back(cur_token);
|
|
cur_token.clear();
|
|
}
|
|
|
|
std::string s;
|
|
s.push_back(c);
|
|
|
|
if (is_two_dots)
|
|
{
|
|
s += ".";
|
|
i++;
|
|
}
|
|
|
|
tokens.push_back(s);
|
|
|
|
inside_space = false;
|
|
}
|
|
else
|
|
{
|
|
cur_token.push_back(c);
|
|
inside_space = false;
|
|
|
|
if ((c == 0xf8) || // code page 437 degree sym
|
|
(prev_is_digit && (c == '+') && !next_is_plus))
|
|
{
|
|
tokens.push_back(cur_token);
|
|
cur_token.clear();
|
|
}
|
|
}
|
|
|
|
prev_c = c;
|
|
}
|
|
|
|
if (cur_token.size())
|
|
tokens.push_back(cur_token);
|
|
|
|
// Phase 2: Exceptional fixups that change or split tokens up into multiple tokens.
|
|
string_vec new_tokens;
|
|
|
|
for (uint32_t i = 0; i < tokens.size(); i++)
|
|
{
|
|
std::string tok(tokens[i]);
|
|
|
|
// Convert "BBK#"
|
|
if (string_begins_with(tok, "BBK#") && (tok.size() > 4))
|
|
{
|
|
new_tokens.push_back("Project Bluebook Case #");
|
|
|
|
tok.erase(0, 4);
|
|
new_tokens.push_back(tok);
|
|
|
|
continue;
|
|
}
|
|
|
|
// Split "k'alt"
|
|
if (string_ends_in(tok, "k'alt"))
|
|
{
|
|
tok.erase(tok.size() - 3, 3);
|
|
new_tokens.push_back(tok);
|
|
|
|
new_tokens.push_back("Alt");
|
|
|
|
continue;
|
|
}
|
|
|
|
// Convert "HI+LO"
|
|
if ((i + 2 < tokens.size()) && (tokens[i] == "HI") && (tokens[i + 1] == "+") && (tokens[i + 2] == "LO"))
|
|
{
|
|
tokens.push_back("high and low");
|
|
i += 2;
|
|
continue;
|
|
}
|
|
|
|
// Don't split "4rth" to "4 rth" etc.
|
|
if ((string_icompare(tok, "4RTH") == 0) || (string_icompare(tok, "3rds") == 0) || (string_icompare(tok, "16th") == 0))
|
|
{
|
|
new_tokens.push_back(tok);
|
|
continue;
|
|
}
|
|
|
|
if (string_ends_in(tok, "Kmph"))
|
|
{
|
|
new_tokens.push_back(tok);
|
|
continue;
|
|
}
|
|
|
|
if (tok == "12Ocm")
|
|
{
|
|
new_tokens.push_back("120cm");
|
|
continue;
|
|
}
|
|
|
|
if (string_icompare(tok, "3OOM") == 0)
|
|
{
|
|
new_tokens.push_back("300m");
|
|
continue;
|
|
}
|
|
|
|
// If the first char isn't a digit then just continue now, because the rest of this code is concerned with splitting numbers away from words.
|
|
if (!isdigit(tok[0]))
|
|
{
|
|
new_tokens.push_back(tok);
|
|
continue;
|
|
}
|
|
|
|
if (tok.size() >= 3)
|
|
{
|
|
// Check for 1-7 digits then ' followed by 1- letters and split
|
|
uint32_t j;
|
|
for (j = 1; j < tok.size(); j++)
|
|
if (tok[j] == '\'')
|
|
break;
|
|
|
|
if ((j < tok.size()) && (j != tok.size() - 1) && (j <= 7))
|
|
{
|
|
uint32_t k;
|
|
for (k = 1; k < j; k++)
|
|
if (!uisdigit(tok[k]) && (utolower(tok[k]) != 'x') && (utolower(tok[k]) != 'k') && (tok[k] != '.'))
|
|
break;
|
|
|
|
if ((k == j) && (uisalpha(tok[j + 1])))
|
|
{
|
|
int sp = j + 1;
|
|
std::string new_tok(tok);
|
|
new_tok.erase(0, sp);
|
|
|
|
std::string n(tok);
|
|
|
|
n.erase(sp, n.size() - sp);
|
|
new_tokens.push_back(n);
|
|
|
|
new_tokens.push_back(new_tok);
|
|
|
|
continue;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Won't split digits away for tokens < 4 chars
|
|
if ((tok.size() < 4) || (tok == "6F6s"))
|
|
{
|
|
new_tokens.push_back(tok);
|
|
continue;
|
|
}
|
|
|
|
// Check for 1-2 digits and alpha and split
|
|
// TODO: support 3-4 digits
|
|
int split_point = -1;
|
|
if (uisalpha(tok[1]))
|
|
split_point = 1;
|
|
else if (uisdigit(tok[1]) && uisalpha(tok[2]) && uisalpha(tok[3]))
|
|
split_point = 2;
|
|
|
|
if (split_point > 0)
|
|
{
|
|
std::string new_tok(tok);
|
|
new_tok.erase(0, split_point);
|
|
|
|
// Don't split the number digits from some special cases, like hr, cm, mph, etc.
|
|
if ((string_icompare(new_tok, "hr") != 0) &&
|
|
(string_icompare(new_tok, "nd") != 0) &&
|
|
(string_icompare(new_tok, "kw") != 0) &&
|
|
(string_icompare(new_tok, "cm") != 0) &&
|
|
(string_icompare(new_tok, "km") != 0) &&
|
|
(string_icompare(new_tok, "mph") != 0) &&
|
|
(string_icompare(new_tok, "kph") != 0) &&
|
|
(!string_begins_with(new_tok, "K'")))
|
|
{
|
|
std::string n(tok);
|
|
|
|
n.erase(split_point, n.size() - split_point);
|
|
new_tokens.push_back(n);
|
|
|
|
if (new_tok == "min")
|
|
new_tok = "minute(s)";
|
|
|
|
new_tokens.push_back(new_tok);
|
|
}
|
|
else
|
|
{
|
|
new_tokens.push_back(tok);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
new_tokens.push_back(tok);
|
|
}
|
|
}
|
|
|
|
tokens.swap(new_tokens);
|
|
|
|
std::vector<token> toks;
|
|
|
|
// Phase 3: Compose new string, expanding abbreviations and tokens to one or more words, or combining together special sequences of tokens into specific phrases.
|
|
// Also try to carefully insert spaces into the output, as needed.
|
|
for (uint32_t i = 0; i < tokens.size(); i++)
|
|
{
|
|
const uint32_t num_tokens_left = ((uint32_t)tokens.size() - 1) - i;
|
|
const bool has_prev_token = i > 0, has_next_token = (i + 1) < tokens.size();
|
|
const bool next_token_is_slash = (has_next_token) && (tokens[i + 1][0] == '/');
|
|
|
|
bool is_next_dir = false;
|
|
if (has_next_token)
|
|
{
|
|
uint32_t ofs = 1;
|
|
if (tokens[i + 1] == ">")
|
|
{
|
|
ofs = 2;
|
|
}
|
|
|
|
if ((i + ofs) < tokens.size())
|
|
{
|
|
std::string next_tok = string_upper(tokens[i + ofs]);
|
|
|
|
if ((next_tok.back() == '.') && (next_tok.size() >= 2))
|
|
next_tok.pop_back();
|
|
|
|
if ((next_tok == "N") || (next_tok == "S") || (next_tok == "E") || (next_tok == "W") ||
|
|
(next_tok == "SW") || (next_tok == "SE") || (next_tok == "NW") || (next_tok == "NE") ||
|
|
(next_tok == "NNE") || (next_tok == "NNW") || (next_tok == "SSE") || (next_tok == "SSW") ||
|
|
(next_tok == "ESE"))
|
|
{
|
|
is_next_dir = true;
|
|
}
|
|
}
|
|
}
|
|
|
|
std::string orig_token(tokens[i]);
|
|
std::string new_token(orig_token);
|
|
|
|
if (!orig_token.size())
|
|
continue;
|
|
|
|
// Handle various exceptions before expending abbreviations
|
|
// TODO: Refactor to table(s)
|
|
|
|
// Special handling for RUSS/RUSS.
|
|
if ((tokens[i] == "RUSS") || (tokens[i] == "RUSS.") || (tokens[i] == "RUS") || (tokens[i] == "RUS."))
|
|
{
|
|
if (first_line)
|
|
new_token = "Russia";
|
|
else
|
|
new_token = "Russian";
|
|
}
|
|
// AA FLITE #519 - exception
|
|
// AA LINER
|
|
else if ((tokens[i] == "AA") && (num_tokens_left >= 1) && ((tokens[i + 1] == "FLITE#519") || (tokens[i + 1] == "LINER")))
|
|
{
|
|
new_token = "AA";
|
|
}
|
|
// bright Lt.
|
|
else if ((tokens[i] == "VBRITE") && (num_tokens_left >= 1) && (tokens[i + 1] == "LT"))
|
|
{
|
|
new_token = "vibrant bright light";
|
|
i++;
|
|
}
|
|
// ENERGY SRC
|
|
else if ((tokens[i] == "ENERGY") && (num_tokens_left >= 1) && (tokens[i + 1] == "SRC"))
|
|
{
|
|
new_token = "energy source";
|
|
i++;
|
|
}
|
|
// mid air - exception
|
|
else if ((tokens[i] == "MID") && (num_tokens_left >= 1) && (tokens[i + 1] == "AIR"))
|
|
{
|
|
new_token = "mid";
|
|
}
|
|
// /FORMN or /formation - exception
|
|
else if ((string_icompare(tokens[i], "/") == 0) && (num_tokens_left >= 1) && ((string_icompare(tokens[i + 1], "FORMN") == 0) || (string_icompare(tokens[i + 1], "formation") == 0)))
|
|
{
|
|
new_token = "in formation";
|
|
i++;
|
|
}
|
|
// /FORMNs - exception
|
|
else if ((string_icompare(tokens[i], "/") == 0) && (num_tokens_left >= 1) && ((string_icompare(tokens[i + 1], "FORMNs") == 0) || (string_icompare(tokens[i + 1], "formations") == 0)))
|
|
{
|
|
new_token = "in formations";
|
|
i++;
|
|
}
|
|
// LOST/CLOUDS - exception
|
|
else if ((string_icompare(tokens[i], "LOST") == 0) && (num_tokens_left >= 2) && (tokens[i + 1] == "/") && (string_icompare(tokens[i + 2], "CLOUDS") == 0))
|
|
{
|
|
new_token = "lost in clouds";
|
|
i += 2;
|
|
}
|
|
// LOST/DISTANCE - exception
|
|
else if ((string_icompare(tokens[i], "LOST") == 0) && (num_tokens_left >= 2) && (tokens[i + 1] == "/") && (string_icompare(tokens[i + 2], "DISTANCE") == 0))
|
|
{
|
|
new_token = "lost in the distance";
|
|
i += 2;
|
|
}
|
|
// W-carbide - exception
|
|
else if ((string_icompare(tokens[i], "W") == 0) && (num_tokens_left >= 2) && (tokens[i + 1] == "-") && (string_icompare(tokens[i + 2], "carbide") == 0))
|
|
{
|
|
new_token = "W";
|
|
}
|
|
// S-SHAPE - exception
|
|
else if ((tokens[i] == "S") && (num_tokens_left >= 2) && (tokens[i + 1] == "-") && (tokens[i + 2] == "SHAPE"))
|
|
{
|
|
new_token = "S";
|
|
}
|
|
// mid-sky - exception
|
|
else if ((tokens[i] == "MID") && (num_tokens_left >= 2) && (tokens[i + 1] == "-") && (tokens[i + 2] == "SKY"))
|
|
{
|
|
new_token = "mid";
|
|
}
|
|
// mid-flite - exception
|
|
else if ((tokens[i] == "MID") && (num_tokens_left >= 2) && (tokens[i + 1] == "-") && (tokens[i + 2] == "FLITE"))
|
|
{
|
|
new_token = "mid";
|
|
}
|
|
// mid-city - exception
|
|
else if ((tokens[i] == "MID") && (num_tokens_left >= 2) && (tokens[i + 1] == "-") && (tokens[i + 2] == "CITY"))
|
|
{
|
|
new_token = "mid";
|
|
}
|
|
// W vee - exception
|
|
else if ((tokens[i] == "W") && (num_tokens_left >= 1) && (tokens[i + 1] == "VEE"))
|
|
{
|
|
new_token = "with vee";
|
|
i++;
|
|
}
|
|
// Lake Mi - exception
|
|
else if ((tokens[i] == "LAKE") && (num_tokens_left >= 1) && (tokens[i + 1] == "Mi"))
|
|
{
|
|
new_token = "Lake Michigan";
|
|
i++;
|
|
}
|
|
// SCI-FI
|
|
else if ((tokens[i] == "SCI") && (num_tokens_left >= 2) && (tokens[i + 1] == "-") && (tokens[i + 2] == "FI"))
|
|
{
|
|
new_token = "Sci-Fi";
|
|
i += 2;
|
|
}
|
|
// V-tall
|
|
else if ((tokens[i] == "V") && (num_tokens_left >= 2) && (tokens[i + 1] == "-") && (tokens[i + 2] == "TALL"))
|
|
{
|
|
new_token = "very tall";
|
|
i += 2;
|
|
}
|
|
// 1 OBS/1 OBS. at beginning
|
|
else if ((i == 1) && (tokens[0] == "1") && (tokens[1] == "OBS" || tokens[1] == "OBS."))
|
|
{
|
|
new_token = "observer";
|
|
}
|
|
// CLR WEATHER exception
|
|
else if ((num_tokens_left >= 1) && (tokens[i] == "CLR") && (tokens[i + 1] == "WEATHER"))
|
|
{
|
|
new_token = "clear";
|
|
}
|
|
// WATER DOMES exception (typo fix)
|
|
else if ((num_tokens_left >= 1) && (string_icompare(tokens[i], "WATER") == 0) && (string_icompare(tokens[i + 1], "DOMES") == 0))
|
|
{
|
|
new_token = "water comes";
|
|
i++;
|
|
}
|
|
// W dome exception
|
|
else if ((num_tokens_left >= 1) && (string_icompare(tokens[i], "W") == 0) && (string_icompare(tokens[i + 1], "DOME") == 0))
|
|
{
|
|
new_token = "with";
|
|
}
|
|
// CLR SKY exception
|
|
else if ((num_tokens_left >= 1) && (string_icompare(tokens[i], "CLR") == 0) && (string_icompare(tokens[i + 1], "SKY") == 0))
|
|
{
|
|
new_token = "clear";
|
|
}
|
|
// CLR DOME exception
|
|
else if ((num_tokens_left >= 1) && (string_icompare(tokens[i], "CLR") == 0) && (string_icompare(tokens[i + 1], "DOME") == 0))
|
|
{
|
|
new_token = "clear";
|
|
}
|
|
// CLR DOMED exception
|
|
else if ((num_tokens_left >= 2) && (string_icompare(tokens[i], "CLR") == 0) && (tokens[i + 1] == "-") && (string_icompare(tokens[i + 2], "DOMED") == 0))
|
|
{
|
|
new_token = "clear";
|
|
}
|
|
// CLR DOME exception
|
|
else if ((num_tokens_left >= 2) && (string_icompare(tokens[i], "CLR") == 0) && (tokens[i + 1] == "-") && (string_icompare(tokens[i + 2], "DOME") == 0))
|
|
{
|
|
new_token = "clear";
|
|
}
|
|
// CLR RDR exception
|
|
else if ((num_tokens_left >= 1) && (string_icompare(tokens[i], "CLR") == 0) && (string_icompare(tokens[i + 1], "RDR") == 0))
|
|
{
|
|
new_token = "clear";
|
|
}
|
|
// CLR CLOCKPIT exception
|
|
else if ((num_tokens_left >= 1) && (string_icompare(tokens[i], "CLR") == 0) && (string_icompare(tokens[i + 1], "COCKPIT") == 0))
|
|
{
|
|
new_token = "clear";
|
|
}
|
|
// CLR TORUS exception
|
|
else if ((num_tokens_left >= 1) && (string_icompare(tokens[i], "CLR") == 0) && (string_icompare(tokens[i + 1], "TORUS") == 0))
|
|
{
|
|
new_token = "clear";
|
|
}
|
|
// CLR DAY exception
|
|
else if ((num_tokens_left >= 1) && (string_icompare(tokens[i], "CLR") == 0) && (string_icompare(tokens[i + 1], "DAY") == 0))
|
|
{
|
|
new_token = "clear";
|
|
}
|
|
// CLR PLASTIC exception
|
|
else if ((num_tokens_left >= 1) && (string_icompare(tokens[i], "CLR") == 0) && (string_icompare(tokens[i + 1], "PLASTIC") == 0))
|
|
{
|
|
new_token = "clear";
|
|
}
|
|
// CLR FOTOS exception (a guess, need to verify)
|
|
else if ((num_tokens_left >= 1) && (string_icompare(tokens[i], "CLR") == 0) && (string_icompare(tokens[i + 1], "FOTOS") == 0))
|
|
{
|
|
new_token = "clear";
|
|
}
|
|
// CLR FOTO exception (a guess, need to verify)
|
|
else if ((num_tokens_left >= 1) && (string_icompare(tokens[i], "CLR") == 0) && (string_icompare(tokens[i + 1], "FOTO") == 0))
|
|
{
|
|
new_token = "clear";
|
|
}
|
|
// CLR SHOT exception (a guess, need to verify)
|
|
else if ((num_tokens_left >= 1) && (string_icompare(tokens[i], "CLR") == 0) && (string_icompare(tokens[i + 1], "SHOT") == 0))
|
|
{
|
|
new_token = "clear";
|
|
}
|
|
// CLR BLUE exception
|
|
else if ((num_tokens_left >= 1) && (string_icompare(tokens[i], "CLR") == 0) && (string_icompare(tokens[i + 1], "BLUE") == 0))
|
|
{
|
|
new_token = "clear";
|
|
}
|
|
// CLR BUBBLE exception
|
|
else if ((num_tokens_left >= 1) && (string_icompare(tokens[i], "CLR") == 0) && (string_icompare(tokens[i + 1], "BUBBLE") == 0))
|
|
{
|
|
new_token = "clear";
|
|
}
|
|
// CLR BUBBLES exception
|
|
else if ((num_tokens_left >= 1) && (string_icompare(tokens[i], "CLR") == 0) && (string_icompare(tokens[i + 1], "BUBBLES") == 0))
|
|
{
|
|
new_token = "clear";
|
|
}
|
|
// S+Cu exception
|
|
else if ((num_tokens_left >= 2) && (tokens[i] == "S") && (tokens[i + 1] == "+") && (tokens[i + 2] == "Cu"))
|
|
{
|
|
new_token = "S";
|
|
}
|
|
// IND OBS exception
|
|
else if ((num_tokens_left >= 1) && (tokens[i] == "IND") && (tokens[i + 1] == "OBS"))
|
|
{
|
|
new_token = "independent";
|
|
}
|
|
// L<>R
|
|
else if ((num_tokens_left >= 3) && (tokens[i] == "L") && (tokens[i + 1] == "<") && (tokens[i + 2] == ">") && (tokens[i + 3] == "R"))
|
|
{
|
|
new_token = "left and right";
|
|
i += 3;
|
|
}
|
|
// <+>
|
|
else if ((num_tokens_left >= 2) && (tokens[i] == "<") && (tokens[i + 1] == "+") && (tokens[i + 2] == ">"))
|
|
{
|
|
new_token = "left and right";
|
|
i += 2;
|
|
}
|
|
else if (orig_token == "NFD")
|
|
{
|
|
if ((!has_next_token) || next_token_is_slash)
|
|
new_token = "No further details";
|
|
else
|
|
new_token = "No further details [in]";
|
|
}
|
|
// Up and down arrows
|
|
else if ((orig_token[0] == 0x18) &&
|
|
((i + 1) < tokens.size()) && (tokens[i + 1][0] == '+') &&
|
|
((i + 2) < tokens.size()) && (tokens[i + 2][0] == 0x19))
|
|
{
|
|
const uint32_t at_end = ((i + 3) == tokens.size()) || (tokens[i + 3][0] == '/');
|
|
new_token = !at_end ? "going up and down [to]" : "going up and down";
|
|
i += 2;
|
|
}
|
|
// "V BRITE"
|
|
else if ((orig_token == "V") && ((i + 1) < tokens.size()) && (tokens[i + 1] == "BRITE"))
|
|
{
|
|
new_token = "very bright";
|
|
i++;
|
|
}
|
|
// ++
|
|
else if ((orig_token == "+") && ((i + 1) < tokens.size()) && (tokens[i + 1] == "+"))
|
|
{
|
|
new_token = "and more/others";
|
|
i++;
|
|
}
|
|
// >>
|
|
else if ((orig_token == ">") && ((i + 1) < tokens.size()) && (tokens[i + 1] == ">"))
|
|
{
|
|
const uint32_t at_end = ((i + 2) == tokens.size()) || (tokens[i + 2][0] == '/');
|
|
new_token = (!at_end && !is_next_dir) ? "going quickly [to]" : "going quickly";
|
|
i++;
|
|
}
|
|
// ><
|
|
else if ((orig_token == ">") && ((i + 1) < tokens.size()) && (tokens[i + 1] == "<"))
|
|
{
|
|
new_token = "to/from";
|
|
i++;
|
|
}
|
|
// <>
|
|
else if ((orig_token == "<") && ((i + 1) < tokens.size()) && (tokens[i + 1] == ">"))
|
|
{
|
|
// Larry said "between" but that sounds awkward and would require reordering tokens.
|
|
new_token = "to/from/between";
|
|
i++;
|
|
}
|
|
// >
|
|
else if (orig_token == ">")
|
|
{
|
|
new_token = (has_next_token && !next_token_is_slash && !is_next_dir) ? "going [to]" : "going";
|
|
}
|
|
// Three up arrows
|
|
else if ((orig_token[0] == 0x18) && (num_tokens_left >= 2) && (tokens[i + 1][0] == 0x18) && (tokens[i + 2][0] == 0x18))
|
|
{
|
|
const uint32_t at_end = ((i + 3) == tokens.size()) || (tokens[i + 3][0] == '/');
|
|
new_token = !at_end ? "extremely quickly going up [to]" : "extremely quickly going up";
|
|
i += 2;
|
|
}
|
|
// Two up arrows
|
|
else if ((orig_token[0] == 0x18) && ((i + 1) < tokens.size()) && (tokens[i + 1][0] == 0x18))
|
|
{
|
|
const uint32_t at_end = ((i + 2) == tokens.size()) || (tokens[i + 2][0] == '/');
|
|
new_token = !at_end ? "quickly going up [to]" : "quickly going up";
|
|
i++;
|
|
}
|
|
// Up arrow
|
|
else if (orig_token[0] == 0x18)
|
|
{
|
|
new_token = (has_next_token && !next_token_is_slash) ? "going up [to]" : "going up";
|
|
}
|
|
// Two down arrows
|
|
else if ((orig_token[0] == 0x19) && ((i + 1) < tokens.size()) && (tokens[i + 1][0] == 0x19))
|
|
{
|
|
const uint32_t at_end = ((i + 2) == tokens.size()) || (tokens[i + 2][0] == '/');
|
|
new_token = !at_end ? "quickly going down [to]" : "quickly going down";
|
|
i++;
|
|
}
|
|
// Down arrow
|
|
else if (orig_token[0] == 0x19)
|
|
{
|
|
new_token = (has_next_token && !next_token_is_slash) ? "going down [to]" : "going down";
|
|
}
|
|
// Two right arrows
|
|
else if ((orig_token[0] == 0x1A) && ((i + 1) < tokens.size()) && (tokens[i + 1][0] == 0x1A))
|
|
{
|
|
const uint32_t at_end = ((i + 2) == tokens.size()) || (tokens[i + 2][0] == '/');
|
|
new_token = !at_end ? "quickly going right [to]" : "quickly going right";
|
|
i++;
|
|
}
|
|
// Right arrow
|
|
else if (orig_token[0] == 0x1A)
|
|
{
|
|
new_token = (has_next_token && !next_token_is_slash) ? "going right [to]" : "going right";
|
|
}
|
|
// Two left arrows
|
|
else if ((orig_token[0] == 0x1B) && ((i + 1) < tokens.size()) && (tokens[i + 1][0] == 0x1B))
|
|
{
|
|
const uint32_t at_end = ((i + 2) == tokens.size()) || (tokens[i + 2][0] == '/');
|
|
new_token = !at_end ? "quickly going left [to]" : "quickly going left";
|
|
i++;
|
|
}
|
|
// Left arrow
|
|
else if (orig_token[0] == 0x1B)
|
|
{
|
|
new_token = (has_next_token && !next_token_is_slash) ? "going left [to]" : "going left";
|
|
}
|
|
// /
|
|
else if (orig_token[0] == '/')
|
|
{
|
|
new_token = "/";
|
|
}
|
|
// +
|
|
else if (orig_token[0] == '+')
|
|
{
|
|
if (!i)
|
|
new_token = "also";
|
|
else if ((i != (tokens.size() - 1)) && (tokens[i + 1][0] != '/'))
|
|
new_token = "and";
|
|
else
|
|
new_token = "and more";
|
|
}
|
|
// @
|
|
else if (orig_token[0] == '@')
|
|
{
|
|
new_token = "at";
|
|
}
|
|
// dbl-word
|
|
else if ((string_icompare(orig_token, "dbl") == 0) && ((i + 1) < tokens.size()) && (tokens[i + 1] == "-"))
|
|
{
|
|
new_token = "double";
|
|
}
|
|
// GLOW-word
|
|
else if ((string_icompare(orig_token, "GLOW") == 0) && ((i + 1) < tokens.size()) && (tokens[i + 1] == "-"))
|
|
{
|
|
new_token = "glowing";
|
|
}
|
|
// A-test
|
|
else if ((orig_token == "A") && ((i + 1) < tokens.size()) && (tokens[i + 1] == "-") &&
|
|
((i + 2) < tokens.size()) && (string_icompare(tokens[i + 2], "TEST") == 0))
|
|
{
|
|
new_token = "atomic test";
|
|
i += 2;
|
|
}
|
|
// A-plant
|
|
else if ((orig_token == "A") && ((i + 1) < tokens.size()) && (tokens[i + 1] == "-") &&
|
|
((i + 2) < tokens.size()) && (string_icompare(tokens[i + 2], "PLANT") == 0))
|
|
{
|
|
new_token = "atomic plant";
|
|
i += 2;
|
|
}
|
|
// V-form
|
|
else if ((orig_token == "V") && ((i + 1) < tokens.size()) && (tokens[i + 1] == "-") &&
|
|
((i + 2) < tokens.size()) && (string_icompare(tokens[i + 2], "FORM") == 0))
|
|
{
|
|
new_token = "V-formation";
|
|
i += 2;
|
|
}
|
|
// 1/2 (to fix spacing issues)
|
|
else if ((orig_token == "1") && ((i + 1) < tokens.size()) && (tokens[i + 1] == "/") &&
|
|
((i + 2) < tokens.size()) && (tokens[i + 2] == "2"))
|
|
{
|
|
new_token = "1/2";
|
|
i += 2;
|
|
}
|
|
// "W/O"
|
|
else if ((i) &&
|
|
(string_icompare(orig_token, "W") == 0) &&
|
|
((i + 1) < tokens.size()) && (tokens[i + 1] == "/") &&
|
|
((i + 2) < tokens.size()) && (string_icompare(tokens[i + 2], "O") == 0))
|
|
{
|
|
new_token = "without";
|
|
i += 2;
|
|
}
|
|
// "S/L"
|
|
else if ((orig_token == "S") &&
|
|
((i + 1) < tokens.size()) && (tokens[i + 1] == "/") &&
|
|
((i + 2) < tokens.size()) && (tokens[i + 2] == "L"))
|
|
{
|
|
// No idea what this means yet.
|
|
new_token = "straight and level";
|
|
i += 2;
|
|
}
|
|
// "FOO-FIGHTERS"
|
|
else if ((orig_token == "FOO") &&
|
|
((i + 1) < tokens.size()) && (tokens[i + 1] == "-") &&
|
|
((i + 2) < tokens.size()) && (tokens[i + 2] == "FIGHTERS"))
|
|
{
|
|
// Just don't let the abbreviator kick in. Thanks Larry.
|
|
}
|
|
// "W/word"
|
|
else if ((i) &&
|
|
((orig_token == "W") || (orig_token == "w")) &&
|
|
((i + 1) < tokens.size()) && (tokens[i + 1] == "/") &&
|
|
(tokens[i - 1] != ">") &&
|
|
(tokens[i - 1] != "<"))
|
|
{
|
|
new_token = "with";
|
|
i++;
|
|
}
|
|
// "1="
|
|
else if (orig_token == "1=")
|
|
{
|
|
new_token = "one is [a]";
|
|
}
|
|
// Exception for "ORG RPT".
|
|
else if ((orig_token == "ORG") && has_next_token && (tokens[i + 1] == "RPT"))
|
|
{
|
|
new_token = "original";
|
|
}
|
|
// TODO: check for line 1 and don't expand these states
|
|
// Exception for ,MI (the state) - don't pass it to expand_abbreviations()
|
|
else if (first_line && orig_token == "MI" && has_prev_token && tokens[i - 1] == ",")
|
|
{
|
|
}
|
|
// Exception for ,MT (the state) - don't change to "Mt."
|
|
else if (first_line && orig_token == "MT" && has_prev_token && tokens[i - 1] == ",")
|
|
{
|
|
}
|
|
// Exception for ,NE (the state) - don't change to "northeast"
|
|
else if (first_line && orig_token == "NE" && has_prev_token && tokens[i - 1] == ",")
|
|
{
|
|
}
|
|
// Exception for ,MS (the state) - don't pass it to expand_abbreviations()
|
|
else if (first_line && orig_token == "MS" && has_prev_token && tokens[i - 1] == ",")
|
|
{
|
|
}
|
|
// Exception for ,AL (the state) - don't pass it to expand_abbreviations()
|
|
else if (first_line && orig_token == "AL" && has_prev_token && tokens[i - 1] == ",")
|
|
{
|
|
}
|
|
else
|
|
{
|
|
expand_abbreviations(first_line, orig_token, tokens, i, toks);
|
|
continue;
|
|
}
|
|
|
|
if (new_token.size())
|
|
toks.push_back(token(new_token, !first_line && (new_token == tokens[i]), false));
|
|
}
|
|
|
|
// Phase 4: Compose the final string, converting tokens to lower/uppercase and inserting spaces as needed.
|
|
std::string new_str;
|
|
|
|
bool in_quote = false;
|
|
|
|
for (uint32_t i = 0; i < toks.size(); i++)
|
|
{
|
|
std::string new_token(toks[i].m_token);
|
|
if (!new_token.size())
|
|
continue;
|
|
|
|
if (!first_line)
|
|
new_token = fix_capitilization(toks, i);
|
|
|
|
// Add a space if the previous string is not empty - excluding special cases where a space isn't necessary.
|
|
if (new_str.size() &&
|
|
(new_token != "..") &&
|
|
(new_token != ",") &&
|
|
(new_token != "!") && (new_token != "?") &&
|
|
(new_token != "+") &&
|
|
(!((new_token == ")") && (new_str.back() == '?'))) &&
|
|
(new_token != ";") && (new_str.back() != ';') &&
|
|
(new_token != "-") && (new_str.back() != '-') &&
|
|
(new_str.back() != '#') &&
|
|
(new_str.back() != '+') &&
|
|
(!(in_quote && (new_token == "\"") && new_str.size() && is_sentence_ender(new_str.back())))
|
|
)
|
|
{
|
|
new_str.push_back(' ');
|
|
//new_str.push_back('*');
|
|
}
|
|
|
|
// Append the token string to the output string
|
|
new_str += new_token;
|
|
|
|
for (uint8_t c : new_token)
|
|
if (c == '\"')
|
|
in_quote = !in_quote;
|
|
}
|
|
|
|
return new_str;
|
|
}
|
|
|
|
static void decode_hatch_desc(const udb_rec* pRec, std::string& db_str, std::string& loc_str, std::string& desc_str)
|
|
{
|
|
for (uint32_t i = 0; i < UDB_REC_TEXT_SIZE; i++)
|
|
{
|
|
if (pRec->get_text()[i] == 0)
|
|
break;
|
|
db_str.push_back(pRec->get_text()[i]);
|
|
}
|
|
|
|
std::string orig_desc(db_str);
|
|
string_vec desc;
|
|
for (; ; )
|
|
{
|
|
size_t pos = orig_desc.find_first_of(':');
|
|
if (pos == std::string::npos)
|
|
{
|
|
desc.push_back(string_trim(orig_desc));
|
|
break;
|
|
}
|
|
else
|
|
{
|
|
std::string s(orig_desc);
|
|
s.erase(pos, s.size() - pos);
|
|
desc.push_back(string_trim(s));
|
|
|
|
orig_desc.erase(0, pos + 1);
|
|
}
|
|
}
|
|
|
|
for (uint32_t i = 0; i < desc.size(); i++)
|
|
{
|
|
std::string str(decode_hatch(desc[i], !i));
|
|
if (!str.size())
|
|
continue;
|
|
|
|
if (desc_str.size())
|
|
{
|
|
if (desc_str.back() != '.' && desc_str.back() != '!' && desc_str.back() != '?')
|
|
desc_str += ".";
|
|
|
|
desc_str += " ";
|
|
}
|
|
|
|
if (!i)
|
|
{
|
|
loc_str = string_upper(str);
|
|
}
|
|
else
|
|
{
|
|
if (uislower(str[0]))
|
|
str[0] = utoupper(str[0]);
|
|
else if ((str[0] == '\"') && (str.size() >= 2) && (uislower(str[1])))
|
|
str[1] = utoupper(str[1]);
|
|
else if ((str[0] == '\'') && (str.size() >= 2) && (uislower(str[1])))
|
|
str[1] = utoupper(str[1]);
|
|
else if ((str[0] == '(') && (str.size() >= 2) && (uislower(str[1])))
|
|
str[1] = utoupper(str[1]);
|
|
|
|
desc_str += str;
|
|
}
|
|
}
|
|
|
|
if (desc_str.size() && desc_str.back() != '.' && desc_str.back() != '!' && desc_str.back() != '?')
|
|
{
|
|
if ((desc_str.back() == ')') && (!string_ends_in(desc_str, "(s)")))
|
|
{
|
|
desc_str.pop_back();
|
|
if (desc_str.back() == ' ')
|
|
desc_str.pop_back();
|
|
|
|
if (desc_str.size() && desc_str.back() != '.' && desc_str.back() != '!' && desc_str.back() != '?')
|
|
desc_str += ".";
|
|
|
|
desc_str += ")";
|
|
}
|
|
else
|
|
{
|
|
desc_str += ".";
|
|
}
|
|
}
|
|
|
|
db_str = dos_to_utf8(db_str);
|
|
loc_str = dos_to_utf8(loc_str);
|
|
desc_str = dos_to_utf8(desc_str);
|
|
}
|
|
|
|
template<typename T>
|
|
static void check_for_hatch_tab_dups(const T& tab)
|
|
{
|
|
std::unordered_set<int> ids;
|
|
for (const auto& x : tab)
|
|
if (!ids.insert(x.m_ref).second)
|
|
panic("Duplicate hatch ref table id");
|
|
}
|
|
|
|
static void init_dict()
|
|
{
|
|
string_vec dict;
|
|
|
|
uprintf("Reading dictionary\n");
|
|
bool utf8_flag = false;
|
|
if (!read_text_file("uppercase_dict.txt", dict, true, &utf8_flag))
|
|
panic("Failed reading uppercase_dict.txt");
|
|
|
|
for (auto str : dict)
|
|
{
|
|
string_trim(str);
|
|
if (str.size() && uisupper(str[0]))
|
|
{
|
|
g_dictionary.insert(std::make_pair(string_lower(str), str));
|
|
}
|
|
}
|
|
|
|
uprintf("Done reading dictionary, %u uppercase words\n", g_dictionary.size());
|
|
}
|
|
|
|
void udb_init()
|
|
{
|
|
assert(sizeof(udb_rec) == UDB_RECORD_SIZE);
|
|
|
|
check_for_hatch_tab_dups(g_hatch_refs);
|
|
check_for_hatch_tab_dups(g_hatch_refs_93);
|
|
check_for_hatch_tab_dups(g_hatch_refs_96);
|
|
check_for_hatch_tab_dups(g_hatch_refs_97);
|
|
check_for_hatch_tab_dups(g_hatch_refs_98);
|
|
|
|
for (uint32_t i = 0; i < std::size(g_hatch_refs); i++)
|
|
g_hatch_refs_tab[g_hatch_refs[i].m_ref] = g_hatch_refs[i].m_pDesc;
|
|
|
|
init_hatch_abbreviations_map();
|
|
init_hatch_cap_exception_tokens();
|
|
init_dict();
|
|
}
|
|
|
|
bool udb_dump()
|
|
{
|
|
uint8_vec udb;
|
|
if (!read_binary_file("u.rnd", udb))
|
|
return false;
|
|
|
|
const uint32_t TOTAL_RECS = 18123;
|
|
if ((udb.size() / UDB_RECORD_SIZE) < TOTAL_RECS)
|
|
panic("Invalid file size");
|
|
|
|
string_vec output;
|
|
|
|
const udb_rec* pRecs = reinterpret_cast<const udb_rec*>(&udb.front());
|
|
for (uint32_t rec_index = 1; rec_index < TOTAL_RECS; rec_index++)
|
|
//for (uint32_t rec_index = 18038; rec_index <= 18038; rec_index++)
|
|
{
|
|
const udb_rec* pRec = pRecs + rec_index;
|
|
|
|
std::string db_str, loc_str, desc_str;
|
|
decode_hatch_desc(pRec, db_str, loc_str, desc_str);
|
|
|
|
event_date ed;
|
|
pRec->get_date(ed);
|
|
std::string date_str(ed.get_string());
|
|
|
|
{
|
|
uprintf("\n----------%u: Date: %s, Strangeness: %u, Credibility: %u\n", rec_index, date_str.c_str(), pRec->get_strangeness(), pRec->get_credibility());
|
|
std::string time;
|
|
if (pRec->get_time(time))
|
|
uprintf("Time: %s\n", time.c_str());
|
|
|
|
if (pRec->get_duration())
|
|
uprintf("Duration: %u mins\n", pRec->get_duration());
|
|
|
|
if (pRec->get_elevation() != -99)
|
|
uprintf("Elevation: %im\n", pRec->get_elevation());
|
|
|
|
if ((pRec->get_rel_altitude() != 0) && (pRec->get_rel_altitude() != 999))
|
|
uprintf("Altitude: %im\n", pRec->get_rel_altitude());
|
|
|
|
uprintf("Location: %s\n", loc_str.c_str());
|
|
|
|
std::string country_name, state_or_prov_name;
|
|
pRec->get_geo(country_name, state_or_prov_name);
|
|
|
|
const uint32_t continent_code = pRec->get_continent_code();
|
|
|
|
uprintf("Country: %s, State/Province: %s (%s), Continent: %s\n", country_name.c_str(), state_or_prov_name.c_str(), pRec->get_state_or_prov().c_str(),
|
|
(continent_code < std::size(g_hatch_continents)) ? g_hatch_continents[continent_code] : "?");
|
|
|
|
uprintf("Latitude/Longitude: %f %f, %s %s\n", pRec->get_latitude(), pRec->get_longitude(), pRec->get_latitude_dms().c_str(), pRec->get_longitude_dms().c_str());
|
|
|
|
const uint32_t locale = pRec->get_locale();
|
|
if (locale < std::size(g_hatch_locales))
|
|
uprintf("Locale: %s\n", g_hatch_locales[locale]);
|
|
|
|
uprintf("UDB Desc: %s\n", db_str.c_str());
|
|
|
|
uprintf("Decoded Desc: %s\n", desc_str.c_str());
|
|
|
|
uint32_t total_flags = 0;
|
|
for (uint32_t f = 0; f < udb_rec::cMaxFlags; f++)
|
|
{
|
|
if (!f) // map
|
|
continue;
|
|
|
|
if (pRec->get_flag(f))
|
|
total_flags++;
|
|
}
|
|
|
|
if (total_flags)
|
|
{
|
|
uprintf("Flags: ");
|
|
|
|
uint32_t num_flags_printed = 0;
|
|
for (uint32_t f = 0; f < udb_rec::cMaxFlags; f++)
|
|
{
|
|
if (!f) // map
|
|
continue;
|
|
|
|
if (pRec->get_flag(f))
|
|
{
|
|
uprintf("%s", g_pHatch_flag_descs[f]);
|
|
|
|
num_flags_printed++;
|
|
if (num_flags_printed < total_flags)
|
|
{
|
|
uprintf(", ");
|
|
|
|
if ((num_flags_printed % 2) == 0)
|
|
uprintf("\n");
|
|
}
|
|
}
|
|
}
|
|
|
|
uprintf("\n");
|
|
}
|
|
|
|
uprintf("Ref: %s\n", pRec->get_full_refs().c_str());
|
|
}
|
|
|
|
output.push_back(string_format("Date: %s\nLocation: \"%s\"\nDescription: \"%s\"\n", date_str.c_str(), loc_str.c_str(), desc_str.c_str()));
|
|
}
|
|
|
|
string_vec toks;
|
|
for (const auto& str : g_unique_tokens)
|
|
toks.push_back(str);
|
|
write_text_file("unique_tokens.txt", toks, false);
|
|
|
|
write_text_file("output.txt", output, true);
|
|
|
|
return true;
|
|
}
|
|
|
|
static bool convert_rec(uint32_t rec_index, const udb_rec* pRec, timeline_event& event)
|
|
{
|
|
std::string db_str, loc_str, desc_str;
|
|
decode_hatch_desc(pRec, db_str, loc_str, desc_str);
|
|
|
|
pRec->get_date(event.m_begin_date);
|
|
|
|
if (event.m_begin_date.m_year <= 0)
|
|
return false;
|
|
|
|
std::string time;
|
|
if (pRec->get_time(time))
|
|
{
|
|
if (time != "00:00?")
|
|
event.m_time_str = time;
|
|
}
|
|
|
|
event.m_date_str = event.m_begin_date.get_string();
|
|
|
|
event.m_locations.push_back(loc_str);
|
|
|
|
event.m_desc = desc_str;
|
|
|
|
// TODO
|
|
event.m_type.push_back("sighting");
|
|
|
|
event.m_source_id = string_format("Hatch_UDB_%u", rec_index);
|
|
event.m_source = "Hatch";
|
|
|
|
for (uint32_t f = 0; f < udb_rec::cMaxFlags; f++)
|
|
if ((f != cFlagMAP) && (pRec->get_flag(f)))
|
|
event.m_attributes.push_back(g_pHatch_flag_descs[f]);
|
|
|
|
event.m_refs.push_back(pRec->get_full_refs());
|
|
|
|
event.m_key_value_data.push_back(std::make_pair("LocationLink", string_format("[Google Maps](https://www.google.com/maps/place/%f,%f)", pRec->get_latitude(), pRec->get_longitude())));
|
|
|
|
event.m_key_value_data.push_back(std::make_pair("LatLong", string_format("%f %f", pRec->get_latitude(), pRec->get_longitude())));
|
|
event.m_key_value_data.push_back(std::make_pair("LatLongDMS", string_format("%s %s", pRec->get_latitude_dms().c_str(), pRec->get_longitude_dms().c_str())));
|
|
|
|
event.m_key_value_data.push_back(std::make_pair("HatchDesc", db_str));
|
|
|
|
event.m_key_value_data.push_back(std::make_pair("Duration", string_format("%u", pRec->get_duration())));
|
|
|
|
std::string country_name, state_or_prov_name;
|
|
pRec->get_geo(country_name, state_or_prov_name);
|
|
|
|
event.m_key_value_data.push_back(std::make_pair("Country", country_name));
|
|
event.m_key_value_data.push_back(std::make_pair("State/Prov", state_or_prov_name));
|
|
|
|
event.m_key_value_data.push_back(std::make_pair("Strangeness", string_format("%u", pRec->get_strangeness())));
|
|
event.m_key_value_data.push_back(std::make_pair("Credibility", string_format("%u", pRec->get_credibility())));
|
|
|
|
const uint32_t locale = pRec->get_locale();
|
|
if (locale < std::size(g_hatch_locales))
|
|
event.m_key_value_data.push_back(std::make_pair("Locale", g_hatch_locales[locale]));
|
|
|
|
if (pRec->get_elevation() != -99)
|
|
event.m_key_value_data.push_back(std::make_pair("Elev", string_format("%i", pRec->get_elevation())));
|
|
|
|
if ((pRec->get_rel_altitude() != 0) && (pRec->get_rel_altitude() != 999))
|
|
event.m_key_value_data.push_back(std::make_pair("RelAlt", string_format("%i", pRec->get_rel_altitude())));
|
|
|
|
return true;
|
|
}
|
|
|
|
bool udb_convert()
|
|
{
|
|
uint8_vec udb;
|
|
if (!read_binary_file("u.rnd", udb))
|
|
return false;
|
|
|
|
const uint32_t TOTAL_RECS = 18123;
|
|
if ((udb.size() / UDB_RECORD_SIZE) < TOTAL_RECS)
|
|
panic("Invalid file size");
|
|
|
|
const udb_rec* pRecs = reinterpret_cast<const udb_rec*>(&udb.front());
|
|
|
|
ufo_timeline timeline;
|
|
|
|
for (uint32_t rec_index = 1; rec_index < TOTAL_RECS; rec_index++)
|
|
{
|
|
const udb_rec* pRec = pRecs + rec_index;
|
|
|
|
timeline_event event;
|
|
if (!convert_rec(rec_index, pRec, event))
|
|
continue;
|
|
|
|
timeline.get_events().push_back(event);
|
|
}
|
|
|
|
if (!timeline.get_events().size())
|
|
panic("Empty timeline)");
|
|
|
|
timeline.set_name("Hatch_UDB_Timeline");
|
|
|
|
return timeline.write_file("hatch_udb.json", true);
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|