// Copyright (C) 2023 Richard Geldreich, Jr.
# include "udb.h"
# include "udb_tables.h"
const uint32_t UDB_RECORD_SIZE = 112 ;
const uint32_t UDB_REC_TEXT_SIZE = 78 ;
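// Each record in Larry Hatch's u.rnd database file is a fixed 112-byte structure, 78 bytes of
// which hold the abbreviated free-text description that decode_hatch() expands into English.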
enum
{
    cFlagMAP, cFlagGND, cFlagCST, cFlagSEA, cFlagAIR, cFlagObsMIL, cFlagObsCIV, cFlagHQO, // loc/obs flags
    cFlagSCI, cFlagTLP, cFlagNWS, cFlagMID, cFlagHOX, cFlagCNT, cFlagODD, cFlagWAV, // misc flags
    cFlagSCR, cFlagCIG, cFlagDLT, cFlagNLT, cFlagPRB, cFlagFBL, cFlagSUB, cFlagNFO, // type of ufo craft flags
    cFlagOID, cFlagRBT, cFlagPSH, cFlagMIB, cFlagMON, cFlagGNT, cFlagFIG, cFlagNOC, // aliens monsters flags
    cFlagOBS, cFlagRAY, cFlagSMP, cFlagMST, cFlagABD, cFlagOPR, cFlagSIG, cFlagCVS, // apparent ufo occupant activities flags
    cFlagNUC, cFlagDRT, cFlagVEG, cFlagANI, cFlagHUM, cFlagVEH, cFlagBLD, cFlagLND, // places visited and things affected flags
    cFlagPHT, cFlagRDR, cFlagRDA, cFlagEME, cFlagTRC, cFlagTCH, cFlagHST, cFlagINJ, // evidence and special effects flags
    cFlagMIL, cFlagBBK, cFlagGSA, cFlagOGA, cFlagSND, cFlagODR, cFlagCOV, cFlagCMF, // misc details flags

    cTotalFlags = 64
};
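// The 64 flag indices above map directly onto the 8 packed flag bytes in udb_rec::m_flags
// (8 flags per category byte); udb_rec::get_flag() extracts them by bit index.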
#pragma pack(push, 1)
struct udb_rec
{
private:
    int16_t m_year;
    uint8_t m_unknown_and_locale; // nibbles
    uint8_t m_unknown_and_month; // nibbles
    uint8_t m_ref_index_high_day; // 3 bits ref index high, low 5 bits day
    uint8_t m_time;
    uint8_t m_ymdt; // 2-bit fields: TDMY accuracy, T lowest, 0=invalid, 1=?, 2=~, 3=accurate
    uint8_t m_duration;
    uint8_t m_unknown1;
    int16_t m_enc_longtitude;
    int16_t m_enc_latitude;
    int16_t m_elevation;
    int16_t m_rel_altitude;
    uint8_t m_unknown2;
    uint8_t m_continent_country; // nibbles
    uint8_t m_state_or_prov[3];
    uint8_t m_unknown3;
#if 0
    uint8_t m_loc_flags;
    uint8_t m_misc_flags;
    uint8_t m_type_of_ufo_craft_flags;
    uint8_t m_aliens_monsters_flags;
    uint8_t m_apparent_ufo_occupant_activities_flags;
    uint8_t m_places_visited_and_things_affected_flags;
    uint8_t m_evidence_and_special_effects_flags;
    uint8_t m_miscellaneous_details_flags;
#else
    uint8_t m_flags[8];
#endif
    uint8_t m_text[UDB_REC_TEXT_SIZE];
    uint8_t m_reference;
    uint8_t m_ref_index;
    uint8_t m_strangeness_credibility; // nibbles
public:
    const uint8_t* get_text() const { return m_text; }

    int get_year() const { return m_year; }
    uint32_t get_month() const { return m_unknown_and_month & 0xF; }
    uint32_t get_day() const { return m_ref_index_high_day & 31; }

    // meters
    int get_elevation() const { return m_elevation; }
    int get_rel_altitude() const { return m_rel_altitude; }

    uint32_t get_strangeness() const { return m_strangeness_credibility >> 4; }
    uint32_t get_credibility() const { return m_strangeness_credibility & 0xF; }

    uint32_t get_reference() const { return m_reference; }
    uint32_t get_reference_index() const { return m_ref_index | ((m_ref_index_high_day >> 5) << 8); }

    uint32_t get_continent_code() const { return m_continent_country >> 4; }
    uint32_t get_country_code() const { return m_continent_country & 0xF; }
    uint32_t get_locale() const { return m_unknown_and_locale & 0xF; }
    std::string get_state_or_prov() const
    {
        const uint32_t c0 = m_state_or_prov[0];
        const uint32_t c1 = m_state_or_prov[1];
        const uint32_t c2 = m_state_or_prov[2];
        return dos_to_utf8(string_format("%c%c%c", (c0 >= ' ') ? c0 : ' ', (c1 >= ' ') ? c1 : ' ', (c2 >= ' ') ? c2 : ' '));
    }
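    // The encoded coordinates appear to be signed 16-bit values in 1/180-degree units:
    // dividing by 200 and scaling by ~1.1111 (10/9) is the same as dividing by 180.
    // Longitude is negated on output, i.e. the database seems to store west as positive.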
    double get_latitude() const { return ((double)m_enc_latitude / 200.0f) * 1.11111111111f; }
    double get_longitude() const { return -((double)m_enc_longtitude / 200.0f) * 1.11111111111f; }

    std::string get_latitude_dms() const { double lat = get_latitude(); return get_deg_to_dms(lat) + ((lat <= 0) ? "S" : "N"); }
    std::string get_longitude_dms() const { double lon = get_longitude(); return get_deg_to_dms(lon) + ((lon <= 0) ? "W" : "E"); }

    // minutes
    uint32_t get_duration() const { return m_duration; }
    enum
    {
        cAccuracyInvalid = 0,
        cAccuracyQuestionable = 1,
        cAccuracyApproximate = 2,
        cAccuracyGood = 3
    };
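    // m_time packs the time of day at 10-minute resolution: hour = m_time / 6, minute = (m_time % 6) * 10.
    // The low 2 bits of m_ymdt give the time's accuracy code (see the enum above).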
    bool get_time(std::string& time) const
    {
        uint32_t time_accuracy = m_ymdt & 3;
        if (time_accuracy == cAccuracyInvalid)
            return false;

        uint32_t hour = m_time / 6;
        uint32_t minute = (m_time % 6) * 10;
        if (hour > 23)
        {
            assert(0);
            return false;
        }

        time = string_format("%02u:%02u", hour, minute);

        if (time_accuracy == cAccuracyQuestionable)
            time += "?";
        else if (time_accuracy == cAccuracyApproximate)
            time = "~" + time;

        return true;
    }
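    // Decodes the event date, honoring the per-field 2-bit accuracy codes packed into m_ymdt
    // (year in bits 6-7, month in 4-5, day in 2-3); fields flagged invalid or out of range are zeroed.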
    bool get_date(event_date& date) const
    {
        uint32_t year_accuracy = (m_ymdt >> 6) & 3;
        uint32_t month_accuracy = (m_ymdt >> 4) & 3;
        uint32_t day_accuracy = (m_ymdt >> 2) & 3;

        int year = year_accuracy ? get_year() : 0;
        uint32_t month = month_accuracy ? get_month() : 0;
        uint32_t day = day_accuracy ? get_day() : 0;

        if ((day < 1) || (day > 31))
        {
            day = 0;
            day_accuracy = cAccuracyInvalid;
        }

        if ((month < 1) || (month > 12))
        {
            month = 0;
            month_accuracy = cAccuracyInvalid;
        }

        if (!year)
            return false;

        uint32_t min_accuracy = year;

        date.m_year = year;
        if (month)
        {
            date.m_month = month;

            if (!day)
            {
                min_accuracy = std::min(year_accuracy, month_accuracy);
            }
            else
            {
                min_accuracy = std::min(std::min(year_accuracy, month_accuracy), day_accuracy);
                date.m_day = day;
            }
        }

        if (min_accuracy == cAccuracyApproximate)
            date.m_approx = true;
        else if (min_accuracy == cAccuracyQuestionable)
            date.m_fuzzy = true;

        return true;
    }
    enum { cMaxFlags = 64 };

    // LOC, MISC, TYPE, ALIENS/MONSTERS, ACTIVITIES, VISITED/THINGS, EVIDENCE/SPECIAL, MISC_DETAILS
    bool get_flag(uint32_t index) const
    {
        assert(index < cMaxFlags);
        return (m_flags[index >> 3] & (1 << (index & 7))) != 0;
    }
#if 0
    uint8_t get_loc_flags() const { return m_loc_flags; }
    uint8_t get_misc_flags() const { return m_misc_flags; }
    uint8_t get_type_of_ufo_craft_flags() const { return m_type_of_ufo_craft_flags; }
    uint8_t get_aliens_monsters_flags() const { return m_aliens_monsters_flags; }
    uint8_t get_apparent_ufo_occupant_activities_flags() const { return m_apparent_ufo_occupant_activities_flags; }
    uint8_t get_places_visited_and_things_affected_flags() const { return m_places_visited_and_things_affected_flags; }
    uint8_t get_evidence_and_special_effects_flags() const { return m_evidence_and_special_effects_flags; }
    uint8_t get_miscellaneous_details_flags() const { return m_miscellaneous_details_flags; }
#endif
    void get_geo(std::string& country_name, std::string& state_or_prov_name) const
    {
        std::string state_or_prov_str(get_state_or_prov());
        string_trim_end(state_or_prov_str);

        if (state_or_prov_str.size() && (state_or_prov_str.back() == '.'))
            state_or_prov_str.pop_back();
        if (state_or_prov_str.size() && (state_or_prov_str.back() == '.'))
            state_or_prov_str.pop_back();

        get_hatch_geo(get_continent_code(), get_country_code(), state_or_prov_str, country_name, state_or_prov_name);

        if (state_or_prov_str == "UNK")
            state_or_prov_name = "Unknown";
    }
    std::string get_full_refs() const
    {
        const char* pRefDesc = g_hatch_refs_tab[get_reference()];
        std::string ref(pRefDesc ? pRefDesc : "");

        if (pRefDesc)
        {
            uint32_t ref_index = get_reference_index();

            if (get_reference() == 93)
            {
                for (const auto& x : g_hatch_refs_93)
                    if (x.m_ref == ref_index)
                    {
                        ref += x.m_pDesc;
                        break;
                    }
            }
            else if (get_reference() == 96)
            {
                for (const auto& x : g_hatch_refs_96)
                    if (x.m_ref == ref_index)
                    {
                        ref += x.m_pDesc;
                        break;
                    }
            }
            else if (get_reference() == 97)
            {
                for (const auto& x : g_hatch_refs_97)
                    if (x.m_ref == ref_index)
                    {
                        ref += x.m_pDesc;
                        break;
                    }
            }
            else if (get_reference() == 98)
            {
                for (const auto& x : g_hatch_refs_98)
                    if (x.m_ref == ref_index)
                    {
                        ref += x.m_pDesc;
                        break;
                    }
            }
            else
            {
                ref += string_format(" (Index %u)", ref_index);
            }
        }

        return ref;
    }
};
#pragma pack(pop)

static std::unordered_map<std::string, std::string> g_dictionary;
struct token
{
    std::string m_token;
    bool m_cap_check;
    bool m_replaced_flag;

    token() :
        m_cap_check(false),
        m_replaced_flag(false)
    {
    }

    token(const std::string& token, bool cap_check, bool replaced_flag) :
        m_token(token),
        m_cap_check(cap_check),
        m_replaced_flag(replaced_flag)
    {
    }
};
std::unordered_set<std::string> g_unique_tokens;
std::vector<string_vec> g_hatch_exception_tokens;
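// Pre-splits each capitalization-exception phrase in g_cap_exceptions into individual word
// and '-' tokens, so fix_capitilization() can match multi-word phrases against the token stream.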
static void init_hatch_cap_exception_tokens()
{
    g_hatch_exception_tokens.resize(std::size(g_cap_exceptions));

    std::string cur_etoken;
    for (uint32_t e = 0; e < std::size(g_cap_exceptions); e++)
    {
        const std::string exception_str(g_cap_exceptions[e]);
        string_vec& etokens = g_hatch_exception_tokens[e];

        for (uint32_t i = 0; i < exception_str.size(); i++)
        {
            uint8_t c = exception_str[i];
            if (c == ' ')
            {
                if (cur_etoken.size())
                {
                    etokens.push_back(cur_etoken);
                    cur_etoken.clear();
                }
            }
            else if (c == '-')
            {
                if (cur_etoken.size())
                {
                    etokens.push_back(cur_etoken);
                    cur_etoken.clear();
                }

                std::string s;
                s.push_back(c);
                etokens.push_back(s);
            }
            else
            {
                cur_etoken.push_back(c);
            }
        }

        if (cur_etoken.size())
        {
            etokens.push_back(cur_etoken);
            cur_etoken.resize(0);
        }
    }
}
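// Repairs the capitalization of a single token: first tries to match upcoming tokens against the
// exception-phrase table, then rewrites all-uppercase words using the dictionary loaded by init_dict()
// (unknown all-uppercase words are simply lowercased and recorded in g_unique_tokens).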
static std::string fix_capitilization(std::vector<token>& toks, uint32_t& tok_index)
{
    if (toks[tok_index].m_replaced_flag)
        return toks[tok_index].m_token;

    const uint32_t toks_remaining = (uint32_t)toks.size() - tok_index;

    // Peek ahead at the tokens to see if we need to correct any capitalization using the exception table.
    for (uint32_t e = 0; e < std::size(g_cap_exceptions); e++)
    {
        const string_vec& etokens = g_hatch_exception_tokens[e];

        if (toks_remaining >= etokens.size())
        {
            uint32_t i;
            for (i = 0; i < etokens.size(); i++)
                if ((string_icompare(etokens[i], toks[tok_index + i].m_token.c_str()) != 0) || toks[tok_index + i].m_replaced_flag)
                    break;

            if (i == etokens.size())
            {
                for (i = 0; i < etokens.size(); i++)
                {
                    toks[tok_index + i].m_token = etokens[i];
                    toks[tok_index + i].m_replaced_flag = true;
                }

                std::string res(toks[tok_index].m_token);
                return res;
            }
        }
    }

    std::string str(toks[tok_index].m_token);

    if (!toks[tok_index].m_cap_check)
        return str;
    string_vec wtokens;
    std::string cur_wtoken;

    for (uint32_t i = 0; i < str.size(); i++)
    {
        uint8_t c = str[i];
        if (isalpha(c) || isdigit(c) || ((c == '\'') && (i != 0) && (i != str.size() - 1)))
        {
            cur_wtoken.push_back(c);
        }
        else
        {
            if (cur_wtoken.size())
            {
                wtokens.push_back(cur_wtoken);
                cur_wtoken.clear();
            }

            std::string s;
            s.push_back(c);
            wtokens.push_back(s);
        }
    }

    if (cur_wtoken.size())
    {
        wtokens.push_back(cur_wtoken);
        cur_wtoken.clear();
    }

    for (uint32_t wtoken_index = 0; wtoken_index < wtokens.size(); wtoken_index++)
    {
        std::string& substr = wtokens[wtoken_index];

        if (substr == "A")
            substr = "a";
        else if (substr.size() >= 2)
        {
            bool is_all_uppercase = true;
            for (uint8_t c : substr)
            {
                if (!isupper(c) && (c != '\''))
                {
                    is_all_uppercase = false;
                    break;
                }
            }

            if (is_all_uppercase)
            {
                auto res = g_dictionary.find(string_lower(substr));
                if (res != g_dictionary.end())
                {
                    substr = res->second;
                }
                else
                {
                    substr = string_lower(substr);
                    g_unique_tokens.insert(substr);
                }
            }
        }
    }

    std::string res;
    for (uint32_t wtoken_index = 0; wtoken_index < wtokens.size(); wtoken_index++)
        res += wtokens[wtoken_index];

    return res;
}
static std::unordered_map<std::string, hatch_abbrev> g_hatch_abbreviations_map;

static void init_hatch_abbreviations_map()
{
    for (uint32_t abbrev_index = 0; abbrev_index < std::size(g_hatch_abbreviations); abbrev_index++)
    {
        auto res = g_hatch_abbreviations_map.insert(std::make_pair(string_lower(g_hatch_abbreviations[abbrev_index].pAbbrev), g_hatch_abbreviations[abbrev_index]));
        if (!res.second)
            panic("Multiple Hatch abbreviation: %s", res.first->first.c_str());
    }
}
// Expand abbreviations
static void expand_abbreviations_internal(bool first_line, std::string orig_token, const string_vec& tokens, uint32_t cur_tokens_index, std::vector<token>& toks)
{
    const uint32_t MAX_ABBREVS = 5;

    uint32_t k;
    for (k = 0; k < MAX_ABBREVS; k++)
    {
        std::string new_token(orig_token);

        auto find_res = g_hatch_abbreviations_map.find(string_lower(orig_token));
        if (find_res != g_hatch_abbreviations_map.end())
        {
            if (!first_line || !find_res->second.m_forbid_firstline)
            {
                new_token = find_res->second.pExpansion;

                if (new_token.size())
                    toks.push_back(token(new_token, !first_line && (new_token == orig_token), false));

                break;
            }
        }

        if ((orig_token.size() >= 4) && (uisupper(orig_token[0])))
        {
            std::string month_suffix(orig_token);
            month_suffix.erase(0, 3);

            if ((month_suffix.size() <= 4) && string_is_digits(month_suffix))
            {
                std::string month_prefix(orig_token);
                month_prefix.erase(3, month_prefix.size() - 3);

                std::string search_prefix(string_upper(month_prefix));

                static const char* g_hmonths[12] =
                {
                    "JAN", "FEB", "MAR", "APR", "MAY", "JUN",
                    "JLY", "AUG", "SEP", "OCT", "NOV", "DEC"
                };

                uint32_t m;
                for (m = 0; m < 12; m++)
                    if (search_prefix == g_hmonths[m])
                        break;

                if (m < 12)
                {
                    toks.push_back(token(g_months[m], !first_line, false));

                    // TODO: This can be improved by checking the # before the token
                    long long val = atoll(month_suffix.c_str());
                    if (val > 31)
                        month_suffix = '\'' + month_suffix;

                    toks.push_back(token(month_suffix, !first_line, false));
                    break;
                }
            }
        }
        size_t p;
        if ((p = orig_token.find_first_of('.')) == std::string::npos)
        {
            // No period(s) - we're done.
            if (new_token.size())
                toks.push_back(token(new_token, !first_line, false));
            break;
        }

        // Specifically detect abbrev. first names like "A." etc. and expand them.
        if (!first_line && (orig_token.size() > 4) && (p == 1) && uisupper(orig_token[0]) && uisupper(orig_token[2]))
        {
            std::string first_name(orig_token);
            first_name.erase(2, first_name.size() - 2);

            toks.push_back(token(first_name, false, false));

            orig_token.erase(0, p + 1);
        }
        else
        {
            // Detect words starting with an abbreviation ending in "."
            std::string prefix(orig_token);
            prefix.erase(p + 1, prefix.size() - (p + 1));

            find_res = g_hatch_abbreviations_map.find(string_lower(prefix));
            if ((find_res != g_hatch_abbreviations_map.end()) && (!first_line || !find_res->second.m_forbid_firstline))
            {
                new_token = find_res->second.pExpansion;
                toks.push_back(token(new_token, false, false));

                orig_token.erase(0, p + 1);
            }
            else
            {
                if (new_token.size())
                    toks.push_back(token(new_token, !first_line, false));
                break;
            }
        }
    } // k

    if (k == MAX_ABBREVS)
    {
        if (orig_token.size())
            toks.push_back(token(orig_token, !first_line, false));
    }
}
static bool is_sentence_ender(uint8_t c)
{
    return (c == '!') || (c == '.') || (c == '?');
}
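// Wrapper around expand_abbreviations_internal() that temporarily strips a leading/trailing
// quote character from the token, expands it, then reattaches the quotes to the first/last
// emitted token.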
static void expand_abbreviations(bool first_line, std::string orig_token, const string_vec& tokens, uint32_t cur_tokens_index, std::vector<token>& toks)
{
    std::string new_token(orig_token);

    // Temporarily remove " and ' prefix/suffix chars from the token, before the abbrev checks.
    std::string prefix_char, suffix_char;

    if (orig_token.size() >= 3)
    {
        if ((orig_token[0] == '\'') || (orig_token[0] == '\"'))
        {
            prefix_char.push_back(orig_token[0]);
            orig_token.erase(0, 1);
            new_token = orig_token;
        }

        if ((orig_token.back() == '\'') || (orig_token.back() == '\"'))
        {
            suffix_char.push_back(orig_token.back());
            orig_token.pop_back();
            new_token = orig_token;
        }
    }

    const size_t first_tok = toks.size();

    expand_abbreviations_internal(first_line, orig_token, tokens, cur_tokens_index, toks);

    const size_t num_toks = toks.size() - first_tok;
    assert(num_toks);
    const size_t last_tok = first_tok + num_toks - 1;

    if (prefix_char.size())
        toks[first_tok].m_token = prefix_char + toks[first_tok].m_token;

    if (suffix_char.size())
        toks[last_tok].m_token = toks[last_tok].m_token + suffix_char;
}
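// Decodes one ':'-separated field of a record's abbreviated Hatch text into readable English.
// The work happens in four phases: (1) tokenize the raw text, (2) apply exceptional token
// fixups/splits, (3) expand abbreviations and special token sequences into words or phrases,
// and (4) compose the final string with capitalization fixes and careful space insertion.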
static std::string decode_hatch(const std::string& str, bool first_line)
{
    std::string res;

    string_vec tokens;
    std::string cur_token;

    bool inside_space = false;
    int prev_c = -1;

    // Phase 1: Tokenize the input string based off examination of (mostly) individual chars, previous chars and upcoming individual chars.
    for (uint32_t i = 0; i < str.size(); i++)
    {
        uint8_t c = str[i];

        const bool is_two_dots = (c == '.') && ((i + 1) < str.size()) && (str[i + 1] == '.');
        const bool is_one_equals = (c == '1') && ((i + 1) < str.size()) && (str[i + 1] == '=');
        const bool prev_is_digit = i && uisdigit(str[i - 1]);
        const bool next_is_plus = ((i + 1) < str.size()) && (str[i + 1] == '+');
        //const bool has_prev = (i != 0);
        //const bool has_next = (i + 1) < str.size();

        if (c == ' ')
        {
            if (cur_token.size())
            {
                tokens.push_back(cur_token);
                cur_token.clear();
            }
            inside_space = true;
        }
        else if (is_one_equals)
        {
            if (cur_token.size())
            {
                tokens.push_back(cur_token);
                cur_token.clear();
            }

            tokens.push_back("1=");
            i++;

            inside_space = false;
        }
        else if (
            (c == ';') || ((c >= 0x18) && (c <= 0x1b)) || (c == '<') || (c == '>') ||
            (c == '=') ||
            (c == '/') ||
            (c == ',') ||
            (c == '?') || (c == '!') ||
            ((!prev_is_digit || next_is_plus) && (c == '+')) ||
            (c == '@') || (c == '-') ||
            is_two_dots
            )
        {
            if (cur_token.size())
            {
                tokens.push_back(cur_token);
                cur_token.clear();
            }

            std::string s;
            s.push_back(c);

            if (is_two_dots)
            {
                s += ".";
                i++;
            }

            tokens.push_back(s);

            inside_space = false;
        }
        else
        {
            cur_token.push_back(c);
            inside_space = false;

            if ((c == 0xf8) || // code page 437 degree sym
                (prev_is_digit && (c == '+') && !next_is_plus))
            {
                tokens.push_back(cur_token);
                cur_token.clear();
            }
        }

        prev_c = c;
    }

    if (cur_token.size())
        tokens.push_back(cur_token);
    // Phase 2: Exceptional fixups that change or split tokens up into multiple tokens.
    string_vec new_tokens;
    for (uint32_t i = 0; i < tokens.size(); i++)
    {
        std::string tok(tokens[i]);

        // Convert "BBK#"
        if (string_begins_with(tok, "BBK#") && (tok.size() > 4))
        {
            new_tokens.push_back("Project Bluebook Case #");
            tok.erase(0, 4);
            new_tokens.push_back(tok);
            continue;
        }

        // Split "k'alt"
        if (string_ends_in(tok, "k'alt"))
        {
            tok.erase(tok.size() - 3, 3);
            new_tokens.push_back(tok);
            new_tokens.push_back("Alt");
            continue;
        }

        // Convert "HI+LO"
        if ((i + 2 < tokens.size()) && (tokens[i] == "HI") && (tokens[i + 1] == "+") && (tokens[i + 2] == "LO"))
        {
            new_tokens.push_back("high and low");
            i += 2;
            continue;
        }

        // Don't split "4rth" to "4 rth" etc.
        if ((string_icompare(tok, "4RTH") == 0) || (string_icompare(tok, "3rds") == 0) || (string_icompare(tok, "16th") == 0))
        {
            new_tokens.push_back(tok);
            continue;
        }

        if (string_ends_in(tok, "Kmph"))
        {
            new_tokens.push_back(tok);
            continue;
        }

        if (tok == "12Ocm")
        {
            new_tokens.push_back("120cm");
            continue;
        }

        if (string_icompare(tok, "3OOM") == 0)
        {
            new_tokens.push_back("300m");
            continue;
        }

        // If the first char isn't a digit then just continue now, because the rest of this code is concerned with splitting numbers away from words.
        if (!isdigit(tok[0]))
        {
            new_tokens.push_back(tok);
            continue;
        }
        if (tok.size() >= 3)
        {
            // Check for 1-7 digits, then ', followed by 1+ letters, and split.
            uint32_t j;
            for (j = 1; j < tok.size(); j++)
                if (tok[j] == '\'')
                    break;

            if ((j < tok.size()) && (j != tok.size() - 1) && (j <= 7))
            {
                uint32_t k;
                for (k = 1; k < j; k++)
                    if (!uisdigit(tok[k]) && (utolower(tok[k]) != 'x') && (utolower(tok[k]) != 'k') && (tok[k] != '.'))
                        break;

                if ((k == j) && (uisalpha(tok[j + 1])))
                {
                    int sp = j + 1;

                    std::string new_tok(tok);
                    new_tok.erase(0, sp);

                    std::string n(tok);
                    n.erase(sp, n.size() - sp);

                    new_tokens.push_back(n);
                    new_tokens.push_back(new_tok);
                    continue;
                }
            }
        }
        // Won't split digits away for tokens < 4 chars
        if ((tok.size() < 4) || (tok == "6F6s"))
        {
            new_tokens.push_back(tok);
            continue;
        }

        // Check for 1-2 digits and alpha and split
        // TODO: support 3-4 digits
        int split_point = -1;
        if (uisalpha(tok[1]))
            split_point = 1;
        else if (uisdigit(tok[1]) && uisalpha(tok[2]) && uisalpha(tok[3]))
            split_point = 2;

        if (split_point > 0)
        {
            std::string new_tok(tok);
            new_tok.erase(0, split_point);

            // Don't split the number digits from some special cases, like hr, cm, mph, etc.
            if ((string_icompare(new_tok, "hr") != 0) &&
                (string_icompare(new_tok, "nd") != 0) &&
                (string_icompare(new_tok, "kw") != 0) &&
                (string_icompare(new_tok, "cm") != 0) &&
                (string_icompare(new_tok, "km") != 0) &&
                (string_icompare(new_tok, "mph") != 0) &&
                (string_icompare(new_tok, "kph") != 0) &&
                (!string_begins_with(new_tok, "K'")))
            {
                std::string n(tok);
                n.erase(split_point, n.size() - split_point);

                new_tokens.push_back(n);

                if (new_tok == "min")
                    new_tok = "minute(s)";

                new_tokens.push_back(new_tok);
            }
            else
            {
                new_tokens.push_back(tok);
            }
        }
        else
        {
            new_tokens.push_back(tok);
        }
    }

    tokens.swap(new_tokens);
    std::vector<token> toks;

    // Phase 3: Compose new string, expanding abbreviations and tokens to one or more words, or combining together special sequences of tokens into specific phrases.
    // Also try to carefully insert spaces into the output, as needed.
    for (uint32_t i = 0; i < tokens.size(); i++)
    {
        const uint32_t num_tokens_left = ((uint32_t)tokens.size() - 1) - i;
        const bool has_prev_token = i > 0, has_next_token = (i + 1) < tokens.size();
        const bool next_token_is_slash = (has_next_token) && (tokens[i + 1][0] == '/');

        bool is_next_dir = false;
        if (has_next_token)
        {
            uint32_t ofs = 1;
            if (tokens[i + 1] == ">")
            {
                ofs = 2;
            }

            if ((i + ofs) < tokens.size())
            {
                std::string next_tok = string_upper(tokens[i + ofs]);
                if ((next_tok.back() == '.') && (next_tok.size() >= 2))
                    next_tok.pop_back();

                if ((next_tok == "N") || (next_tok == "S") || (next_tok == "E") || (next_tok == "W") ||
                    (next_tok == "SW") || (next_tok == "SE") || (next_tok == "NW") || (next_tok == "NE") ||
                    (next_tok == "NNE") || (next_tok == "NNW") || (next_tok == "SSE") || (next_tok == "SSW") ||
                    (next_tok == "ESE"))
                {
                    is_next_dir = true;
                }
            }
        }

        std::string orig_token(tokens[i]);
        std::string new_token(orig_token);

        if (!orig_token.size())
            continue;
        // Handle various exceptions before expanding abbreviations
        // TODO: Refactor to table(s)

        // Special handling for RUSS/RUSS.
        if ((tokens[i] == "RUSS") || (tokens[i] == "RUSS.") || (tokens[i] == "RUS") || (tokens[i] == "RUS."))
        {
            if (first_line)
                new_token = "Russia";
            else
                new_token = "Russian";
        }
        // AA FLITE #519 - exception
        // AA LINER
        else if ((tokens[i] == "AA") && (num_tokens_left >= 1) && ((tokens[i + 1] == "FLITE#519") || (tokens[i + 1] == "LINER")))
        {
            new_token = "AA";
        }
        // bright Lt.
        else if ((tokens[i] == "VBRITE") && (num_tokens_left >= 1) && (tokens[i + 1] == "LT"))
        {
            new_token = "vibrant bright light";
            i++;
        }
        // ENERGY SRC
        else if ((tokens[i] == "ENERGY") && (num_tokens_left >= 1) && (tokens[i + 1] == "SRC"))
        {
            new_token = "energy source";
            i++;
        }
        // mid air - exception
        else if ((tokens[i] == "MID") && (num_tokens_left >= 1) && (tokens[i + 1] == "AIR"))
        {
            new_token = "mid";
        }
        // /FORMN or /formation - exception
        else if ((string_icompare(tokens[i], "/") == 0) && (num_tokens_left >= 1) && ((string_icompare(tokens[i + 1], "FORMN") == 0) || (string_icompare(tokens[i + 1], "formation") == 0)))
        {
            new_token = "in formation";
            i++;
        }
        // /FORMNs - exception
        else if ((string_icompare(tokens[i], "/") == 0) && (num_tokens_left >= 1) && ((string_icompare(tokens[i + 1], "FORMNs") == 0) || (string_icompare(tokens[i + 1], "formations") == 0)))
        {
            new_token = "in formations";
            i++;
        }
        // LOST/CLOUDS - exception
        else if ((string_icompare(tokens[i], "LOST") == 0) && (num_tokens_left >= 2) && (tokens[i + 1] == "/") && (string_icompare(tokens[i + 2], "CLOUDS") == 0))
        {
            new_token = "lost in clouds";
            i += 2;
        }
        // LOST/DISTANCE - exception
        else if ((string_icompare(tokens[i], "LOST") == 0) && (num_tokens_left >= 2) && (tokens[i + 1] == "/") && (string_icompare(tokens[i + 2], "DISTANCE") == 0))
        {
            new_token = "lost in the distance";
            i += 2;
        }
        // W-carbide - exception
        else if ((string_icompare(tokens[i], "W") == 0) && (num_tokens_left >= 2) && (tokens[i + 1] == "-") && (string_icompare(tokens[i + 2], "carbide") == 0))
        {
            new_token = "W";
        }
        // S-SHAPE - exception
        else if ((tokens[i] == "S") && (num_tokens_left >= 2) && (tokens[i + 1] == "-") && (tokens[i + 2] == "SHAPE"))
        {
            new_token = "S";
        }
        // mid-sky - exception
        else if ((tokens[i] == "MID") && (num_tokens_left >= 2) && (tokens[i + 1] == "-") && (tokens[i + 2] == "SKY"))
        {
            new_token = "mid";
        }
        // mid-flite - exception
        else if ((tokens[i] == "MID") && (num_tokens_left >= 2) && (tokens[i + 1] == "-") && (tokens[i + 2] == "FLITE"))
        {
            new_token = "mid";
        }
        // mid-city - exception
        else if ((tokens[i] == "MID") && (num_tokens_left >= 2) && (tokens[i + 1] == "-") && (tokens[i + 2] == "CITY"))
        {
            new_token = "mid";
        }
        // W vee - exception
        else if ((tokens[i] == "W") && (num_tokens_left >= 1) && (tokens[i + 1] == "VEE"))
        {
            new_token = "with vee";
            i++;
        }
        // Lake Mi - exception
        else if ((tokens[i] == "LAKE") && (num_tokens_left >= 1) && (tokens[i + 1] == "Mi"))
        {
            new_token = "Lake Michigan";
            i++;
        }
        // SCI-FI
        else if ((tokens[i] == "SCI") && (num_tokens_left >= 2) && (tokens[i + 1] == "-") && (tokens[i + 2] == "FI"))
        {
            new_token = "Sci-Fi";
            i += 2;
        }
        // V-tall
        else if ((tokens[i] == "V") && (num_tokens_left >= 2) && (tokens[i + 1] == "-") && (tokens[i + 2] == "TALL"))
        {
            new_token = "very tall";
            i += 2;
        }
        // 1 OBS/1 OBS. at beginning
        else if ((i == 1) && (tokens[0] == "1") && (tokens[1] == "OBS" || tokens[1] == "OBS."))
        {
            new_token = "observer";
        }
        // CLR WEATHER exception
        else if ((num_tokens_left >= 1) && (tokens[i] == "CLR") && (tokens[i + 1] == "WEATHER"))
        {
            new_token = "clear";
        }
        // WATER DOMES exception (typo fix)
        else if ((num_tokens_left >= 1) && (string_icompare(tokens[i], "WATER") == 0) && (string_icompare(tokens[i + 1], "DOMES") == 0))
        {
            new_token = "water comes";
            i++;
        }
        // W dome exception
        else if ((num_tokens_left >= 1) && (string_icompare(tokens[i], "W") == 0) && (string_icompare(tokens[i + 1], "DOME") == 0))
        {
            new_token = "with";
        }
        // CLR SKY exception
        else if ((num_tokens_left >= 1) && (string_icompare(tokens[i], "CLR") == 0) && (string_icompare(tokens[i + 1], "SKY") == 0))
        {
            new_token = "clear";
        }
        // CLR DOME exception
        else if ((num_tokens_left >= 1) && (string_icompare(tokens[i], "CLR") == 0) && (string_icompare(tokens[i + 1], "DOME") == 0))
        {
            new_token = "clear";
        }
        // CLR DOMED exception
        else if ((num_tokens_left >= 2) && (string_icompare(tokens[i], "CLR") == 0) && (tokens[i + 1] == "-") && (string_icompare(tokens[i + 2], "DOMED") == 0))
        {
            new_token = "clear";
        }
        // CLR-DOME exception
        else if ((num_tokens_left >= 2) && (string_icompare(tokens[i], "CLR") == 0) && (tokens[i + 1] == "-") && (string_icompare(tokens[i + 2], "DOME") == 0))
        {
            new_token = "clear";
        }
        // CLR RDR exception
        else if ((num_tokens_left >= 1) && (string_icompare(tokens[i], "CLR") == 0) && (string_icompare(tokens[i + 1], "RDR") == 0))
        {
            new_token = "clear";
        }
        // CLR COCKPIT exception
        else if ((num_tokens_left >= 1) && (string_icompare(tokens[i], "CLR") == 0) && (string_icompare(tokens[i + 1], "COCKPIT") == 0))
        {
            new_token = "clear";
        }
        // CLR TORUS exception
        else if ((num_tokens_left >= 1) && (string_icompare(tokens[i], "CLR") == 0) && (string_icompare(tokens[i + 1], "TORUS") == 0))
        {
            new_token = "clear";
        }
        // CLR DAY exception
        else if ((num_tokens_left >= 1) && (string_icompare(tokens[i], "CLR") == 0) && (string_icompare(tokens[i + 1], "DAY") == 0))
        {
            new_token = "clear";
        }
        // CLR PLASTIC exception
        else if ((num_tokens_left >= 1) && (string_icompare(tokens[i], "CLR") == 0) && (string_icompare(tokens[i + 1], "PLASTIC") == 0))
        {
            new_token = "clear";
        }
        // CLR FOTOS exception (a guess, need to verify)
        else if ((num_tokens_left >= 1) && (string_icompare(tokens[i], "CLR") == 0) && (string_icompare(tokens[i + 1], "FOTOS") == 0))
        {
            new_token = "clear";
        }
        // CLR FOTO exception (a guess, need to verify)
        else if ((num_tokens_left >= 1) && (string_icompare(tokens[i], "CLR") == 0) && (string_icompare(tokens[i + 1], "FOTO") == 0))
        {
            new_token = "clear";
        }
        // CLR SHOT exception (a guess, need to verify)
        else if ((num_tokens_left >= 1) && (string_icompare(tokens[i], "CLR") == 0) && (string_icompare(tokens[i + 1], "SHOT") == 0))
        {
            new_token = "clear";
        }
        // CLR BLUE exception
        else if ((num_tokens_left >= 1) && (string_icompare(tokens[i], "CLR") == 0) && (string_icompare(tokens[i + 1], "BLUE") == 0))
        {
            new_token = "clear";
        }
        // CLR BUBBLE exception
        else if ((num_tokens_left >= 1) && (string_icompare(tokens[i], "CLR") == 0) && (string_icompare(tokens[i + 1], "BUBBLE") == 0))
        {
            new_token = "clear";
        }
        // CLR BUBBLES exception
        else if ((num_tokens_left >= 1) && (string_icompare(tokens[i], "CLR") == 0) && (string_icompare(tokens[i + 1], "BUBBLES") == 0))
        {
            new_token = "clear";
        }
        // S+Cu exception
        else if ((num_tokens_left >= 2) && (tokens[i] == "S") && (tokens[i + 1] == "+") && (tokens[i + 2] == "Cu"))
        {
            new_token = "S";
        }
        // IND OBS exception
        else if ((num_tokens_left >= 1) && (tokens[i] == "IND") && (tokens[i + 1] == "OBS"))
        {
            new_token = "independent";
        }
        // L<>R
        else if ((num_tokens_left >= 3) && (tokens[i] == "L") && (tokens[i + 1] == "<") && (tokens[i + 2] == ">") && (tokens[i + 3] == "R"))
        {
            new_token = "left and right";
            i += 3;
        }
        // <+>
        else if ((num_tokens_left >= 2) && (tokens[i] == "<") && (tokens[i + 1] == "+") && (tokens[i + 2] == ">"))
        {
            new_token = "left and right";
            i += 2;
        }
        else if (orig_token == "NFD")
        {
            if ((!has_next_token) || next_token_is_slash)
                new_token = "No further details";
            else
                new_token = "No further details [in]";
        }
        // Up and down arrows
        else if ((orig_token[0] == 0x18) &&
            ((i + 1) < tokens.size()) && (tokens[i + 1][0] == '+') &&
            ((i + 2) < tokens.size()) && (tokens[i + 2][0] == 0x19))
        {
            const uint32_t at_end = ((i + 3) == tokens.size()) || (tokens[i + 3][0] == '/');
            new_token = !at_end ? "going up and down [to]" : "going up and down";
            i += 2;
        }
        // "V BRITE"
        else if ((orig_token == "V") && ((i + 1) < tokens.size()) && (tokens[i + 1] == "BRITE"))
        {
            new_token = "very bright";
            i++;
        }
        // ++
        else if ((orig_token == "+") && ((i + 1) < tokens.size()) && (tokens[i + 1] == "+"))
        {
            new_token = "and more/others";
            i++;
        }
        // >>
        else if ((orig_token == ">") && ((i + 1) < tokens.size()) && (tokens[i + 1] == ">"))
        {
            const uint32_t at_end = ((i + 2) == tokens.size()) || (tokens[i + 2][0] == '/');
            new_token = (!at_end && !is_next_dir) ? "going quickly [to]" : "going quickly";
            i++;
        }
        // ><
        else if ((orig_token == ">") && ((i + 1) < tokens.size()) && (tokens[i + 1] == "<"))
        {
            new_token = "to/from";
            i++;
        }
        // <>
        else if ((orig_token == "<") && ((i + 1) < tokens.size()) && (tokens[i + 1] == ">"))
        {
            // Larry said "between" but that sounds awkward and would require reordering tokens.
            new_token = "to/from/between";
            i++;
        }
        // >
        else if (orig_token == ">")
        {
            new_token = (has_next_token && !next_token_is_slash && !is_next_dir) ? "going [to]" : "going";
        }
        // Three up arrows
        else if ((orig_token[0] == 0x18) && (num_tokens_left >= 2) && (tokens[i + 1][0] == 0x18) && (tokens[i + 2][0] == 0x18))
        {
            const uint32_t at_end = ((i + 3) == tokens.size()) || (tokens[i + 3][0] == '/');
            new_token = !at_end ? "extremely quickly going up [to]" : "extremely quickly going up";
            i += 2;
        }
        // Two up arrows
        else if ((orig_token[0] == 0x18) && ((i + 1) < tokens.size()) && (tokens[i + 1][0] == 0x18))
        {
            const uint32_t at_end = ((i + 2) == tokens.size()) || (tokens[i + 2][0] == '/');
            new_token = !at_end ? "quickly going up [to]" : "quickly going up";
            i++;
        }
        // Up arrow
        else if (orig_token[0] == 0x18)
        {
            new_token = (has_next_token && !next_token_is_slash) ? "going up [to]" : "going up";
        }
        // Two down arrows
        else if ((orig_token[0] == 0x19) && ((i + 1) < tokens.size()) && (tokens[i + 1][0] == 0x19))
        {
            const uint32_t at_end = ((i + 2) == tokens.size()) || (tokens[i + 2][0] == '/');
            new_token = !at_end ? "quickly going down [to]" : "quickly going down";
            i++;
        }
        // Down arrow
        else if (orig_token[0] == 0x19)
        {
            new_token = (has_next_token && !next_token_is_slash) ? "going down [to]" : "going down";
        }
        // Two right arrows
        else if ((orig_token[0] == 0x1A) && ((i + 1) < tokens.size()) && (tokens[i + 1][0] == 0x1A))
        {
            const uint32_t at_end = ((i + 2) == tokens.size()) || (tokens[i + 2][0] == '/');
            new_token = !at_end ? "quickly going right [to]" : "quickly going right";
            i++;
        }
        // Right arrow
        else if (orig_token[0] == 0x1A)
        {
            new_token = (has_next_token && !next_token_is_slash) ? "going right [to]" : "going right";
        }
        // Two left arrows
        else if ((orig_token[0] == 0x1B) && ((i + 1) < tokens.size()) && (tokens[i + 1][0] == 0x1B))
        {
            const uint32_t at_end = ((i + 2) == tokens.size()) || (tokens[i + 2][0] == '/');
            new_token = !at_end ? "quickly going left [to]" : "quickly going left";
            i++;
        }
        // Left arrow
        else if (orig_token[0] == 0x1B)
        {
            new_token = (has_next_token && !next_token_is_slash) ? "going left [to]" : "going left";
        }
        // /
        else if (orig_token[0] == '/')
        {
            new_token = "/";
        }
        // +
        else if (orig_token[0] == '+')
        {
            if (!i)
                new_token = "also";
            else if ((i != (tokens.size() - 1)) && (tokens[i + 1][0] != '/'))
                new_token = "and";
            else
                new_token = "and more";
        }
        // @
        else if (orig_token[0] == '@')
        {
            new_token = "at";
        }
        // dbl-word
        else if ((string_icompare(orig_token, "dbl") == 0) && ((i + 1) < tokens.size()) && (tokens[i + 1] == "-"))
        {
            new_token = "double";
        }
        // GLOW-word
        else if ((string_icompare(orig_token, "GLOW") == 0) && ((i + 1) < tokens.size()) && (tokens[i + 1] == "-"))
        {
            new_token = "glowing";
        }
        // A-test
        else if ((orig_token == "A") && ((i + 1) < tokens.size()) && (tokens[i + 1] == "-") &&
            ((i + 2) < tokens.size()) && (string_icompare(tokens[i + 2], "TEST") == 0))
        {
            new_token = "atomic test";
            i += 2;
        }
        // A-plant
        else if ((orig_token == "A") && ((i + 1) < tokens.size()) && (tokens[i + 1] == "-") &&
            ((i + 2) < tokens.size()) && (string_icompare(tokens[i + 2], "PLANT") == 0))
        {
            new_token = "atomic plant";
            i += 2;
        }
        // V-form
        else if ((orig_token == "V") && ((i + 1) < tokens.size()) && (tokens[i + 1] == "-") &&
            ((i + 2) < tokens.size()) && (string_icompare(tokens[i + 2], "FORM") == 0))
        {
            new_token = "V-formation";
            i += 2;
        }
        // 1/2 (to fix spacing issues)
        else if ((orig_token == "1") && ((i + 1) < tokens.size()) && (tokens[i + 1] == "/") &&
            ((i + 2) < tokens.size()) && (tokens[i + 2] == "2"))
        {
            new_token = "1/2";
            i += 2;
        }
        // "W/O"
        else if ((i) &&
            (string_icompare(orig_token, "W") == 0) &&
            ((i + 1) < tokens.size()) && (tokens[i + 1] == "/") &&
            ((i + 2) < tokens.size()) && (string_icompare(tokens[i + 2], "O") == 0))
        {
            new_token = "without";
            i += 2;
        }
        // "S/L"
        else if ((orig_token == "S") &&
            ((i + 1) < tokens.size()) && (tokens[i + 1] == "/") &&
            ((i + 2) < tokens.size()) && (tokens[i + 2] == "L"))
        {
            // No idea what this means yet.
            new_token = "straight and level";
            i += 2;
        }
        // "FOO-FIGHTERS"
        else if ((orig_token == "FOO") &&
            ((i + 1) < tokens.size()) && (tokens[i + 1] == "-") &&
            ((i + 2) < tokens.size()) && (tokens[i + 2] == "FIGHTERS"))
        {
            // Just don't let the abbreviator kick in. Thanks Larry.
        }
        // "W/word"
        else if ((i) &&
            ((orig_token == "W") || (orig_token == "w")) &&
            ((i + 1) < tokens.size()) && (tokens[i + 1] == "/") &&
            (tokens[i - 1] != ">") &&
            (tokens[i - 1] != "<"))
        {
            new_token = "with";
            i++;
        }
        // "1="
        else if (orig_token == "1=")
        {
            new_token = "one is [a]";
        }
        // Exception for "ORG RPT".
        else if ((orig_token == "ORG") && has_next_token && (tokens[i + 1] == "RPT"))
        {
            new_token = "original";
        }
        // TODO: check for line 1 and don't expand these states
        // Exception for ,MI (the state) - don't expand it.
        else if (first_line && orig_token == "MI" && has_prev_token && tokens[i - 1] == ",")
        {
        }
        // Exception for ,MT (the state) - don't change to "Mt."
        else if (first_line && orig_token == "MT" && has_prev_token && tokens[i - 1] == ",")
        {
        }
        // Exception for ,NE (the state) - don't change to "northeast"
        else if (first_line && orig_token == "NE" && has_prev_token && tokens[i - 1] == ",")
        {
        }
        // Exception for ,MS (the state) - don't expand it.
        else if (first_line && orig_token == "MS" && has_prev_token && tokens[i - 1] == ",")
        {
        }
        // Exception for ,AL (the state) - don't expand it.
        else if (first_line && orig_token == "AL" && has_prev_token && tokens[i - 1] == ",")
        {
        }
        else
        {
            expand_abbreviations(first_line, orig_token, tokens, i, toks);
            continue;
        }

        if (new_token.size())
            toks.push_back(token(new_token, !first_line && (new_token == tokens[i]), false));
    }
    // Phase 4: Compose the final string, converting tokens to lower/uppercase and inserting spaces as needed.
    std::string new_str;
    bool in_quote = false;

    for (uint32_t i = 0; i < toks.size(); i++)
    {
        std::string new_token(toks[i].m_token);
        if (!new_token.size())
            continue;

        if (!first_line)
            new_token = fix_capitilization(toks, i);

        // Add a space if the previous string is not empty - excluding special cases where a space isn't necessary.
        if (new_str.size() &&
            (new_token != "..") &&
            (new_token != ",") &&
            (new_token != "!") && (new_token != "?") &&
            (new_token != "+") &&
            (!((new_token == ")") && (new_str.back() == '?'))) &&
            (new_token != ";") && (new_str.back() != ';') &&
            (new_token != "-") && (new_str.back() != '-') &&
            (new_str.back() != '#') &&
            (new_str.back() != '+') &&
            (!(in_quote && (new_token == "\"") && new_str.size() && is_sentence_ender(new_str.back())))
            )
        {
            new_str.push_back(' ');
            //new_str.push_back('*');
        }

        // Append the token string to the output string
        new_str += new_token;

        for (uint8_t c : new_token)
            if (c == '\"')
                in_quote = !in_quote;
    }

    return new_str;
}
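// Decodes one raw UDB record's text. The raw 78-char field is split on ':' separators; the first
// field becomes the (uppercased) location string and the remaining fields are decoded into the
// readable description. db_str receives the original (DOS code page) text converted to UTF-8.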
static void decode_hatch_desc(const udb_rec* pRec, std::string& db_str, std::string& loc_str, std::string& desc_str)
{
    for (uint32_t i = 0; i < UDB_REC_TEXT_SIZE; i++)
    {
        if (pRec->get_text()[i] == 0)
            break;
        db_str.push_back(pRec->get_text()[i]);
    }

    std::string orig_desc(db_str);

    string_vec desc;
    for (;;)
    {
        size_t pos = orig_desc.find_first_of(':');
        if (pos == std::string::npos)
        {
            desc.push_back(string_trim(orig_desc));
            break;
        }
        else
        {
            std::string s(orig_desc);
            s.erase(pos, s.size() - pos);
            desc.push_back(string_trim(s));

            orig_desc.erase(0, pos + 1);
        }
    }
    for (uint32_t i = 0; i < desc.size(); i++)
    {
        std::string str(decode_hatch(desc[i], !i));
        if (!str.size())
            continue;

        if (desc_str.size())
        {
            if (desc_str.back() != '.' && desc_str.back() != '!' && desc_str.back() != '?')
                desc_str += ".";
            desc_str += " ";
        }

        if (!i)
        {
            loc_str = string_upper(str);
        }
        else
        {
            if (uislower(str[0]))
                str[0] = utoupper(str[0]);
            else if ((str[0] == '\"') && (str.size() >= 2) && (uislower(str[1])))
                str[1] = utoupper(str[1]);
            else if ((str[0] == '\'') && (str.size() >= 2) && (uislower(str[1])))
                str[1] = utoupper(str[1]);
            else if ((str[0] == '(') && (str.size() >= 2) && (uislower(str[1])))
                str[1] = utoupper(str[1]);

            desc_str += str;
        }
    }

    if (desc_str.size() && desc_str.back() != '.' && desc_str.back() != '!' && desc_str.back() != '?')
    {
        if ((desc_str.back() == ')') && (!string_ends_in(desc_str, "(s)")))
        {
            desc_str.pop_back();
            if (desc_str.back() == ' ')
                desc_str.pop_back();

            if (desc_str.size() && desc_str.back() != '.' && desc_str.back() != '!' && desc_str.back() != '?')
                desc_str += ".";

            desc_str += ")";
        }
        else
        {
            desc_str += ".";
        }
    }

    db_str = dos_to_utf8(db_str);
    loc_str = dos_to_utf8(loc_str);
    desc_str = dos_to_utf8(desc_str);
}
template<typename T>
static void check_for_hatch_tab_dups(const T& tab)
{
    std::unordered_set<int> ids;
    for (const auto& x : tab)
        if (!ids.insert(x.m_ref).second)
            panic("Duplicate hatch ref table id");
}
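// Loads uppercase_dict.txt into g_dictionary (keyed by the lowercased word), which
// fix_capitilization() uses to restore the proper capitalization of all-uppercase words.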
static void init_dict()
{
    string_vec dict;

    uprintf("Reading dictionary\n");

    bool utf8_flag = false;
    if (!read_text_file("uppercase_dict.txt", dict, true, &utf8_flag))
        panic("Failed reading uppercase_dict.txt");

    for (auto str : dict)
    {
        string_trim(str);
        if (str.size() && uisupper(str[0]))
        {
            g_dictionary.insert(std::make_pair(string_lower(str), str));
        }
    }

    uprintf("Done reading dictionary, %u uppercase words\n", (uint32_t)g_dictionary.size());
}
void udb_init()
{
    assert(sizeof(udb_rec) == UDB_RECORD_SIZE);

    check_for_hatch_tab_dups(g_hatch_refs);
    check_for_hatch_tab_dups(g_hatch_refs_93);
    check_for_hatch_tab_dups(g_hatch_refs_96);
    check_for_hatch_tab_dups(g_hatch_refs_97);
    check_for_hatch_tab_dups(g_hatch_refs_98);

    for (uint32_t i = 0; i < std::size(g_hatch_refs); i++)
        g_hatch_refs_tab[g_hatch_refs[i].m_ref] = g_hatch_refs[i].m_pDesc;

    init_hatch_abbreviations_map();
    init_hatch_cap_exception_tokens();
    init_dict();
}
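// Dumps every record in u.rnd to stdout in human-readable form, writes the decoded
// location/description pairs to output.txt, and writes any unresolved uppercase tokens
// encountered during decoding to unique_tokens.txt.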
bool udb_dump()
{
    uint8_vec udb;
    if (!read_binary_file("u.rnd", udb))
        return false;

    const uint32_t TOTAL_RECS = 18123;
    if ((udb.size() / UDB_RECORD_SIZE) < TOTAL_RECS)
        panic("Invalid file size");

    string_vec output;

    const udb_rec* pRecs = reinterpret_cast<const udb_rec*>(&udb.front());
    for (uint32_t rec_index = 1; rec_index < TOTAL_RECS; rec_index++)
    //for (uint32_t rec_index = 18038; rec_index <= 18038; rec_index++)
    {
        const udb_rec* pRec = pRecs + rec_index;

        std::string db_str, loc_str, desc_str;
        decode_hatch_desc(pRec, db_str, loc_str, desc_str);

        event_date ed;
        pRec->get_date(ed);
        std::string date_str(ed.get_string());

        {
            uprintf("\n----------%u: Date: %s, Strangeness: %u, Credibility: %u\n", rec_index, date_str.c_str(), pRec->get_strangeness(), pRec->get_credibility());

            std::string time;
            if (pRec->get_time(time))
                uprintf("Time: %s\n", time.c_str());

            if (pRec->get_duration())
                uprintf("Duration: %u mins\n", pRec->get_duration());

            if (pRec->get_elevation() != -99)
                uprintf("Elevation: %im\n", pRec->get_elevation());

            if ((pRec->get_rel_altitude() != 0) && (pRec->get_rel_altitude() != 999))
                uprintf("Altitude: %im\n", pRec->get_rel_altitude());

            uprintf("Location: %s\n", loc_str.c_str());

            std::string country_name, state_or_prov_name;
            pRec->get_geo(country_name, state_or_prov_name);

            const uint32_t continent_code = pRec->get_continent_code();
            uprintf("Country: %s, State/Province: %s (%s), Continent: %s\n", country_name.c_str(), state_or_prov_name.c_str(), pRec->get_state_or_prov().c_str(),
                (continent_code < std::size(g_hatch_continents)) ? g_hatch_continents[continent_code] : "?");

            uprintf("Latitude/Longitude: %f %f, %s %s\n", pRec->get_latitude(), pRec->get_longitude(), pRec->get_latitude_dms().c_str(), pRec->get_longitude_dms().c_str());

            const uint32_t locale = pRec->get_locale();
            if (locale < std::size(g_hatch_locales))
                uprintf("Locale: %s\n", g_hatch_locales[locale]);

            uprintf("UDB Desc: %s\n", db_str.c_str());
            uprintf("Decoded Desc: %s\n", desc_str.c_str());
            uint32_t total_flags = 0;
            for (uint32_t f = 0; f < udb_rec::cMaxFlags; f++)
            {
                if (!f) // map
                    continue;
                if (pRec->get_flag(f))
                    total_flags++;
            }

            if (total_flags)
            {
                uprintf("Flags: ");

                uint32_t num_flags_printed = 0;
                for (uint32_t f = 0; f < udb_rec::cMaxFlags; f++)
                {
                    if (!f) // map
                        continue;

                    if (pRec->get_flag(f))
                    {
                        uprintf("%s", g_pHatch_flag_descs[f]);

                        num_flags_printed++;

                        if (num_flags_printed < total_flags)
                        {
                            uprintf(", ");
                            if ((num_flags_printed % 2) == 0)
                                uprintf("\n");
                        }
                    }
                }

                uprintf("\n");
            }

            uprintf("Ref: %s\n", pRec->get_full_refs().c_str());
        }

        output.push_back(string_format("Date: %s\nLocation: \"%s\"\nDescription: \"%s\"\n", date_str.c_str(), loc_str.c_str(), desc_str.c_str()));
    }

    string_vec toks;
    for (const auto& str : g_unique_tokens)
        toks.push_back(str);
    write_text_file("unique_tokens.txt", toks, false);

    write_text_file("output.txt", output, true);

    return true;
}
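// Converts a single UDB record into a timeline_event: date/time, decoded location and description,
// the set flags as attributes, the reference string, and key/value metadata (coordinates, country,
// strangeness/credibility, locale, elevation, etc.).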
static bool convert_rec(uint32_t rec_index, const udb_rec* pRec, timeline_event& event)
{
    std::string db_str, loc_str, desc_str;
    decode_hatch_desc(pRec, db_str, loc_str, desc_str);

    pRec->get_date(event.m_begin_date);

    if (event.m_begin_date.m_year <= 0)
        return false;

    std::string time;
    if (pRec->get_time(time))
    {
        if (time != "00:00?")
            event.m_time_str = time;
    }

    event.m_date_str = event.m_begin_date.get_string();
    event.m_locations.push_back(loc_str);
    event.m_desc = desc_str;

    // TODO
    event.m_type.push_back("sighting");

    event.m_source_id = string_format("Hatch_UDB_%u", rec_index);
    event.m_source = "Hatch";

    for (uint32_t f = 0; f < udb_rec::cMaxFlags; f++)
        if ((f != cFlagMAP) && (pRec->get_flag(f)))
            event.m_attributes.push_back(g_pHatch_flag_descs[f]);

    event.m_refs.push_back(pRec->get_full_refs());

    event.m_key_value_data.push_back(std::make_pair("LocationLink", string_format("[Google Maps](https://www.google.com/maps/place/%f,%f)", pRec->get_latitude(), pRec->get_longitude())));
    event.m_key_value_data.push_back(std::make_pair("LatLong", string_format("%f %f", pRec->get_latitude(), pRec->get_longitude())));
    event.m_key_value_data.push_back(std::make_pair("LatLongDMS", string_format("%s %s", pRec->get_latitude_dms().c_str(), pRec->get_longitude_dms().c_str())));

    event.m_key_value_data.push_back(std::make_pair("HatchDesc", db_str));

    event.m_key_value_data.push_back(std::make_pair("Duration", string_format("%u", pRec->get_duration())));

    std::string country_name, state_or_prov_name;
    pRec->get_geo(country_name, state_or_prov_name);

    event.m_key_value_data.push_back(std::make_pair("Country", country_name));
    event.m_key_value_data.push_back(std::make_pair("State/Prov", state_or_prov_name));

    event.m_key_value_data.push_back(std::make_pair("Strangeness", string_format("%u", pRec->get_strangeness())));
    event.m_key_value_data.push_back(std::make_pair("Credibility", string_format("%u", pRec->get_credibility())));

    const uint32_t locale = pRec->get_locale();
    if (locale < std::size(g_hatch_locales))
        event.m_key_value_data.push_back(std::make_pair("Locale", g_hatch_locales[locale]));

    if (pRec->get_elevation() != -99)
        event.m_key_value_data.push_back(std::make_pair("Elev", string_format("%i", pRec->get_elevation())));

    if ((pRec->get_rel_altitude() != 0) && (pRec->get_rel_altitude() != 999))
        event.m_key_value_data.push_back(std::make_pair("RelAlt", string_format("%i", pRec->get_rel_altitude())));

    return true;
}
bool udb_convert()
{
    uint8_vec udb;
    if (!read_binary_file("u.rnd", udb))
        return false;

    const uint32_t TOTAL_RECS = 18123;
    if ((udb.size() / UDB_RECORD_SIZE) < TOTAL_RECS)
        panic("Invalid file size");

    const udb_rec* pRecs = reinterpret_cast<const udb_rec*>(&udb.front());

    ufo_timeline timeline;

    for (uint32_t rec_index = 1; rec_index < TOTAL_RECS; rec_index++)
    {
        const udb_rec* pRec = pRecs + rec_index;

        timeline_event event;
        if (!convert_rec(rec_index, pRec, event))
            continue;

        timeline.get_events().push_back(event);
    }

    if (!timeline.get_events().size())
        panic("Empty timeline");

    timeline.set_name("Hatch_UDB_Timeline");

    return timeline.write_file("hatch_udb.json", true);
}