// converters.cpp
// Copyright (C) 2023 Richard Geldreich, Jr.
#include "ufojson_core.h"
#include "markdown_proc.h"
#include <unordered_map>

#define USE_OPENAI (0)

// Escapes double quotes, backslashes, and newlines so a string can be embedded in a JSON string literal.
static std::string escape_string_for_json(const std::string& str)
{
    std::string new_str;
    for (uint32_t i = 0; i < str.size(); i++)
    {
        char c = str[i];
        if (c == '"')
            new_str.push_back('\\');
        else if (c == '\\')
            new_str.push_back('\\');
        else if (c == '\n')
        {
            new_str.push_back('\\');
            new_str.push_back('n');
            continue;
        }

        new_str.push_back(c);
    }

    return new_str;
}
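
// Illustrative only: given the input `She said "hi"` followed by a newline, escape_string_for_json()
// returns `She said \"hi\"\n` (quotes and backslashes gain a leading backslash, and a newline becomes
// the two characters \n), which is what the fprintf()-based JSON writers below depend on.

// Parses a fixed-column text dump of the Vallee "Magonia" catalog: the first TOTAL_COLS (15) columns of
// a record's first line hold the date, the same columns of the following line may hold the time, and the
// remaining text is the description, with the location running up to the first '.' and an optional
// trailing "(ref)". A hypothetical call (the real filenames are supplied by the caller elsewhere in the
// project) might look like:
//
//   convert_magnonia("magonia.txt", "magonia.json", nullptr, nullptr);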
bool convert_magnonia(const char* pSrc_filename, const char* pDst_filename, const char* pSource_override, const char* pRef_override)
|
||
{
|
||
string_vec lines;
|
||
|
||
if (!read_text_file(pSrc_filename, lines, true, nullptr))
|
||
panic("Can't open file %s", pSrc_filename);
|
||
|
||
FILE* pOut_file = ufopen(pDst_filename, "w");
|
||
if (!pOut_file)
|
||
panic("Can't open file %s", pDst_filename);
|
||
|
||
fprintf(pOut_file, "{\n");
|
||
fprintf(pOut_file, "\"%s Timeline\" : [\n", pSource_override ? pSource_override : "Magnonia");
|
||
|
||
//const uint32_t TOTAL_RECS = 923;
|
||
const uint32_t TOTAL_COLS = 15;
|
||
|
||
uint32_t cur_line = 0;
|
||
uint32_t rec_index = 1;
|
||
while (cur_line < lines.size())
|
||
{
|
||
if (!lines[cur_line].size())
|
||
panic("Line %u is empty", cur_line);
|
||
|
||
int index = atoi(lines[cur_line++].c_str());
|
||
if (index != (int)rec_index)
|
||
panic("Unexpected index");
|
||
|
||
if (cur_line == lines.size())
|
||
panic("Out of lines");
|
||
|
||
std::string first_line(lines[cur_line++]);
|
||
|
||
std::string date_str(first_line);
|
||
if (date_str.size() > TOTAL_COLS)
|
||
date_str.resize(TOTAL_COLS);
|
||
string_trim(date_str);
|
||
|
||
if (first_line.size() < (TOTAL_COLS + 1))
|
||
{
|
||
if (cur_line == lines.size())
|
||
panic("Out of lines");
|
||
|
||
first_line = lines[cur_line++];
|
||
if (first_line.size() < (TOTAL_COLS + 1))
|
||
panic("Line too small");
|
||
}
|
||
|
||
string_vec desc_lines;
|
||
first_line.erase(0, TOTAL_COLS);
|
||
string_trim(first_line);
|
||
desc_lines.push_back(first_line);
|
||
|
||
std::string time_str;
|
||
|
||
for (; ; )
|
||
{
|
||
if (cur_line == lines.size())
|
||
break;
|
||
|
||
if (lines[cur_line].size() < TOTAL_COLS)
|
||
break;
|
||
|
||
std::string buf(lines[cur_line]);
|
||
|
||
if (desc_lines.size() == 1)
|
||
{
|
||
if (buf.size() >= TOTAL_COLS)
|
||
{
|
||
time_str = buf;
|
||
time_str.resize(TOTAL_COLS);
|
||
string_trim(time_str);
|
||
|
||
buf.erase(0, TOTAL_COLS);
|
||
}
|
||
}
|
||
|
||
string_trim(buf);
|
||
desc_lines.push_back(buf);
|
||
|
||
cur_line++;
|
||
}
|
||
|
||
std::string desc;
|
||
for (uint32_t j = 0; j < desc_lines.size(); j++)
|
||
{
|
||
if (desc.size() && desc.back() == '-')
|
||
{
|
||
// Don't trim '-' char if the previous char is a digit. This is probably imperfect.
|
||
if (!((desc.size() >= 2) && (isdigit((uint8_t)desc[desc.size() - 2]))))
|
||
desc.resize(desc.size() - 1);
|
||
}
|
||
else if (desc.size())
|
||
{
|
||
desc += " ";
|
||
}
|
||
|
||
desc += desc_lines[j];
|
||
}
|
||
|
||
std::string location;
|
||
|
||
size_t n = desc.find_first_of('.');
|
||
if (n == std::string::npos)
|
||
panic("Can't find . char");
|
||
|
||
location = desc;
|
||
location.resize(n);
|
||
string_trim(location);
|
||
|
||
size_t f = location.find_first_of('(');
|
||
size_t e = location.find_last_of(')');
|
||
if ((f != std::string::npos) && (e == location.size() - 1))
|
||
{
|
||
std::string state(location);
|
||
state.erase(0, f + 1);
|
||
if (state.size())
|
||
state.resize(state.size() - 1);
|
||
string_trim(state);
|
||
|
||
location.erase(f, location.size() - f);
|
||
string_trim(location);
|
||
|
||
location += ", ";
|
||
location += state;
|
||
}
|
||
|
||
desc.erase(0, n + 1);
|
||
string_trim(desc);
|
||
|
||
std::string ref;
|
||
f = desc.find_last_of('(');
|
||
e = desc.find_last_of(')');
|
||
if ((f != std::string::npos) && (e != std::string::npos))
|
||
{
|
||
if ((f < e) && (e == desc.size() - 1))
|
||
{
|
||
ref = desc.c_str() + f + 1;
|
||
if (ref.size())
|
||
ref.pop_back();
|
||
string_trim(ref);
|
||
|
||
desc.resize(f);
|
||
string_trim(desc);
|
||
}
|
||
}
|
||
|
||
int year = -1, month = -1, day = -1;
|
||
date_prefix_t date_prefix = cNoPrefix;
|
||
|
||
std::string temp_date_str(date_str);
|
||
|
||
if (string_begins_with(temp_date_str, "End "))
|
||
{
|
||
date_prefix = cEndOf;
|
||
temp_date_str.erase(0, strlen("End "));
|
||
string_trim(temp_date_str);
|
||
}
|
||
else if (string_begins_with(temp_date_str, "Early "))
|
||
{
|
||
date_prefix = cEarly;
|
||
temp_date_str.erase(0, strlen("Early "));
|
||
string_trim(temp_date_str);
|
||
}
|
||
|
||
f = date_str.find_first_of(',');
|
||
uint32_t m;
|
||
for (m = 0; m < 12; m++)
|
||
if (string_begins_with(temp_date_str, g_months[m]))
|
||
break;
|
||
|
||
if (m != 12)
|
||
{
|
||
month = m + 1;
|
||
|
||
temp_date_str.erase(0, strlen(g_months[m]));
|
||
string_trim(temp_date_str);
|
||
|
||
if (!temp_date_str.size())
|
||
panic("Failed parsing date");
|
||
|
||
f = temp_date_str.find_first_of(',');
|
||
if (f != std::string::npos)
|
||
{
|
||
if (f == 0)
|
||
{
|
||
temp_date_str.erase(0, 1);
|
||
string_trim(temp_date_str);
|
||
|
||
if (!isdigit(temp_date_str[0]))
|
||
panic("Failed parsing date");
|
||
|
||
year = atoi(temp_date_str.c_str());
|
||
if ((year <= 100) || (year > 2050))
|
||
panic("Failed parsing date");
|
||
}
|
||
else
|
||
{
|
||
day = atoi(temp_date_str.c_str());
|
||
|
||
temp_date_str.erase(0, f + 1);
|
||
string_trim(temp_date_str);
|
||
|
||
if (!isdigit(temp_date_str[0]))
|
||
panic("Failed parsing date");
|
||
|
||
year = atoi(temp_date_str.c_str());
|
||
if ((year <= 100) || (year > 2050))
|
||
panic("Failed parsing date");
|
||
}
|
||
}
|
||
else
|
||
{
|
||
if (!isdigit(temp_date_str[0]))
|
||
panic("Failed parsing date");
|
||
|
||
year = atoi(temp_date_str.c_str());
|
||
if ((year <= 100) || (year > 2050))
|
||
panic("Failed parsing date");
|
||
}
|
||
|
||
}
|
||
else
|
||
{
|
||
// The Magnonia data doesn't use the full range of prefixes we support.
|
||
if (string_begins_with(temp_date_str, "Summer,"))
|
||
{
|
||
date_prefix = cSummer;
|
||
temp_date_str.erase(0, strlen("Summer,"));
|
||
string_trim(temp_date_str);
|
||
}
|
||
else if (string_begins_with(temp_date_str, "Spring,"))
|
||
{
|
||
date_prefix = cSpring;
|
||
temp_date_str.erase(0, strlen("Spring,"));
|
||
string_trim(temp_date_str);
|
||
}
|
||
else if (string_begins_with(temp_date_str, "Fall,"))
|
||
{
|
||
date_prefix = cFall;
|
||
temp_date_str.erase(0, strlen("Fall,"));
|
||
string_trim(temp_date_str);
|
||
}
|
||
else if (string_begins_with(temp_date_str, "End "))
|
||
{
|
||
date_prefix = cEndOf;
|
||
temp_date_str.erase(0, strlen("End "));
|
||
string_trim(temp_date_str);
|
||
}
|
||
else if (string_begins_with(temp_date_str, "Early "))
|
||
{
|
||
date_prefix = cEarly;
|
||
temp_date_str.erase(0, strlen("Early "));
|
||
string_trim(temp_date_str);
|
||
}
|
||
|
||
if (!isdigit(temp_date_str[0]))
|
||
panic("Failed parsing date");
|
||
|
||
year = atoi(temp_date_str.c_str());
|
||
if ((year <= 100) || (year > 2050))
|
||
panic("Failed parsing date");
|
||
}
|
||
|
||
if (pRef_override)
|
||
{
|
||
if (ref.size())
|
||
ref += " ";
|
||
ref += pRef_override;
|
||
}
|
||
else
|
||
ref += " ([Vallee](https://web.archive.org/web/20120415100852/http://www.ufoinfo.com/magonia/magonia.shtml))";
|
||
|
||
//printf("## %u. Date: \"%s\" (%s %i/%i/%i), Time: \"%s\" Location: \"%s\", Ref: \"%s\"\n",
|
||
// i, date_str.c_str(), (date_prefix >= 0) ? g_date_prefix_strings[date_prefix] : "", month, day, year, time_str.c_str(), location.c_str(), ref.c_str());
|
||
//printf("%s\n", desc.c_str());
|
||
|
||
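// Each record emitted below has roughly this shape (illustrative values, not taken from the data;
// "location", "time", and "ref" are only written when non-empty):
//
//   {
//     "date" : "10/2/1954",
//     "desc": "Two children reported ...",
//     "location" : "Poncey, Cote-d'Or",
//     "time" : "18:00",
//     "ref": "... ([Vallee](https://web.archive.org/...))",
//     "source_id" : "Magnonia_17",
//     "source" : "ValléeMagnonia",
//     "type" : "ufo sighting"
//   }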
if (rec_index > 1)
|
||
fprintf(pOut_file, ",\n");
|
||
|
||
fprintf(pOut_file, "{\n");
|
||
fprintf(pOut_file, " \"date\" : \"");
|
||
|
||
if (date_prefix >= 0)
|
||
fprintf(pOut_file, "%s ", g_date_prefix_strings[date_prefix]);
|
||
|
||
if (month == -1)
|
||
fprintf(pOut_file, "%i", year);
|
||
else if (day == -1)
|
||
fprintf(pOut_file, "%i/%i", month, year);
|
||
else
|
||
fprintf(pOut_file, "%i/%i/%i", month, day, year);
|
||
fprintf(pOut_file, "\",\n");
|
||
|
||
fprintf(pOut_file, " \"desc\": \"%s\",\n", escape_string_for_json(desc).c_str());
|
||
|
||
if (location.size())
|
||
fprintf(pOut_file, " \"location\" : \"%s\",\n", escape_string_for_json(location).c_str());
|
||
|
||
if (time_str.size())
|
||
fprintf(pOut_file, " \"time\" : \"%s\",\n", time_str.c_str());
|
||
|
||
if (ref.size())
|
||
fprintf(pOut_file, " \"ref\": \"%s\",\n", escape_string_for_json(ref).c_str());
|
||
|
||
fprintf(pOut_file, " \"source_id\" : \"Magnonia_%u\",\n", rec_index);
|
||
|
||
fprintf(pOut_file, u8" \"source\" : \"%s\",\n", pSource_override ? pSource_override : u8"ValléeMagnonia");
|
||
|
||
fprintf(pOut_file, " \"type\" : \"ufo sighting\"\n");
|
||
|
||
fprintf(pOut_file, "}");
|
||
|
||
rec_index++;
|
||
}
|
||
|
||
fprintf(pOut_file, "\n] }\n");
|
||
fclose(pOut_file);
|
||
|
||
return true;
|
||
}
|
||
|
||
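// Converts bb_unknowns.txt (Don Berliner's list of Project Blue Book "unknowns") into bb_unknowns.json.
// Records are separated by blank lines; within a record the date runs to the first ';', the location to
// the next ';' (or, failing that, to the first '.' that is not part of a "Ft."/"Mt." abbreviation), and
// the time to the following '.', with everything after that treated as the description.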
bool convert_bluebook_unknowns()
|
||
{
|
||
string_vec lines;
|
||
|
||
if (!read_text_file("bb_unknowns.txt", lines, true, nullptr))
|
||
panic("Can't read file bb_unknowns.txt");
|
||
|
||
uint32_t cur_line = 0;
|
||
|
||
uint32_t total_recs = 0;
|
||
|
||
FILE* pOut_file = ufopen("bb_unknowns.json", "w");
|
||
if (!pOut_file)
|
||
panic("Can't open output file bb_unknowns.json");
|
||
|
||
fputc(UTF8_BOM0, pOut_file);
|
||
fputc(UTF8_BOM1, pOut_file);
|
||
fputc(UTF8_BOM2, pOut_file);
|
||
|
||
fprintf(pOut_file, "{\n");
|
||
fprintf(pOut_file, "\"BlueBookUnknowns Timeline\" : [\n");
|
||
|
||
while (cur_line < lines.size())
|
||
{
|
||
std::string rec;
|
||
|
||
while (cur_line < lines.size())
|
||
{
|
||
std::string l(lines[cur_line]);
|
||
cur_line++;
|
||
|
||
string_trim(l);
|
||
|
||
if (!l.size())
|
||
break;
|
||
|
||
if (rec.size() && rec.back() == '-')
|
||
rec += l;
|
||
else
|
||
{
|
||
rec += ' ';
|
||
rec += l;
|
||
}
|
||
}
|
||
|
||
if (rec.size())
|
||
{
|
||
//printf("%u. %s\n", total_recs + 1, rec.c_str());
|
||
|
||
size_t semi_ofs = rec.find_first_of(';');
|
||
if (semi_ofs == std::string::npos)
|
||
panic("Unable to find initial ; char");
|
||
|
||
std::string date_str(rec);
|
||
date_str.resize(semi_ofs);
|
||
string_trim(date_str);
|
||
|
||
rec.erase(0, semi_ofs + 1);
|
||
string_trim(rec);
|
||
|
||
size_t period_ofs = rec.find_first_of(';');
|
||
if (period_ofs == std::string::npos)
|
||
{
|
||
period_ofs = rec.find_first_of('.');
|
||
if (period_ofs == std::string::npos)
|
||
panic("Unable to find . char");
|
||
|
||
if (((period_ofs >= 2) && (rec[period_ofs - 1] == 't') && (rec[period_ofs - 2] == 'F')) ||
|
||
((period_ofs >= 2) && (rec[period_ofs - 1] == 't') && (rec[period_ofs - 2] == 'M')))
|
||
{
|
||
period_ofs = rec.find('.', period_ofs + 1);
|
||
if (period_ofs == std::string::npos)
|
||
panic("Unable to find . char");
|
||
}
|
||
}
|
||
|
||
std::string location_str(rec);
|
||
location_str.resize(period_ofs);
|
||
string_trim(location_str);
|
||
|
||
rec.erase(0, period_ofs + 1);
|
||
string_trim(rec);
|
||
|
||
size_t time_period_ofs = rec.find_first_of('.');
|
||
if (time_period_ofs == std::string::npos)
|
||
panic("Unable to find , char");
|
||
|
||
std::string time_str(rec);
|
||
time_str.resize(time_period_ofs);
|
||
string_trim(time_str);
|
||
|
||
rec.erase(0, time_period_ofs + 1);
|
||
string_trim(rec);
|
||
|
||
//printf("Rec: %u\n", total_recs + 1);
|
||
//printf("Location: %s\n", location_str.c_str());
|
||
//printf("Time: %s\n", time_str.c_str());
|
||
//printf("Desc: %s\n", rec.c_str());
|
||
|
||
std::string json_date;
|
||
if ((string_begins_with(date_str, "Spring")) || (string_begins_with(date_str, "Summer")))
|
||
{
|
||
json_date = date_str;
|
||
}
|
||
else
|
||
{
|
||
size_t date_space = date_str.find_first_of(' ');
|
||
if (date_space == std::string::npos)
|
||
panic("Unable to find space char");
|
||
|
||
size_t date_comma = date_str.find_first_of(',');
|
||
|
||
int month;
|
||
|
||
if (string_begins_with(date_str, "Sept."))
|
||
month = 8;
|
||
else
|
||
{
|
||
for (month = 0; month < 12; month++)
|
||
{
|
||
if (string_begins_with(date_str, g_months[month]))
|
||
break;
|
||
if (string_begins_with(date_str, g_full_months[month]))
|
||
break;
|
||
}
|
||
|
||
if (month == 12)
|
||
panic("Failed finding month");
|
||
}
|
||
|
||
char buf[256];
|
||
|
||
int day, year;
|
||
if (date_comma == std::string::npos)
|
||
{
|
||
year = atoi(date_str.c_str() + date_space + 1);
|
||
if ((year < 1900) || (year > 1969))
|
||
panic("Invalid year");
|
||
|
||
sprintf_s(buf, "%u/%u", month + 1, year);
|
||
}
|
||
else
|
||
{
|
||
day = atoi(date_str.c_str() + date_space + 1);
|
||
if ((day < 1) || (day > 31))
|
||
panic("Invalid day");
|
||
|
||
year = atoi(date_str.c_str() + date_comma + 1);
|
||
if ((year < 1900) || (year > 1969))
|
||
panic("Invalid year");
|
||
|
||
sprintf_s(buf, "%u/%u/%u", month + 1, day, year);
|
||
}
|
||
|
||
json_date = buf;
|
||
}
|
||
//printf("JSON Date: %s\n", date_str.c_str());
|
||
//printf("Date: %s\n", json_date.c_str());
|
||
|
||
fprintf(pOut_file, "{\n");
|
||
fprintf(pOut_file, " \"date\" : \"%s\",\n", json_date.c_str());
|
||
fprintf(pOut_file, " \"time\" : \"%s\",\n", time_str.c_str());
|
||
fprintf(pOut_file, " \"location\" : \"%s\",\n", escape_string_for_json(location_str).c_str());
|
||
fprintf(pOut_file, " \"desc\" : \"%s\",\n", escape_string_for_json(rec).c_str());
|
||
fprintf(pOut_file, " \"source_id\" : \"BerlinerBBU_%u\",\n", total_recs);
|
||
fprintf(pOut_file, " \"source\" : \"BerlinerBBUnknowns\",\n");
|
||
fprintf(pOut_file, " \"ref\" : \"[BlueBook Unknowns PDF](https://github.com/richgel999/uap_resources/blob/main/bluebook_uncensored_unknowns_don_berliner.pdf)\",\n");
|
||
fprintf(pOut_file, " \"type\" : \"ufo sighting\"\n");
|
||
fprintf(pOut_file, "}");
|
||
if (cur_line < lines.size())
|
||
fprintf(pOut_file, ",");
|
||
fprintf(pOut_file, "\n");
|
||
|
||
total_recs++;
|
||
}
|
||
}
|
||
|
||
fprintf(pOut_file, "] }\n");
|
||
fclose(pOut_file);
|
||
|
||
return true;
|
||
}
|
||
|
||
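// Converts a date string from the Hall text (e.g. "Sept. 24, 1952" or "July 1947") into the M/D/YYYY or
// M/YYYY form used in the JSON output; strings beginning with Spring, Summer, Early, or Late are passed
// through unchanged. Illustrative results, per the parsing below:
//
//   convert_hall_to_json_date("Sept. 24, 1952");  // -> "9/24/1952"
//   convert_hall_to_json_date("Spring 1950");     // -> "Spring 1950"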
static std::string convert_hall_to_json_date(const std::string& date_str)
|
||
{
|
||
std::string json_date;
|
||
|
||
if ((string_begins_with(date_str, "Spring")) ||
|
||
(string_begins_with(date_str, "Summer")) ||
|
||
(string_begins_with(date_str, "Late")) ||
|
||
(string_begins_with(date_str, "Early")))
|
||
{
|
||
json_date = date_str;
|
||
}
|
||
else
|
||
{
|
||
size_t date_space = date_str.find_first_of(' ');
|
||
if (date_space == std::string::npos)
|
||
panic("Unable to find space char");
|
||
|
||
size_t date_comma = date_str.find_first_of(',');
|
||
|
||
int month;
|
||
|
||
if (string_begins_with(date_str, "Sept."))
|
||
month = 8;
|
||
else
|
||
{
|
||
for (month = 0; month < 12; month++)
|
||
{
|
||
if (string_begins_with(date_str, g_months[month]))
|
||
break;
|
||
if (string_begins_with(date_str, g_full_months[month]))
|
||
break;
|
||
}
|
||
|
||
if (month == 12)
|
||
panic("Failed finding month");
|
||
}
|
||
|
||
char buf[256];
|
||
|
||
int day, year;
|
||
if (date_comma == std::string::npos)
|
||
{
|
||
year = atoi(date_str.c_str() + date_space + 1);
|
||
if ((year < 1900) || (year > 2000))
|
||
panic("Invalid year");
|
||
|
||
sprintf_s(buf, "%u/%u", month + 1, year);
|
||
}
|
||
else
|
||
{
|
||
day = atoi(date_str.c_str() + date_space + 1);
|
||
if ((day < 1) || (day > 31))
|
||
panic("Invalid day");
|
||
|
||
year = atoi(date_str.c_str() + date_comma + 1);
|
||
if ((year < 1900) || (year > 2000))
|
||
panic("Invalid year");
|
||
|
||
sprintf_s(buf, "%u/%u/%u", month + 1, day, year);
|
||
}
|
||
|
||
json_date = buf;
|
||
}
|
||
return json_date;
|
||
}
|
||
|
||
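// Converts ufo_evidence_hall.txt (Richard Hall, "The UFO Evidence") into ufo_evidence_hall.json. Each
// record starts on a "date; location; description" line; following lines without a ';' are appended to
// the description. A leading "to <date>." in the description becomes end_date, a trailing "(Section ...)"
// cross-reference is dropped, and when USE_OPENAI is enabled each record is sent to OpenAI for
// categorization; otherwise the type defaults to ["sighting"].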
bool convert_hall()
|
||
{
|
||
string_vec lines;
|
||
|
||
if (!read_text_file("ufo_evidence_hall.txt", lines, true, nullptr))
|
||
panic("Can't read file ufo_evidence_hall.txt");
|
||
|
||
uint32_t cur_line = 0;
|
||
|
||
uint32_t total_recs = 0;
|
||
|
||
FILE* pOut_file = ufopen("ufo_evidence_hall.json", "w");
|
||
if (!pOut_file)
|
||
panic("Can't open output file ufo_evidence_hall.json");
|
||
|
||
fputc(UTF8_BOM0, pOut_file);
|
||
fputc(UTF8_BOM1, pOut_file);
|
||
fputc(UTF8_BOM2, pOut_file);
|
||
|
||
fprintf(pOut_file, "{\n");
|
||
fprintf(pOut_file, "\"UFOEvidenceHall Timeline\" : [\n");
|
||
|
||
const uint32_t MAX_RECS = 2000;
|
||
|
||
while (cur_line < lines.size())
|
||
{
|
||
std::string rec(lines[cur_line]);
|
||
cur_line++;
|
||
|
||
string_trim(rec);
|
||
if (rec.empty())
|
||
panic("Encountered empty line");
|
||
|
||
size_t first_semi_ofs = rec.find_first_of(';');
|
||
if (first_semi_ofs == std::string::npos)
|
||
panic("Can't first first semi");
|
||
|
||
size_t second_semi_ofs = rec.find_first_of(';', first_semi_ofs + 1);
|
||
if (second_semi_ofs == std::string::npos)
|
||
panic("Can't find second semi");
|
||
|
||
while (cur_line < lines.size())
|
||
{
|
||
std::string l(lines[cur_line]);
|
||
string_trim(l);
|
||
|
||
if (!l.size())
|
||
panic("Encountered empty line");
|
||
|
||
size_t semi_ofs = l.find_first_of(';');
|
||
if (semi_ofs != std::string::npos)
|
||
break;
|
||
|
||
cur_line++;
|
||
|
||
if (rec.size() && rec.back() == '-')
|
||
rec += l;
|
||
else
|
||
{
|
||
rec += ' ';
|
||
rec += l;
|
||
}
|
||
}
|
||
|
||
if (rec.size())
|
||
{
|
||
//printf("%u. %s\n", total_recs + 1, rec.c_str());
|
||
|
||
std::string date_str(rec);
|
||
date_str.resize(first_semi_ofs);
|
||
string_trim(date_str);
|
||
|
||
rec.erase(0, first_semi_ofs + 1);
|
||
string_trim(rec);
|
||
|
||
second_semi_ofs = rec.find_first_of(';');
|
||
if (second_semi_ofs == std::string::npos)
|
||
panic("Can't find second semi");
|
||
|
||
std::string location_str(rec);
|
||
location_str.resize(second_semi_ofs);
|
||
string_trim(location_str);
|
||
if (location_str.size() && location_str.back() == '.')
|
||
location_str.pop_back();
|
||
|
||
rec.erase(0, second_semi_ofs + 1);
|
||
string_trim(rec);
|
||
|
||
//uprintf("Rec: %u\n", total_recs + 1);
|
||
//uprintf("Date: %s\n", date_str.c_str());
|
||
//uprintf("Location: %s\n", location_str.c_str());
|
||
//uprintf("Desc: %s\n", rec.c_str());
|
||
|
||
std::string json_date(convert_hall_to_json_date(date_str));
|
||
std::string json_end_date;
|
||
|
||
//uprintf("JSON Date: %s\n", date_str.c_str());
|
||
//uprintf("Date: %s\n", json_date.c_str());
|
||
|
||
if (string_begins_with(rec, "to "))
|
||
{
|
||
size_t dot_ofs = rec.find_first_of('.');
|
||
|
||
if (dot_ofs == std::string::npos)
|
||
panic("Invalid to date");
|
||
|
||
std::string to_date(rec);
|
||
to_date.resize(dot_ofs);
|
||
to_date.erase(0, 3);
|
||
string_trim(to_date);
|
||
|
||
rec.erase(0, dot_ofs + 1);
|
||
string_trim(rec);
|
||
|
||
json_end_date = convert_hall_to_json_date(to_date);
|
||
}
|
||
|
||
size_t k = rec.find_last_of('(');
|
||
if (k != std::string::npos)
|
||
{
|
||
if ((string_begins_with(rec.c_str() + k, "(Section ")) ||
|
||
(string_begins_with(rec.c_str() + k, "(section ")) ||
|
||
(string_begins_with(rec.c_str() + k, "(Sections ")) ||
|
||
(string_begins_with(rec.c_str() + k, "(sections ")))
|
||
{
|
||
rec.erase(k);
|
||
string_trim(rec);
|
||
}
|
||
}
|
||
|
||
#if USE_OPENAI
|
||
std::string prompt_str("Best categorize the following quoted text into one or more of these categories as a json array: sighting, landing, natural phenomenom, newspaper article, report or memo, official, abduction, medical, occupant or alien or creature encounter, or historical event: \"");
|
||
|
||
prompt_str += rec;
|
||
prompt_str += "\"";
|
||
|
||
std::string type_str;
|
||
bool status = invoke_openai(prompt_str, type_str);
|
||
if (!status)
|
||
panic("invoke_openai failed!\n");
|
||
|
||
for (size_t i = 0; i < type_str.size(); i++)
|
||
{
|
||
uint8_t c = type_str[i];
|
||
|
||
if ((c >= 32) && (c <= 127))
|
||
type_str[i] = (char)tolower(c);
|
||
}
|
||
#else
|
||
std::string type_str("[\"sighting\"]");
|
||
#endif
|
||
|
||
fprintf(pOut_file, "{\n");
|
||
fprintf(pOut_file, " \"date\" : \"%s\",\n", json_date.c_str());
|
||
if (json_end_date.size())
|
||
fprintf(pOut_file, " \"end_date\" : \"%s\",\n", json_end_date.c_str());
|
||
fprintf(pOut_file, " \"location\" : \"%s\",\n", escape_string_for_json(location_str).c_str());
|
||
fprintf(pOut_file, " \"desc\" : \"%s\",\n", escape_string_for_json(rec).c_str());
|
||
fprintf(pOut_file, " \"source_id\" : \"HallUFOE2_%u\",\n", total_recs);
|
||
fprintf(pOut_file, " \"source\" : \"HallUFOEvidence2\",\n");
|
||
fprintf(pOut_file, " \"ref\" : \"[The UFO Evidence by Richard H. Hall](https://www.amazon.com/UFO-Evidence-Richard-Hall/dp/0760706271)\",\n");
|
||
if (type_str.size())
|
||
fprintf(pOut_file, " \"type\" : %s\n", type_str.c_str());
|
||
|
||
fprintf(pOut_file, "}");
|
||
if (cur_line < lines.size())
|
||
fprintf(pOut_file, ",");
|
||
fprintf(pOut_file, "\n");
|
||
|
||
#if USE_OPENAI
|
||
fflush(pOut_file);
|
||
#endif
|
||
|
||
total_recs++;
|
||
if (total_recs == MAX_RECS)
|
||
break;
|
||
}
|
||
}
|
||
|
||
fprintf(pOut_file, "] }\n");
|
||
fclose(pOut_file);
|
||
|
||
return true;
|
||
}
|
||
|
||
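// URLs flagged as bad (presumably dead or unreliable); fix_bar_urls() below rewrites references to any of
// these so that they go through the Wayback Machine instead.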
static const char* g_bad_urls[] =
|
||
{
|
||
"https://www.thelivingmoon.com/41pegasus/12insiders/McDonnell%5FDouglas%5FUFO%5FStudies.html",
|
||
"https://ufo.com.br/artigos/relatos---piloto-se-arrisca-em-prova-de-fogo-no-parana.html",
|
||
"http://boblazardebunked.com/investigating-s4-e115/",
|
||
"http://boblazardebunked.com/real-area-51-tech/",
|
||
"https://history.nebraska.gov/sites/history.nebraska.gov/files/doc/publications/NH1979UFOs.pdf",
|
||
"http://alienhunter.org/about/becoming-the-alien-hunter/",
|
||
"http://brumac.8k.com/ChristmasTree/",
|
||
"http://dl.lilibook.ir/2016/03/Other-Tongues-Other-Flesh.pdf",
|
||
"http://documents.irevues.inist.fr/bitstream/handle/2042/51980/meteo%5F1995%5F11%5F8.pdf",
|
||
"http://files.afu.se/Downloads/Magazines/United%20Kingdom/%20Merseyside%20UFO%20Bulletin/Merseyside%20UFO%20Bulletin%20-%20Vol%201%20No%201%20-%201968.pdf",
|
||
"http://ufologie.patrickgross.org/1954/19oct1954criteuil.htm",
|
||
"http://www.bluebookarchive.org/page.aspx?PageCode=NARA-PBB85-812",
|
||
"http://www.bluebookarchive.org/page.aspx?PageCode=NARA-PBB85-813",
|
||
"http://www.bluebookarchive.org/page.aspx?PageCode=NARA-PBB85-816",
|
||
"http://www.bluebookarchive.org/page.aspx?PageCode=NARA-PBB92-607",
|
||
"http://www.bluebookarchive.org/page.aspx?pagecode=NARA-PBB90-354",
|
||
"http://www.cheniere.org/",
|
||
"http://www.historycommons.org/entity.jsp?entity=james%5Fo%5F%5Fconnell",
|
||
"http://www.islandone.org/LEOBiblio/SETI1.HTM",
|
||
"http://www.nicap.org/france74.gif",
|
||
"http://www.phils.com.au/hollanda.htm",
|
||
"http://www.spellconsulting.com/reality/Norway%5FSpiral.html",
|
||
"http://www.wsmr-history.org/HallOfFame52.htm",
|
||
"http://www.zanoverallsongs.com/bio/",
|
||
"https://area51specialprojects.com/area51%5Ftimeline.html",
|
||
"https://area51specialprojects.com/u2%5Fpilots.html",
|
||
"https://cieloinsolito.com/?p=724",
|
||
"https://cieloinsolito.com/wp-content/uploads/2019/09/heretheyare.pdf",
|
||
"https://clubdeleonescuernavacaac.club/1965-1966-joaquin-diaz-gonzalez/",
|
||
"https://dpo.tothestarsacademy.com/blog/crada-faq",
|
||
"https://files.afu.se/Downloads/Magazines/Switzerland/Weltraumbote/Weltraumbote%20-%20No%2001%20-%201955-1956.pdf",
|
||
"https://files.afu.se/Downloads/UFO%20reports/Scandinavia/GR%20translations/46-02-21%20Finland%20meteor/1946-02-21%20Report%20Meteor%20over%20Finland%20and%20east%20Sweden.docx",
|
||
"https://mauriziobaiata.net/2011/11/10/intervista-ad-eugenio-siragusa-1919-2006-la-verita-non-si-vende-e-non-si-compra/",
|
||
"https://psi-encyclopedia.spr.ac.uk/articles/helene-smith",
|
||
"https://trinitysecret.com/the-other-lessons-of-trinity/",
|
||
"https://ufologie.patrickgross.org/1954/17oct1954cier.htm",
|
||
"https://ufologie.patrickgross.org/1954/1oct1954bry.htm",
|
||
"https://ufologie.patrickgross.org/1954/4oct1954poncey.htm",
|
||
"https://ufoscoop.com/richard-c-doty/",
|
||
"https://worldhistoryproject.org/1948/12/20/project-twinkle-established-to-monitor-green-fireball-sightings",
|
||
"https://www.abqjournal.com/obits/profiles/0722057profiles11-07-09.htm",
|
||
"https://www.alternatewars.com/BBOW/ABC%5FWeapons/US%5FNuclear%5FStockpile.htm",
|
||
"https://www.amberley-books.com/community-james-p-templeton",
|
||
"https://www.barnesandnoble.com/w/creative-realism-rolf-alexander/1132618455",
|
||
"https://www.charlotteobserver.com/latest-news/article125658529.html",
|
||
"https://www.hotspotsz.com/former-reporter-recounts-ufo-tale/",
|
||
"https://www.modrall.com/attorney/r-e-thompson/",
|
||
"https://www.mondenouveau.fr/presence-extraterrestre-ummo-une-imposture-ou-pas/",
|
||
"https://www.ourstrangeplanet.com/the-san-luis-valley/guest-editorials/mad-cow-disease-and-cattle-mutilations/",
|
||
"https://www.realclearscience.com/articles/2017/12/02/how%5F17th%5Fcentury%5Fdreamers%5Fplanned%5Fto%5Freach%5Fthe%5Fmoon%5F110476.html",
|
||
"https://www.rhun.co.nz/files/cia/cia1/44%5Fcia%5Fciaall2.pdf",
|
||
"https://www.spacelegalissues.com/the-french-anti-ufo-municipal-law-of-1954/",
|
||
"https://www.thevoicebeforethevoid.net/incidentcomplaint-report-by-commander-44-missile-security-squadron-ellsworth-air-force-base-south-dakota/",
|
||
"https://www.uapsg.com/2020/07/argentina-ufo-declassification.html",
|
||
"https://www.webcitation.org/6mx4huFGk",
|
||
"https://www.webcitation.org/6mx4rfU20",
|
||
"https://www.webcitation.org/6mx5Youbh",
|
||
"https://zazenlife.com/2011/12/18/project-mk-ultra-the-c-i-as-experiments-with-mind-control/"
|
||
};
|
||
|
||
const uint32_t NUM_BAD_URLS = sizeof(g_bad_urls) / sizeof(g_bad_urls[0]);

static std::string fix_bar_urls(const std::string& url)
{
    for (uint32_t i = 0; i < NUM_BAD_URLS; i++)
        if (url == g_bad_urls[i])
            return "https://web.archive.org/web/100/" + url;

    return url;
}
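
// Illustrative only: for a URL on the g_bad_urls list the rewrite looks like
//   fix_bar_urls("http://www.cheniere.org/")
//     -> "https://web.archive.org/web/100/http://www.cheniere.org/"
// (a web.archive.org/web/<timestamp>/ URL resolves to the capture closest to that timestamp);
// any other URL is returned unchanged.

// Converts the five "ufo*.md" exports of Eberhart's "UFOs and Intelligence" chronology into eberhart.json.
// Year headings are normalized to "# <year>" lines, each entry is split at the dash between its date and
// description, the trailing parenthesized attribution becomes the ref, and locations are merged in from
// eberhart_openai.json by matching a CRC-32 of the description and begin date (see below).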
bool convert_eberhart(unordered_string_set& unique_urls)
|
||
{
|
||
string_vec lines;
|
||
|
||
if (!read_text_file("ufo1_199.md", lines, true, nullptr))
|
||
panic("Can't read file ufo_evidence_hall.txt");
|
||
|
||
if (!read_text_file("ufo200_399.md", lines, true, nullptr))
|
||
panic("Can't read file ufo_evidence_hall.txt");
|
||
|
||
if (!read_text_file("ufo400_599.md", lines, true, nullptr))
|
||
panic("Can't read file ufo_evidence_hall.txt");
|
||
|
||
if (!read_text_file("ufo600_906_1.md", lines, true, nullptr))
|
||
panic("Can't read file ufo_evidence_hall.txt");
|
||
|
||
if (!read_text_file("ufo600_906_2.md", lines, true, nullptr))
|
||
panic("Can't read file ufo_evidence_hall.txt");
|
||
|
||
json eberhart_openai;
|
||
bool utf8_flag;
|
||
if (!load_json_object("eberhart_openai.json", utf8_flag, eberhart_openai))
|
||
panic("Failed loading eberhart_openai.json");
|
||
|
||
if (eberhart_openai.find("results") == eberhart_openai.end())
|
||
panic("Couldn't find results");
|
||
|
||
const auto& openai_res = eberhart_openai["results"];
|
||
if (!openai_res.is_array())
|
||
panic("Couldn't find results");
|
||
|
||
std::unordered_map<uint32_t, std::vector<uint32_t> > openai_res_hash;
|
||
|
||
for (uint32_t l = 0; l < openai_res.size(); l++)
|
||
{
|
||
const auto& rec = openai_res[l];
|
||
if (!rec.contains("event_crc32") || !rec.contains("event_date_str") || !rec.contains("locations"))
|
||
panic("Invalid OpenAI JSON data");
|
||
|
||
std::vector<uint32_t> list;
|
||
list.push_back(l);
|
||
|
||
auto res = openai_res_hash.insert(std::make_pair(rec["event_crc32"].get<uint32_t>(), list));
|
||
if (!res.second)
|
||
(res.first)->second.push_back(l);
|
||
}
|
||
|
||
string_vec useful_locs;
|
||
if (!read_text_file("eberhart_useful_locations.txt", useful_locs, true, nullptr))
|
||
panic("failed reading eberhart_useful_locations");
|
||
|
||
unordered_string_set useful_locs_set;
|
||
for (const auto& str : useful_locs)
|
||
useful_locs_set.insert(str);
|
||
|
||
string_vec trimmed_lines;
|
||
for (uint32_t i = 0; i < lines.size(); i++)
|
||
{
|
||
std::string s(lines[i]);
|
||
string_trim(s);
|
||
if (s.size())
|
||
trimmed_lines.push_back(s);
|
||
}
|
||
lines.swap(trimmed_lines);
|
||
|
||
uint32_t cur_line = 0;
|
||
|
||
string_vec new_lines;
|
||
while (cur_line < lines.size())
|
||
{
|
||
std::string line(lines[cur_line]);
|
||
|
||
bool all_digits = true;
|
||
for (uint32_t i = 0; i < line.size(); i++)
|
||
{
|
||
if (!isdigit((uint8_t)line[i]))
|
||
{
|
||
all_digits = false;
|
||
break;
|
||
}
|
||
}
|
||
|
||
if (line == "* * *")
|
||
{
|
||
}
|
||
else if (all_digits && ((line.size() == 3) || (line.size() == 4)))
|
||
{
|
||
char buf[256];
|
||
sprintf_s(buf, "# %s", line.c_str());
|
||
new_lines.push_back(buf);
|
||
}
|
||
else if (line == "#")
|
||
{
|
||
if ((cur_line + 1) >= lines.size())
|
||
panic("Out of lines");
|
||
|
||
if (!isdigit((char)lines[cur_line + 1][0]))
|
||
panic("Can't fine year");
|
||
|
||
char buf[256];
|
||
sprintf_s(buf, "# %s", lines[cur_line + 1].c_str());
|
||
new_lines.push_back(buf);
|
||
cur_line++;
|
||
}
|
||
else if ((line.size() >= 5) && (line[0] == '*') && (line[1] == '*'))
|
||
{
|
||
uint32_t year = atoi(line.c_str() + 2);
|
||
|
||
if ((year < 800) || (year > 2050))
|
||
panic("Invalid year");
|
||
|
||
char buf[256];
|
||
sprintf_s(buf, "# %u", year);
|
||
new_lines.push_back(buf);
|
||
}
|
||
else
|
||
new_lines.push_back(line);
|
||
|
||
cur_line++;
|
||
}
|
||
lines.swap(new_lines);
|
||
|
||
//write_text_file("temp.txt", lines);
|
||
|
||
FILE* pOut_file = ufopen("eberhart.json", "w");
|
||
if (!pOut_file)
|
||
panic("Can't open output file eberhart.json");
|
||
|
||
fputc(UTF8_BOM0, pOut_file);
|
||
fputc(UTF8_BOM1, pOut_file);
|
||
fputc(UTF8_BOM2, pOut_file);
|
||
|
||
fprintf(pOut_file, "{\n");
|
||
fprintf(pOut_file, "\"Eberhart Timeline\" : [\n");
|
||
|
||
int cur_year = -1;
|
||
cur_line = 0;
|
||
|
||
uint32_t event_num = 0, total_unattributed = 0;
|
||
uint32_t total_openai_recs_found = 0;
|
||
|
||
string_vec location_strs;
|
||
|
||
while (cur_line < lines.size())
|
||
{
|
||
std::string line(lines[cur_line]);
|
||
|
||
cur_line++;
|
||
|
||
if (!line.size())
|
||
continue;
|
||
|
||
if (line[0] == '#')
|
||
{
|
||
int year = atoi(line.c_str() + 1);
|
||
if ((year < 100) || (year > 2050))
|
||
panic("Invalid year");
|
||
|
||
cur_year = year;
|
||
|
||
continue;
|
||
}
|
||
|
||
size_t dash_pos = line.find(u8"—"); // dash between the date and description in the Eberhart markdown (assumed to be an em dash)
|
||
if (dash_pos == std::string::npos)
|
||
panic("Failed finding dash\n");
|
||
|
||
std::string date(line);
|
||
date.erase(dash_pos, date.size());
|
||
string_trim(date);
|
||
|
||
if (!date.size())
|
||
panic("Date too small");
|
||
|
||
line.erase(0, dash_pos + 3);
|
||
string_trim(line);
|
||
|
||
if (!line.size())
|
||
panic("Line too small");
|
||
|
||
event_date begin_date;
|
||
event_date end_date;
|
||
event_date alt_date;
|
||
bool year_status = event_date::parse_eberhart_date_range(date, begin_date, end_date, alt_date, cur_year);
|
||
if (!year_status)
|
||
panic("Date parse failed");
|
||
|
||
std::string desc(line);
|
||
while (cur_line < lines.size())
|
||
{
|
||
std::string temp(lines[cur_line]);
|
||
|
||
if (temp[0] == '#')
|
||
break;
|
||
|
||
size_t d = temp.find(u8"—"); // same (assumed) em dash separator as above
|
||
|
||
const uint32_t DASH_THRESH_POS = 42;
|
||
if ((d != std::string::npos) && (d < DASH_THRESH_POS))
|
||
{
|
||
std::string temp_date(temp);
|
||
temp_date.erase(d, temp_date.size());
|
||
string_trim(temp_date);
|
||
|
||
event_date b, e, a;
|
||
bool spec_year_status = event_date::parse_eberhart_date_range(temp_date, b, e, a, cur_year);
|
||
if (spec_year_status)
|
||
break;
|
||
}
|
||
|
||
if ((desc.size()) && (desc.back() != '-'))
|
||
desc += ' ';
|
||
|
||
if (!lines[cur_line].size())
|
||
panic("Unexpected empty line");
|
||
|
||
desc += lines[cur_line];
|
||
cur_line++;
|
||
}
|
||
|
||
std::string ref;
|
||
|
||
{
|
||
markdown_text_processor mt;
|
||
mt.init_from_markdown(desc.c_str());
|
||
|
||
mt.fix_redirect_urls();
|
||
|
||
for (uint32_t i = 0; i < mt.m_links.size(); i++)
|
||
unique_urls.insert(mt.m_links[i]);
|
||
|
||
for (auto& str : mt.m_links)
|
||
str = fix_bar_urls(str);
|
||
|
||
markdown_text_processor a, b;
|
||
bool status = mt.split_last_parens(a, b);
|
||
|
||
if (!status)
|
||
{
|
||
uprintf("Failing text: %s\n", desc.c_str());
|
||
|
||
panic("Unable to find attribution in record %u", event_num);
|
||
}
|
||
|
||
std::string a_md, b_md;
|
||
|
||
desc.resize(0);
|
||
a.convert_to_markdown(desc, true);
|
||
|
||
b.convert_to_markdown(ref, true);
|
||
|
||
if (ref == "\\(\\)")
|
||
{
|
||
ref.clear();
|
||
total_unattributed++;
|
||
}
|
||
else if (ref.size() >= 4)
|
||
{
|
||
if ((ref[0] == '\\') &&
|
||
(ref[1] == '(') &&
|
||
(ref.back() == ')') &&
|
||
(ref[ref.size() - 2] == '\\')
|
||
)
|
||
{
|
||
ref.erase(0, 2);
|
||
|
||
ref.pop_back();
|
||
ref.pop_back();
|
||
}
|
||
}
|
||
}
|
||
|
||
#if 0
|
||
uprintf("## Event %u\n", event_num);
|
||
|
||
uprintf("Date: %s \n", date.c_str());
|
||
|
||
uprintf("(%i %i/%i/%i)-(%i %i,%i,%i) alt: (%i %i,%i,%i) \n",
|
||
begin_date.m_prefix, begin_date.m_month, begin_date.m_day, begin_date.m_year,
|
||
end_date.m_prefix, end_date.m_month, end_date.m_day, end_date.m_year,
|
||
alt_date.m_prefix, alt_date.m_month, alt_date.m_day, alt_date.m_year);
|
||
|
||
uprintf("%s \n\n", desc.c_str());
|
||
#endif
|
||
fprintf(pOut_file, "{\n");
|
||
|
||
std::string json_date(begin_date.get_string()), json_end_date(end_date.get_string()), json_alt_date(alt_date.get_string());
|
||
|
||
fprintf(pOut_file, " \"date\" : \"%s\",\n", json_date.c_str());
|
||
|
||
if (json_end_date.size())
|
||
fprintf(pOut_file, " \"end_date\" : \"%s\",\n", json_end_date.c_str());
|
||
|
||
if (json_alt_date.size())
|
||
fprintf(pOut_file, " \"alt_date\" : \"%s\",\n", json_alt_date.c_str());
|
||
|
||
fprintf(pOut_file, " \"desc\" : \"%s\",\n", escape_string_for_json(desc).c_str());
|
||
fprintf(pOut_file, " \"source_id\" : \"Eberhart_%u\",\n", event_num);
|
||
|
||
fprintf(pOut_file, " \"source\" : \"EberhartUFOI\",\n");
|
||
|
||
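// The locations in eberhart_openai.json are keyed by a CRC-32 folded over the description text and the
// begin date's year/month/day fields; recompute that key here to find the matching record, then keep only
// the locations that also appear in eberhart_useful_locations.txt.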
uint32_t hash = crc32((const uint8_t*)desc.c_str(), desc.size());
|
||
hash = crc32((const uint8_t*)&begin_date.m_year, sizeof(begin_date.m_year), hash);
|
||
hash = crc32((const uint8_t*)&begin_date.m_month, sizeof(begin_date.m_month), hash);
|
||
hash = crc32((const uint8_t*)&begin_date.m_day, sizeof(begin_date.m_day), hash);
|
||
|
||
auto find_res = openai_res_hash.find(hash);
|
||
if (find_res != openai_res_hash.end())
|
||
{
|
||
const std::vector<uint32_t>& list = find_res->second;
|
||
|
||
for (uint32_t l = 0; l < list.size(); l++)
|
||
{
|
||
const uint32_t rec_index = list[l];
|
||
|
||
const auto& rec = openai_res[rec_index];
|
||
|
||
if (!rec.contains("event_crc32") || !rec.contains("event_date_str") || !rec.contains("locations"))
|
||
panic("Invalid OpenAI JSON data");
|
||
|
||
if (rec["event_crc32"] != hash)
|
||
panic("hash failed");
|
||
|
||
if (rec["event_date_str"] != json_date)
|
||
continue;
|
||
|
||
const auto& loc = rec["locations"];
|
||
if (loc.size())
|
||
{
|
||
uint32_t total_useful_locs = 0;
|
||
for (uint32_t k = 0; k < loc.size(); k++)
|
||
{
|
||
if (useful_locs_set.find(loc[k]) != useful_locs_set.end())
|
||
total_useful_locs++;
|
||
}
|
||
|
||
if (total_useful_locs)
|
||
{
|
||
fprintf(pOut_file, " \"location\" : [ ");
|
||
|
||
uint32_t total_useful_locs_printed = 0;
|
||
|
||
for (uint32_t k = 0; k < loc.size(); k++)
|
||
{
|
||
if (useful_locs_set.find(loc[k]) != useful_locs_set.end())
|
||
{
|
||
if (total_useful_locs_printed)
|
||
fprintf(pOut_file, ", ");
|
||
|
||
fprintf(pOut_file, "\"%s\"", escape_string_for_json(loc[k]).c_str());
|
||
|
||
total_useful_locs_printed++;
|
||
}
|
||
else
|
||
{
|
||
location_strs.push_back(loc[k]);
|
||
}
|
||
}
|
||
|
||
fprintf(pOut_file, " ],\n");
|
||
}
|
||
}
|
||
|
||
total_openai_recs_found++;
|
||
break;
|
||
}
|
||
}
|
||
|
||
if (!ref.size())
|
||
{
|
||
fprintf(pOut_file, " \"ref\" : \"[Eberhart](http://www.cufos.org/pdfs/UFOsandIntelligence.pdf)\"\n");
|
||
}
|
||
else
|
||
{
|
||
fprintf(pOut_file, " \"ref\" : [ \"[Eberhart](http://www.cufos.org/pdfs/UFOsandIntelligence.pdf)\", \"%s\" ]\n", escape_string_for_json(ref).c_str());
|
||
}
|
||
|
||
fprintf(pOut_file, "}");
|
||
|
||
if (cur_line < lines.size())
|
||
fprintf(pOut_file, ",");
|
||
|
||
fprintf(pOut_file, "\n");
|
||
|
||
event_num++;
|
||
}
|
||
|
||
write_text_file("rejected_location_strs.txt", location_strs, true);
|
||
|
||
fprintf(pOut_file, "] }\n");
|
||
fclose(pOut_file);
|
||
|
||
uprintf("Total records: %u\n", event_num);
|
||
uprintf("Total unattributed: %u\n", total_unattributed);
|
||
uprintf("Total OpenAI recs found: %u\n", total_openai_recs_found);
|
||
|
||
return true;
|
||
}
|
||
|
||
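// Converts Donald Johnson's "On This Day" pages (Johnson/<Month><DD>.txt, one file per calendar day,
// saved as '|'-delimited table rows) into johnson.json. Table markup, non-breaking spaces, and en dashes
// are normalized, everything from the trailing "Written by Donald A. Johnson..." / copyright boilerplate
// on is dropped, records are split on lines that start with "YYYY - ", and a final "(Source: ...)" or
// "(Sources: ...)" is pulled out as the ref.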
bool convert_johnson()
|
||
{
|
||
static const uint32_t days_in_month[] = { 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 };
|
||
|
||
FILE* pCombined_file = nullptr;
|
||
|
||
#if 0
|
||
pCombined_file = ufopen("combined.txt", "wb");
|
||
if (!pCombined_file)
|
||
panic("Can't open combined.txt");
|
||
|
||
fputc(UTF8_BOM0, pCombined_file);
|
||
fputc(UTF8_BOM1, pCombined_file);
|
||
fputc(UTF8_BOM2, pCombined_file);
|
||
#endif
|
||
|
||
FILE* pOut_file = ufopen("johnson.json", "w");
|
||
if (!pOut_file)
|
||
panic("Can't open output file johnson.json");
|
||
|
||
fputc(UTF8_BOM0, pOut_file);
|
||
fputc(UTF8_BOM1, pOut_file);
|
||
fputc(UTF8_BOM2, pOut_file);
|
||
|
||
fprintf(pOut_file, "{\n");
|
||
fprintf(pOut_file, "\"Johnson Timeline\" : [\n");
|
||
|
||
string_vec combined_lines;
|
||
|
||
uint32_t total_recs = 0;
|
||
|
||
for (uint32_t month = 0; month < 12; month++)
|
||
{
|
||
for (uint32_t day = 0; day < days_in_month[month]; day++)
|
||
{
|
||
char buf[256];
|
||
sprintf_s(buf, "Johnson/%s%02u.txt", g_full_months[month], day + 1);
|
||
|
||
string_vec lines;
|
||
|
||
bool utf8_flag = false;
|
||
bool status = read_text_file(buf, lines, true, &utf8_flag);
|
||
if (!status)
|
||
panic("Can't open file %s\n", buf);
|
||
|
||
uprintf("Read file %s %u\n", buf, utf8_flag);
|
||
|
||
string_vec filtered_lines;
|
||
bool found_end = false;
|
||
for (uint32_t line_index = 0; line_index < lines.size(); line_index++)
|
||
{
|
||
if ((lines[line_index].size() >= 2) && (lines[line_index][0] == '|') && (lines[line_index].back() == '|'))
|
||
{
|
||
std::string l(lines[line_index]);
|
||
|
||
l.erase(0, 1);
|
||
l.pop_back();
|
||
|
||
// Convert non-break spaces to spaces
|
||
std::string new_l;
|
||
for (uint32_t i = 0; i < l.size(); i++)
|
||
{
|
||
uint8_t c = (uint8_t)l[i];
|
||
if ((c == 0xC2) && ((i + 1) < l.size()) && ((uint8_t)l[i + 1] == 0xA0))
|
||
{
|
||
new_l.push_back(' ');
|
||
i++;
|
||
}
|
||
// EN DASH
|
||
else if ((c == 0xE2) && ((i + 2) < l.size()) && ((uint8_t)l[i + 1] == 0x80) && ((uint8_t)l[i + 2] == 0x93))
|
||
{
|
||
new_l.push_back('-');
|
||
i += 2;
|
||
}
|
||
else
|
||
new_l.push_back(l[i]);
|
||
}
|
||
|
||
l.swap(new_l);
|
||
|
||
string_trim(l);
|
||
|
||
// See if we're at the end of the page
|
||
if ((string_find_first(l, "Written by Donald A. Johnson") != -1) ||
|
||
(string_find_first(l, "Written by Donald Johnson") != -1) ||
|
||
(string_find_first(l, "Written by Donald A Johnson") != -1) ||
|
||
(string_find_first(l, "Compiled from the UFOCAT computer database") != -1) ||
|
||
(string_find_first(l, u8"© Donald A. Johnson") != -1) ||
|
||
(string_begins_with(l, "Themes: ")))
|
||
{
|
||
found_end = true;
|
||
break;
|
||
}
|
||
|
||
filtered_lines.push_back(l);
|
||
}
|
||
}
|
||
lines.swap(filtered_lines);
|
||
|
||
if (!found_end)
|
||
panic("Couldn't find end");
|
||
|
||
if (lines.size() < 6)
|
||
panic("File too small");
|
||
|
||
if ((lines[0] != "[On This Day]") || (string_find_first(lines[1], "Encounters with Aliens on this Day") == -1))
|
||
panic("Couldn't find beginning");
|
||
|
||
lines.erase(lines.begin(), lines.begin() + 5);
|
||
|
||
//for (uint32_t i = 0; i < lines.size(); i++)
|
||
// uprintf("%04u: \"%s\"\n", i, lines[i].c_str());
|
||
//uprintf("\n");
|
||
|
||
string_vec new_lines;
|
||
|
||
for (uint32_t i = 0; i < lines.size(); i++)
|
||
{
|
||
if (lines[i].size() &&
|
||
(lines[i].back() == ']') &&
|
||
((string_find_first(lines[i], "[") == -1) || (lines[i].front() == '[')))
|
||
{
|
||
int back_delta_pos = -1;
|
||
|
||
for (int back_delta = 0; back_delta <= 3; back_delta++)
|
||
{
|
||
if ((int)i < back_delta)
|
||
break;
|
||
if (lines[i - back_delta].size() && lines[i - back_delta].front() == '[')
|
||
{
|
||
back_delta_pos = i - back_delta;
|
||
break;
|
||
}
|
||
}
|
||
|
||
bool find_extra_text = true;
|
||
|
||
if (back_delta_pos == -1)
|
||
{
|
||
for (int back_delta = 0; back_delta <= 3; back_delta++)
|
||
{
|
||
if ((int)i < back_delta)
|
||
break;
|
||
if (string_find_first(lines[i - back_delta], "[") != -1)
|
||
{
|
||
back_delta_pos = i - back_delta;
|
||
break;
|
||
}
|
||
}
|
||
|
||
if (back_delta_pos == -1)
|
||
panic("Can't find back delta");
|
||
find_extra_text = false;
|
||
}
|
||
|
||
if ((back_delta_pos < 2) || lines[back_delta_pos - 1].size())
|
||
{
|
||
new_lines.push_back(lines[i]);
|
||
continue;
|
||
}
|
||
|
||
if (find_extra_text)
|
||
{
|
||
back_delta_pos -= 2;
|
||
|
||
while (back_delta_pos && lines[back_delta_pos].size())
|
||
{
|
||
back_delta_pos--;
|
||
}
|
||
}
|
||
|
||
uint32_t total_lines_to_erase = (i + 1) - (back_delta_pos + 1);
|
||
|
||
//for (uint32_t j = 0; j < total_lines_to_erase; j++)
|
||
// uprintf("\"%s\"\n", new_lines[(new_lines.size() - total_lines_to_erase) + j].c_str());
|
||
|
||
new_lines.erase(new_lines.begin() + (new_lines.size() - total_lines_to_erase), new_lines.end());
|
||
}
|
||
else
|
||
{
|
||
new_lines.push_back(lines[i]);
|
||
}
|
||
}
|
||
|
||
lines.swap(new_lines);
|
||
|
||
#if 0
|
||
for (uint32_t i = 0; i < lines.size(); i++)
|
||
{
|
||
fwrite(lines[i].c_str(), lines[i].size(), 1, pCombined_file);
|
||
fwrite("\r\n", 2, 1, pCombined_file);
|
||
}
|
||
#endif
|
||
|
||
// Sanity checks
|
||
for (uint32_t i = 0; i < lines.size(); i++)
|
||
{
|
||
const std::string& line = lines[i];
|
||
|
||
if (line.size() < 7)
|
||
continue;
|
||
|
||
if (line.find_first_of('|') != std::string::npos)
|
||
{
|
||
panic("Bad line");
|
||
}
|
||
|
||
if (isdigit((uint8_t)line[0]) &&
|
||
isdigit((uint8_t)line[1]) &&
|
||
isdigit((uint8_t)line[2]) &&
|
||
isdigit((uint8_t)line[3]) &&
|
||
(line[4] == ',') && (i >= 1) && (lines[i - 1].size() == 0))
|
||
{
|
||
panic("Bad line");
|
||
}
|
||
|
||
if (string_begins_with(line, "In ") &&
|
||
(i >= 1) && (lines[i - 1].size() == 0))
|
||
{
|
||
panic("Bad line");
|
||
}
|
||
|
||
if ((string_begins_with(line, "On the ") ||
|
||
string_begins_with(line, "On this ") ||
|
||
string_begins_with(line, "That same ") ||
|
||
string_begins_with(line, "At dusk ") ||
|
||
string_begins_with(line, "At dawn ") ||
|
||
string_begins_with(line, "There were ") ||
|
||
string_begins_with(line, "A UFO was seen ") || string_begins_with(line, "An abduction occurred ") ||
|
||
string_begins_with(line, "Also in ") ||
|
||
string_begins_with(line, "There were a ") ||
|
||
(string_begins_with(line, "At ") && line.size() >= 4 && isdigit((uint8_t)line[3]))) &&
|
||
(i >= 1) && (lines[i - 1].size() == 0))
|
||
{
|
||
panic("Bad line");
|
||
}
|
||
|
||
if (isdigit((uint8_t)line[0]) &&
|
||
isdigit((uint8_t)line[1]) &&
|
||
isdigit((uint8_t)line[2]) &&
|
||
isdigit((uint8_t)line[3]) &&
|
||
(line[4] == ' ') &&
|
||
(line[5] == '-') &&
|
||
(line[6] != ' '))
|
||
{
|
||
panic("Bad line");
|
||
}
|
||
|
||
if (isdigit((uint8_t)line[0]) &&
|
||
isdigit((uint8_t)line[1]) &&
|
||
isdigit((uint8_t)line[2]) &&
|
||
isdigit((uint8_t)line[3]) &&
|
||
(line[4] == '-') &&
|
||
(line[5] == ' '))
|
||
{
|
||
panic("Bad line");
|
||
}
|
||
}
|
||
|
||
uint32_t cur_line = 0;
|
||
int prev_year = -1;
|
||
|
||
while (cur_line < lines.size())
|
||
{
|
||
std::string first_line = lines[cur_line];
|
||
if (first_line.size() < 8)
|
||
panic("First line in block too small");
|
||
|
||
if (!isdigit((uint8_t)first_line[0]) ||
|
||
!isdigit((uint8_t)first_line[1]) ||
|
||
!isdigit((uint8_t)first_line[2]) ||
|
||
!isdigit((uint8_t)first_line[3]) ||
|
||
(first_line[4] != ' ') ||
|
||
(first_line[5] != '-') ||
|
||
(first_line[6] != ' '))
|
||
{
|
||
panic("Bad begin block");
|
||
}
|
||
|
||
int year = atoi(first_line.c_str());
|
||
if ((year < 1000) || (year > 2020))
|
||
panic("Invalid year");
|
||
|
||
if (year < prev_year)
|
||
panic("Year went backwards");
|
||
|
||
prev_year = year;
|
||
|
||
first_line.erase(0, 7);
|
||
|
||
std::string record_text(first_line);
|
||
|
||
cur_line++;
|
||
|
||
while (cur_line < lines.size())
|
||
{
|
||
const std::string& next_line = lines[cur_line];
|
||
|
||
if (isdigit((uint8_t)next_line[0]) &&
|
||
isdigit((uint8_t)next_line[1]) &&
|
||
isdigit((uint8_t)next_line[2]) &&
|
||
isdigit((uint8_t)next_line[3]) &&
|
||
(next_line[4] == ' ') &&
|
||
(next_line[5] == '-') &&
|
||
(next_line[6] == ' '))
|
||
{
|
||
break;
|
||
}
|
||
|
||
if (record_text.size())
|
||
{
|
||
if (next_line.size() == 0)
|
||
record_text += "<br/><br/>";
|
||
else if (record_text.back() != '-')
|
||
record_text.push_back(' ');
|
||
}
|
||
|
||
record_text += next_line;
|
||
|
||
cur_line++;
|
||
}
|
||
|
||
if (string_ends_in(record_text, "<br/><br/>"))
|
||
{
|
||
record_text.erase(record_text.begin() + record_text.size() - 10, record_text.end());
|
||
}
|
||
|
||
string_trim(record_text);
|
||
|
||
if (pCombined_file)
|
||
{
|
||
char buf2[256];
|
||
sprintf_s(buf2, "Record %u, year %i:\r\n", total_recs, year);
|
||
fwrite(buf2, strlen(buf2), 1, pCombined_file);
|
||
|
||
fwrite(&record_text[0], record_text.size(), 1, pCombined_file);
|
||
fwrite("\r\n", 2, 1, pCombined_file);
|
||
}
|
||
|
||
std::string ref;
|
||
|
||
// Extract reference(s) at end of record.
|
||
size_t src_pos = record_text.find("(Source:");
|
||
if (src_pos == std::string::npos)
|
||
src_pos = record_text.find("(Sources:");
|
||
|
||
if ((src_pos != std::string::npos) &&
|
||
((record_text.back() == ')') || ((record_text.back() == '.') && (record_text[record_text.size() - 2] == ')')))
|
||
)
|
||
{
|
||
ref = record_text;
|
||
|
||
ref.erase(0, src_pos);
|
||
|
||
if (ref.back() == '.')
|
||
ref.pop_back();
|
||
assert(ref.back() == ')');
|
||
ref.pop_back();
|
||
|
||
assert(ref[0] == '(');
|
||
ref.erase(0, 1);
|
||
|
||
if (string_begins_with(ref, "Source:"))
|
||
ref.erase(0, 7);
|
||
else
|
||
ref.erase(0, 8);
|
||
|
||
string_trim(ref);
|
||
|
||
record_text.erase(src_pos, record_text.size() - src_pos);
|
||
}
|
||
else
|
||
{
|
||
//uprintf("%s\n\n", record_text.c_str());
|
||
}
|
||
|
||
if (total_recs)
|
||
fprintf(pOut_file, ",\n");
|
||
|
||
fprintf(pOut_file, "{\n");
|
||
|
||
char buf2[256];
|
||
sprintf_s(buf2, "%u/%u/%u", month + 1, day + 1, year);
|
||
fprintf(pOut_file, " \"date\" : \"%s\",\n", buf2);
|
||
|
||
fprintf(pOut_file, " \"desc\" : \"%s\",\n", escape_string_for_json(record_text).c_str());
|
||
|
||
fprintf(pOut_file, " \"source_id\" : \"Johnson_%u\",\n", total_recs);
|
||
|
||
fprintf(pOut_file, " \"source\" : \"Johnson\",\n");
|
||
|
||
if (ref.size())
|
||
{
|
||
fprintf(pOut_file, " \"ref\" : [ \"[Johnson](https://web.archive.org/web/http://www.ufoinfo.com/onthisday/calendar.html)\", \"%s\" ]\n",
|
||
escape_string_for_json(ref).c_str());
|
||
}
|
||
else
|
||
fprintf(pOut_file, " \"ref\" : \"[Johnson](https://web.archive.org/web/http://www.ufoinfo.com/onthisday/calendar.html)\"\n");
|
||
|
||
fprintf(pOut_file, "}");
|
||
|
||
total_recs++;
|
||
}
|
||
|
||
if (pCombined_file)
|
||
fflush(pCombined_file);
|
||
}
|
||
}
|
||
|
||
fprintf(pOut_file, "\n] }\n");
|
||
fclose(pOut_file);
|
||
|
||
printf("Total records: %u\n", total_recs);
|
||
|
||
if (pCombined_file)
|
||
fclose(pCombined_file);
|
||
|
||
return true;
|
||
}
|
||
|
||
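// Spot checks for event_date::parse_eberhart_date_range(): each case returns false if the parse fails and
// uses assert() to verify the begin/end/alternate dates that come back; the last few cases confirm that
// malformed inputs are rejected.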
static bool test_eberhart_date()
|
||
{
|
||
event_date b, e, a;
|
||
|
||
if (!event_date::parse_eberhart_date_range("Late summer", b, e, a, 1970)) return false;
|
||
assert(b.m_prefix == cLateSummer && b.m_year == 1970 && b.m_month == -1 && b.m_day == -1);
|
||
assert(e.m_prefix == cNoPrefix && e.m_year == -1 && e.m_month == -1 && e.m_day == -1);
|
||
assert(a.m_prefix == cNoPrefix && a.m_year == -1 && a.m_month == -1 && a.m_day == -1);
|
||
|
||
if (!event_date::parse_eberhart_date_range("Early winter", b, e, a, 1970)) return false;
|
||
assert(b.m_prefix == cEarlyWinter && b.m_year == 1970 && b.m_month == -1 && b.m_day == -1);
|
||
assert(e.m_prefix == cNoPrefix && e.m_year == -1 && e.m_month == -1 && e.m_day == -1);
|
||
assert(a.m_prefix == cNoPrefix && a.m_year == -1 && a.m_month == -1 && a.m_day == -1);
|
||
|
||
if (!event_date::parse_eberhart_date_range("Mid autumn", b, e, a, 1970)) return false;
|
||
assert(b.m_prefix == cMidAutumn && b.m_year == 1970 && b.m_month == -1 && b.m_day == -1);
|
||
assert(e.m_prefix == cNoPrefix && e.m_year == -1 && e.m_month == -1 && e.m_day == -1);
|
||
assert(a.m_prefix == cNoPrefix && a.m_year == -1 && a.m_month == -1 && a.m_day == -1);
|
||
|
||
if (!event_date::parse_eberhart_date_range("Mid autumn 1970 (or Late summer)", b, e, a, 1970)) return false;
|
||
assert(b.m_prefix == cMidAutumn && b.m_year == 1970 && b.m_month == -1 && b.m_day == -1);
|
||
assert(e.m_prefix == cNoPrefix && e.m_year == -1 && e.m_month == -1 && e.m_day == -1);
|
||
assert(a.m_prefix == cLateSummer && a.m_year == 1970 && a.m_month == -1 && a.m_day == -1);
|
||
|
||
if (!event_date::parse_eberhart_date_range("Mid-December", b, e, a, 1970)) return false;
|
||
assert(b.m_prefix == cMiddleOf && b.m_year == 1970 && b.m_month == 12 && b.m_day == -1);
|
||
assert(e.m_prefix == cNoPrefix && e.m_year == -1 && e.m_month == -1 && e.m_day == -1);
|
||
assert(a.m_prefix == cNoPrefix && a.m_year == -1 && a.m_month == -1 && a.m_day == -1);
|
||
|
||
if (!event_date::parse_eberhart_date_range("December 16?", b, e, a, 1970)) return false;
|
||
assert(b.m_prefix == cNoPrefix && b.m_year == 1970 && b.m_month == 12 && b.m_day == 16 && b.m_fuzzy);
|
||
assert(e.m_prefix == cNoPrefix && e.m_year == -1 && e.m_month == -1 && e.m_day == -1);
|
||
assert(a.m_prefix == cNoPrefix && a.m_year == -1 && a.m_month == -1 && a.m_day == -1);
|
||
|
||
if (!event_date::parse_eberhart_date_range("Fall 1970", b, e, a, 1970)) return false;
|
||
assert(b.m_prefix == cFall && b.m_year == 1970 && b.m_month == -1 && b.m_day == -1);
|
||
assert(e.m_prefix == cNoPrefix && e.m_year == -1 && e.m_month == -1 && e.m_day == -1);
|
||
assert(a.m_prefix == cNoPrefix && a.m_year == -1 && a.m_month == -1 && a.m_day == -1);
|
||
|
||
if (!event_date::parse_eberhart_date_range("January 27, 1970", b, e, a, 1970)) return false;
|
||
assert(b.m_prefix == cNoPrefix && b.m_year == 1970 && b.m_month == 1 && b.m_day == 27);
|
||
assert(e.m_prefix == cNoPrefix && e.m_year == -1 && e.m_month == -1 && e.m_day == -1);
|
||
assert(a.m_prefix == cNoPrefix && a.m_year == -1 && a.m_month == -1 && a.m_day == -1);
|
||
|
||
if (!event_date::parse_eberhart_date_range("January 1970", b, e, a, 1970)) return false;
|
||
assert(b.m_prefix == cNoPrefix && b.m_year == 1970 && b.m_month == 1 && b.m_day == -1);
|
||
assert(e.m_prefix == cNoPrefix && e.m_year == -1 && e.m_month == -1 && e.m_day == -1);
|
||
assert(a.m_prefix == cNoPrefix && a.m_year == -1 && a.m_month == -1 && a.m_day == -1);
|
||
|
||
if (!event_date::parse_eberhart_date_range("March-June", b, e, a, 1970)) return false;
|
||
assert(b.m_prefix == cNoPrefix && b.m_year == 1970 && b.m_month == 3 && b.m_day == -1);
|
||
assert(e.m_prefix == cNoPrefix && e.m_year == 1970 && e.m_month == 6 && e.m_day == -1);
|
||
assert(a.m_prefix == cNoPrefix && a.m_year == -1 && a.m_month == -1 && a.m_day == -1);
|
||
|
||
if (!event_date::parse_eberhart_date_range("Summer or Fall", b, e, a, 1970)) return false;
|
||
assert(b.m_prefix == cSummer && b.m_year == 1970 && b.m_month == -1 && b.m_day == -1);
|
||
assert(e.m_prefix == cNoPrefix && e.m_year == -1 && e.m_month == -1 && e.m_day == -1);
|
||
assert(a.m_prefix == cFall && a.m_year == 1970 && a.m_month == -1 && a.m_day == -1);
|
||
|
||
if (!event_date::parse_eberhart_date_range("Summer-Fall or Winter", b, e, a, 1970)) return false;
|
||
assert(b.m_prefix == cSummer && b.m_year == 1970 && b.m_month == -1 && b.m_day == -1);
|
||
assert(e.m_prefix == cFall && e.m_year == 1970 && e.m_month == -1 && e.m_day == -1);
|
||
assert(a.m_prefix == cWinter && a.m_year == 1970 && a.m_month == -1 && a.m_day == -1);
|
||
|
||
if (!event_date::parse_eberhart_date_range("January 5 or March 6", b, e, a, 1970)) return false;
|
||
assert(b.m_prefix == cNoPrefix && b.m_year == 1970 && b.m_month == 1 && b.m_day == 5);
|
||
assert(e.m_prefix == cNoPrefix && e.m_year == -1 && e.m_month == -1 && e.m_day == -1);
|
||
assert(a.m_prefix == cNoPrefix && a.m_year == 1970 && a.m_month == 3 && a.m_day == 6);
|
||
|
||
if (!event_date::parse_eberhart_date_range("Late January 1970-Summer 1971", b, e, a, 1970)) return false;
|
||
    assert(b.m_prefix == cLate && b.m_year == 1970 && b.m_month == 1 && b.m_day == -1);
    assert(e.m_prefix == cSummer && e.m_year == 1971 && e.m_month == -1 && e.m_day == -1);
    assert(a.m_prefix == cNoPrefix && a.m_year == -1 && a.m_month == -1 && a.m_day == -1);

    if (!event_date::parse_eberhart_date_range("January 5 or 6", b, e, a, 1970)) return false;
    assert(b.m_prefix == cNoPrefix && b.m_year == 1970 && b.m_month == 1 && b.m_day == 5);
    assert(e.m_prefix == cNoPrefix && e.m_year == -1 && e.m_month == -1 && e.m_day == -1);
    assert(a.m_prefix == cNoPrefix && a.m_year == 1970 && a.m_month == 1 && a.m_day == 6);

    if (!event_date::parse_eberhart_date_range("January 20-25", b, e, a, 1970)) return false;
    assert(b.m_prefix == cNoPrefix && b.m_year == 1970 && b.m_month == 1 && b.m_day == 20);
    assert(e.m_prefix == cNoPrefix && e.m_year == 1970 && e.m_month == 1 && e.m_day == 25);
    assert(a.m_prefix == cNoPrefix && a.m_year == -1 && a.m_month == -1 && a.m_day == -1);

    if (!event_date::parse_eberhart_date_range("Summer", b, e, a, 1970)) return false;
    assert(b.m_prefix == cSummer && b.m_year == 1970 && b.m_month == -1 && b.m_day == -1);
    assert(e.m_prefix == cNoPrefix && e.m_year == -1 && e.m_month == -1 && e.m_day == -1);
    assert(a.m_prefix == cNoPrefix && a.m_year == -1 && a.m_month == -1 && a.m_day == -1);

    if (!event_date::parse_eberhart_date_range("Summer-Winter", b, e, a, 1970)) return false;
    assert(b.m_prefix == cSummer && b.m_year == 1970 && b.m_month == -1 && b.m_day == -1);
    assert(e.m_prefix == cWinter && e.m_year == 1970 && e.m_month == -1 && e.m_day == -1);
    assert(a.m_prefix == cNoPrefix && a.m_year == -1 && a.m_month == -1 && a.m_day == -1);

    if (!event_date::parse_eberhart_date_range("Early", b, e, a, 1970)) return false;
    assert(b.m_prefix == cEarly && b.m_year == 1970 && b.m_month == -1 && b.m_day == -1);
    assert(e.m_prefix == cNoPrefix && e.m_year == -1 && e.m_month == -1 && e.m_day == -1);
    assert(a.m_prefix == cNoPrefix && a.m_year == -1 && a.m_month == -1 && a.m_day == -1);

    if (!event_date::parse_eberhart_date_range("1970", b, e, a, 1970)) return false;
    assert(b.m_prefix == cNoPrefix && b.m_year == 1970 && b.m_month == -1 && b.m_day == -1);
    assert(e.m_prefix == cNoPrefix && e.m_year == -1 && e.m_month == -1 && e.m_day == -1);
    assert(a.m_prefix == cNoPrefix && a.m_year == -1 && a.m_month == -1 && a.m_day == -1);

    if (!event_date::parse_eberhart_date_range("January 20-February 25", b, e, a, 1970)) return false;
    assert(b.m_prefix == cNoPrefix && b.m_year == 1970 && b.m_month == 1 && b.m_day == 20);
    assert(e.m_prefix == cNoPrefix && e.m_year == 1970 && e.m_month == 2 && e.m_day == 25);
    assert(a.m_prefix == cNoPrefix && a.m_year == -1 && a.m_month == -1 && a.m_day == -1);

    if (!event_date::parse_eberhart_date_range("February 25, 1970", b, e, a, 1970)) return false;
    assert(b.m_prefix == cNoPrefix && b.m_year == 1970 && b.m_month == 2 && b.m_day == 25);
    assert(e.m_prefix == cNoPrefix && e.m_year == -1 && e.m_month == -1 && e.m_day == -1);
    assert(a.m_prefix == cNoPrefix && a.m_year == -1 && a.m_month == -1 && a.m_day == -1);

    if (!event_date::parse_eberhart_date_range("January 20-February 25, 1971", b, e, a, 1970)) return false;
    assert(b.m_prefix == cNoPrefix && b.m_year == 1970 && b.m_month == 1 && b.m_day == 20);
    assert(e.m_prefix == cNoPrefix && e.m_year == 1971 && e.m_month == 2 && e.m_day == 25);
    assert(a.m_prefix == cNoPrefix && a.m_year == -1 && a.m_month == -1 && a.m_day == -1);

    if (!event_date::parse_eberhart_date_range("January 20 1970-February 25, 1971", b, e, a, 1970)) return false;
    assert(b.m_prefix == cNoPrefix && b.m_year == 1970 && b.m_month == 1 && b.m_day == 20);
    assert(e.m_prefix == cNoPrefix && e.m_year == 1971 && e.m_month == 2 && e.m_day == 25);
    assert(a.m_prefix == cNoPrefix && a.m_year == -1 && a.m_month == -1 && a.m_day == -1);

    // These should all fail
    if (event_date::parse_eberhart_date_range("January 20q 1970-February 25, 1971", b, e, a, 1970)) return false;
    if (event_date::parse_eberhart_date_range("Januaryq 20 1970-February 25, 1971", b, e, a, 1970)) return false;
    if (event_date::parse_eberhart_date_range("", b, e, a, 1970)) return false;
    if (event_date::parse_eberhart_date_range(" ", b, e, a, 1970)) return false;
    if (event_date::parse_eberhart_date_range(" , ", b, e, a, 1970)) return false;

    return true;
}

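// Prints a string with newline and 0x01 characters replaced by spaces, so
// multi-line text can be dumped on a single console line.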
static void print_nocr(const std::string& s)
{
    std::string new_string;

    for (uint32_t i = 0; i < s.size(); i++)
    {
        if ((s[i] != '\n') && (s[i] != 1))
            new_string.push_back(s[i]);
        else
            new_string.push_back(' ');
    }

    uprintf("%s", new_string.c_str());
}

//-------------------------------------------------------------------

static void converters_test()
{
    std::string blah;
    blah.push_back(ANSI_SOFT_HYPHEN);

#if 0
    // should print a dash (code page 1252 - ANSI Latin 1)
    putc((char)ANSI_SOFT_HYPHEN, stdout);
    // should print a dash
    uprintf("%s\n", wchar_to_utf8(utf8_to_wchar(blah, CP_ACP)).c_str());
#endif

    //fprintf(u8"“frightening vision”");
    //ufprintf(stderr, u8"“frightening vision”");
    assert(crc32((const uint8_t*)"TEST", 4) == 0xeeea93b8);
    assert(crc32((const uint8_t*)"408tdsfjdsfjsdh893!;", 20) == 0xa044e016);
    if (!test_eberhart_date()) return panic("test_eberhart_date failed!");

// rg hack hack
|
||
#if 0
|
||
//const char *p = "_Hello, [world](http://www.google.com)_ <br/><br/>This is a _test._ **COOL**\nBlah Blah\nZA ZB ZC \nZD ZE EF\nHDR\nThis is a test\n\nPara 1\n\nPara 2";
|
||
//const char* p = "Hello, [**world**](http://www.google.com). \nThis is a test.\n\nNew \\*paragraph\\*.";
|
||
//const char* p = "<br/><br/>[_B_](WWW.A.COM) **[C](WWW.B.COM)**<br/><br/>This is a test<br/><br/>Blah \nBlah \n\nNew (This is a test!).";
|
||
|
||
//bufprintf(pIn, "A\nB \nC\n_This is a blah_[XXXX](YYYY(S))");
|
||
|
||
//const char* p = u8R"(Chemist [Gustaf Ljunggren](https://www.google.com/url?q=https://en.wikipedia.org/wiki/Gustaf_Ljunggren_(chemist)&sa=D&source=editors&ust=1674889728009134&usg=AOvVaw2v_Cymx15I5Ic1eNEYeeBr)<29>of the Swedish National Defense Research Institute summarizes for the Swedish Defense staff his analysis of 27 finds of mysterious substances, allegedly from ghost rockets. None are radioactive and all have mundane explanations. (Anders Liljegren and Clas Svahn, <20>The Ghost Rockets,<2C> UFOs 1947<34>1987, Fortean Tomes, 1987, pp. 33<33>34))";
|
||
// const char* p = u8R"(Blah
|
||
//English clergyman and philosopher [_John Wilkins_](https://www.google.com/url?q=https://en.wikipedia.org/wiki/John_Wilkins&sa=D&source=editors&ust=1674889727243386&usg=AOvVaw1hw56rPPqRvDJzjdV0g8Zb) writes The Discovery of a World in the Moone, in which he highlights the similarities of the Earth and the Moon (seas, mountains, atmosphere) and concludes that the Moon is likely to be inhabited by living beings, whom the calls <20>Selenites.<2E> (Maria Avxentevskaya, <20>[How 17th Century](https://www.google.com/url?q=https://www.realclearscience.com/articles/2017/12/02/how_17th_century_dreamers_planned_to_reach_the_moon_110476.html&sa=D&source=editors&ust=1674889727243765&usg=AOvVaw13_nH4qqo0LYqJqnhq4_eI)<29>[Dreamers Planned to Reach the Moon,](https://www.google.com/url?q=https://www.realclearscience.com/articles/2017/12/02/how_17th_century_dreamers_planned_to_reach_the_moon_110476.html&sa=D&source=editors&ust=1674889727244030&usg=AOvVaw2K5FMN315Pjxq_xO7wp7Ga)<29> <br/><br/>Real Clear Science, December 2, 2017) )";
|
||
|
||
//const char* p = u8R"(Pierre Lagrange, <20>[_Agobard, la Magonie et les ovnis_,](https://www.google.com/url?q=https://pierrelagrangesociologie.files.wordpress.com/2020/08/lagrange-agobard-magonie-ufologie-lhistoire-440-2017-10-p28-29.pdf&sa=D&source=editors&ust=1674889727239396&usg=AOvVaw1U01Ykx3tRTQS4QKENJuGi)<29> Actualit<69>, no. 440 (October 2017): 28<32>29; Wikipedia, <20>[Magonia (mythology)](https://www.google.com/url?q=https://en.wikipedia.org/wiki/Magonia_(mythology)&sa=D&source=editors&ust=1674889727239728&usg=AOvVaw0JOQanVKKoRClyKQPK5SJi)<29>))";
|
||
const char* p = "<br/>blah<br/>_[Agobard,](www.blah.com)_<br/> blah<br/>blah <br/>[_Agobard_,](www.blah.com)<br/>";
|
||
|
||
//const char* p = "***[sssss](www.dddd.com)*** _Blah_ *Cool*_Zeek_";
|
||
//const char* p = "P1\nP2 \nP3\n\nP4\nP5\nP6\n\nP7\nP8 **Blah** _[ZEEK](WWW.Z.COM)_";
|
||
|
||
uprintf("Original markdown: %s\n", p);
|
||
uprintf("----------\n");
|
||
|
||
markdown_text_processor mt;
|
||
mt.init_from_markdown(p);
|
||
uprintf("Internal text: %s\n", mt.m_text.c_str());
|
||
uprintf("----------\n");
|
||
|
||
std::string plain;
|
||
mt.convert_to_plain(plain, false);
|
||
uprintf("Plain text: %s\n", mt.m_text.c_str());
|
||
uprintf("----------\n");
|
||
|
||
#if 0
|
||
markdown_text_processor a, b;
|
||
bool status = mt.split_last_parens(a, b);
|
||
uprintf("status: %u\n", status);
|
||
|
||
std::string a_md, b_md;
|
||
a.convert_to_markdown(a_md);
|
||
b.convert_to_markdown(b_md);
|
||
|
||
uprintf("A:\n%s\n", a_md.c_str());
|
||
|
||
uprintf("----------\n");
|
||
uprintf("B:\n%s\n", b_md.c_str());
|
||
#endif
|
||
|
||
|
||
#if 1
|
||
std::string md;
|
||
mt.convert_to_markdown(md, false);
|
||
|
||
#if 0
|
||
uprintf("Plain text:\n");
|
||
|
||
for (uint32_t i = 0; i < mt.m_text.size(); i++)
|
||
{
|
||
char c = mt.m_text[i];
|
||
if (c == ' ')
|
||
uprintf(".");
|
||
else
|
||
uprintf("%c", (uint8_t)c);
|
||
}
|
||
|
||
uprintf("%s\n", mt.m_text.c_str());
|
||
|
||
uprintf("\n------\n");
|
||
#endif
|
||
|
||
uprintf("Converted back to markdown:\n");
|
||
|
||
#if 0
|
||
for (uint32_t i = 0; i < md.size(); i++)
|
||
{
|
||
char c = md[i];
|
||
if (c == ' ')
|
||
uprintf(".");
|
||
else
|
||
uprintf("%c", (uint8_t)c);
|
||
}
|
||
#endif
|
||
|
||
uprintf("\n%s\n", md.c_str());
|
||
|
||
uprintf("\n------\n");
|
||
|
||
#endif
|
||
|
||
exit(0);
|
||
#endif
|
||
}
|
||
|
||
enum
{
    cMonthFlag = 1,
    cBeginToLateFlag = 2,
    cSeasonFlag = 4,
    cApproxFlag = 8,
    cOrFlag = 16,
    cDashFlag = 32,
    cXXFlag = 64,
    cFuzzyFlag = 128,
    cSlashFlag = 256
};

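// Phrases recognized by the NICAP date tokenizer below. Each entry maps a lowercase
// phrase to a flag (and, where relevant, a month number or date prefix). Longer
// spellings are listed before shorter ones (e.g. "january" before "jan.") so that
// prefix matching picks up the longest form first.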
static const struct
{
    const char* m_pStr;
    uint32_t m_flag;
    uint32_t m_month;
    date_prefix_t m_date_prefix;
} g_special_phrases[] =
{
    { "january", cMonthFlag, 1 },
    { "february", cMonthFlag, 2 },
    { "march", cMonthFlag, 3 },
    { "april", cMonthFlag, 4 },
    { "may", cMonthFlag, 5 },
    { "june", cMonthFlag, 6 },
    { "july", cMonthFlag, 7 },
    { "august", cMonthFlag, 8 },
    { "september", cMonthFlag, 9 },
    { "october", cMonthFlag, 10 },
    { "november", cMonthFlag, 11 },
    { "december", cMonthFlag, 12 },

    { "jan.", cMonthFlag, 1 },
    { "feb.", cMonthFlag, 2 },
    { "mar.", cMonthFlag, 3 },
    { "apr.", cMonthFlag, 4 },
    { "may.", cMonthFlag, 5 },
    { "jun.", cMonthFlag, 6 },
    { "jul.", cMonthFlag, 7 },
    { "aug.", cMonthFlag, 8 },
    { "sep.", cMonthFlag, 9 },
    { "oct.", cMonthFlag, 10 },
    { "nov.", cMonthFlag, 11 },
    { "dec.", cMonthFlag, 12 },

    { "late", cBeginToLateFlag, 0, cLate },
    { "early", cBeginToLateFlag, 0, cEarly },
    { "middle", cBeginToLateFlag, 0, cMiddleOf },
    { "end", cBeginToLateFlag, 0, cEndOf },

    { "spring", cSeasonFlag, 0, cSpring },
    { "summer", cSeasonFlag, 0, cSummer },
    { "autumn", cSeasonFlag, 0, cAutumn },
    { "fall", cSeasonFlag, 0, cFall },
    { "winter", cSeasonFlag, 0, cWinter },
    { "wint", cSeasonFlag, 0, cWinter },

    { "approx", cApproxFlag },

    { "jan", cMonthFlag, 1 },
    { "feb", cMonthFlag, 2 },
    { "mar", cMonthFlag, 3 },
    { "apr", cMonthFlag, 4 },
    { "may", cMonthFlag, 5 },
    { "jun", cMonthFlag, 6 },
    { "jul", cMonthFlag, 7 },
    { "aug", cMonthFlag, 8 },
    { "sep", cMonthFlag, 9 },
    { "oct", cMonthFlag, 10 },
    { "nov", cMonthFlag, 11 },
    { "dec", cMonthFlag, 12 },

    { "mid", cBeginToLateFlag, 0, cMiddleOf },

    { "or", cOrFlag },
    { "xx", cXXFlag },
    { "-", cDashFlag },
    { "?", cFuzzyFlag },
    { "/", cSlashFlag }
};

const uint32_t NUM_SPECIAL_PHRASES = sizeof(g_special_phrases) / sizeof(g_special_phrases[0]);

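// Token indices for the phrases above; the order here must match g_special_phrases
// exactly (convert_nipcap_date() asserts cSpecialTotal == NUM_SPECIAL_PHRASES).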
enum
{
    cSpecialJan,
    cSpecialFeb,
    cSpecialMar,
    cSpecialApr,
    cSpecialMay,
    cSpecialJun,
    cSpecialJul,
    cSpecialAug,
    cSpecialSep,
    cSpecialOct,
    cSpecialNov,
    cSpecialDec,

    cSpecialJan2,
    cSpecialFeb2,
    cSpecialMar2,
    cSpecialApr2,
    cSpecialMay2,
    cSpecialJun2,
    cSpecialJul2,
    cSpecialAug2,
    cSpecialSep2,
    cSpecialOct2,
    cSpecialNov2,
    cSpecialDec2,

    cSpecialLate,
    cSpecialEarly,
    cSpecialMiddle,
    cSpecialEnd,

    cSpecialSpring,
    cSpecialSummer,
    cSpecialAutumn,
    cSpecialFall,
    cSpecialWinter,
    cSpecialWinter2,

    cSpecialApprox,

    cSpecialJan3,
    cSpecialFeb3,
    cSpecialMar3,
    cSpecialApr3,
    cSpecialMay3,
    cSpecialJun3,
    cSpecialJul3,
    cSpecialAug3,
    cSpecialSep3,
    cSpecialOct3,
    cSpecialNov3,
    cSpecialDec3,

    cSpecialMid,

    cSpecialOr,
    cSpecialXX,
    cSpecialDash,
    cSpecialFuzzy,
    cSpecialSlash,

    cSpecialTotal
};

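// Converts a negative token value produced by the tokenizer back into its
// cSpecial* phrase index; panics if the token isn't a special phrase.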
static int get_special_from_token(int64_t tok)
{
    if (tok >= 0)
        panic("Invalid token");

    int64_t spec = -tok - 1;
    if (spec >= cSpecialTotal)
        panic("Invalid token");

    return (int)spec;
}

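// Parses a date string from the NICAP DB listing into begin/end/alt event dates.
// Based on the cases handled below it accepts (examples are illustrative):
//   - All-digit dates: "500110" (YYMMDD) or "19500110" (YYYYMMDD), with 00 allowed for an unknown month/day.
//   - Year plus season: "47 summer", "1947 fall".
//   - Unknown fields written as "xx": "4409xx" (YYMM + xx), "47xxxx".
//   - Trailing qualifiers: "early", "mid", "late", "end", "approx", or "?" (fuzzy).
//   - Ranges with '-': "500110-12", "5001-5002", "500110-500212", "19500110-19500212".
//   - Alternates with "or": "50 or 51", "500110 or 11".
// Anything else (including spelled-out month names and '/' separators) is rejected by returning false.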
static bool convert_nipcap_date(std::string date, event_date& begin_date, event_date& end_date, event_date& alt_date)
|
||
{
|
||
assert(cSpecialTotal == NUM_SPECIAL_PHRASES);
|
||
|
||
const uint32_t MIN_YEAR = 1860;
|
||
const uint32_t MAX_YEAR = 2012;
|
||
|
||
string_trim(date);
|
||
|
||
bool is_all_digits = true;
|
||
|
||
for (char& c : date)
|
||
{
|
||
if (c < 0)
|
||
return false;
|
||
|
||
c = (char)tolower((uint8_t)c);
|
||
|
||
if (!isdigit(c))
|
||
is_all_digits = false;
|
||
}
|
||
|
||
if (is_all_digits)
|
||
{
|
||
// Handle most common, simplest cases.
|
||
if (date.size() == 6)
|
||
{
|
||
// YYMMDD
|
||
int year = convert_hex_digit(date[0]) * 10 + convert_hex_digit(date[1]);
|
||
int month = convert_hex_digit(date[2]) * 10 + convert_hex_digit(date[3]);
|
||
int day = convert_hex_digit(date[4]) * 10 + convert_hex_digit(date[5]);
|
||
|
||
if (year <= 8)
|
||
year += 2000;
|
||
else
|
||
year += 1900;
|
||
|
||
if (month > 12)
|
||
return false;
|
||
|
||
if (day > 31)
|
||
return false;
|
||
|
||
begin_date.m_year = year;
|
||
if (month != 0)
|
||
{
|
||
begin_date.m_month = month ? month : -1;
|
||
|
||
if (day != 0)
|
||
begin_date.m_day = day;
|
||
}
|
||
else
|
||
{
|
||
if (day != 0)
|
||
return false;
|
||
}
|
||
return true;
|
||
}
|
||
else if (date.size() == 8)
|
||
{
|
||
// YYYYMMDD
|
||
int year = convert_hex_digit(date[0]) * 1000 + convert_hex_digit(date[1]) * 100 + convert_hex_digit(date[2]) * 10 + convert_hex_digit(date[3]);
|
||
int month = convert_hex_digit(date[4]) * 10 + convert_hex_digit(date[5]);
|
||
int day = convert_hex_digit(date[6]) * 10 + convert_hex_digit(date[7]);
|
||
|
||
if ((year < MIN_YEAR) || (year > MAX_YEAR))
|
||
return false;
|
||
|
||
if (month > 12)
|
||
return false;
|
||
|
||
if (day > 31)
|
||
return false;
|
||
|
||
if (month == 0)
|
||
{
|
||
if (day != 0)
|
||
return false;
|
||
|
||
begin_date.m_year = year;
|
||
}
|
||
else if (day == 0)
|
||
{
|
||
begin_date.m_year = year;
|
||
begin_date.m_month = month ? month : -1;
|
||
}
|
||
else
|
||
{
|
||
begin_date.m_year = year;
|
||
begin_date.m_month = month ? month : -1;
|
||
begin_date.m_day = day;
|
||
}
|
||
return true;
|
||
}
|
||
else
|
||
return false;
|
||
}
|
||
|
||
    // Tokenize the input, then only parse the cases we explicitly support. Everything else is an error.
|
||
|
||
std::vector<int64_t> tokens;
|
||
std::vector<int> digits;
|
||
|
||
uint32_t special_flags = 0, cur_ofs = 0;
|
||
|
||
while (cur_ofs < date.size())
|
||
{
|
||
if (isdigit(date[cur_ofs]))
|
||
{
|
||
int64_t val = 0;
|
||
int num_digits = 0;
|
||
|
||
do
|
||
{
|
||
if (!isdigit(date[cur_ofs]))
|
||
break;
|
||
|
||
val = val * 10 + convert_hex_digit(date[cur_ofs++]);
|
||
if (val < 0)
|
||
return false;
|
||
|
||
num_digits++;
|
||
|
||
} while (cur_ofs < date.size());
|
||
|
||
tokens.push_back(val);
|
||
digits.push_back(num_digits);
|
||
}
|
||
else if (date[cur_ofs] == ' ')
|
||
{
|
||
cur_ofs++;
|
||
}
|
||
else
|
||
{
|
||
std::string cur_str(date.c_str() + cur_ofs);
|
||
|
||
int phrase_index;
|
||
for (phrase_index = 0; phrase_index < NUM_SPECIAL_PHRASES; phrase_index++)
|
||
if (string_begins_with(cur_str, g_special_phrases[phrase_index].m_pStr))
|
||
break;
|
||
|
||
if (phrase_index == NUM_SPECIAL_PHRASES)
|
||
return false;
|
||
|
||
tokens.push_back(-(phrase_index + 1));
|
||
digits.push_back(0);
|
||
|
||
cur_ofs += (uint32_t)strlen(g_special_phrases[phrase_index].m_pStr);
|
||
|
||
special_flags |= g_special_phrases[phrase_index].m_flag;
|
||
}
|
||
}
|
||
|
||
assert(tokens.size() == digits.size());
|
||
|
||
// Just not supporting slashes in here.
|
||
if (special_flags & cSlashFlag)
|
||
return false;
|
||
|
||
if (!tokens.size())
|
||
return false;
|
||
|
||
// First token must be a number
|
||
if ((digits[0] != 2) && (digits[0] != 4) && (digits[0] != 6) && (digits[0] != 8))
|
||
return false;
|
||
|
||
if (special_flags & cSeasonFlag)
|
||
{
|
||
// Either YYSeason or YYYYSeason
|
||
|
||
if (tokens.size() != 2)
|
||
return false;
|
||
|
||
int year = 0;
|
||
if (digits[0] == 2)
|
||
year = (int)tokens[0] + 1900;
|
||
else if (digits[0] == 4)
|
||
{
|
||
year = (int)tokens[0];
|
||
if ((year < MIN_YEAR) || (year > MAX_YEAR))
|
||
return false;
|
||
}
|
||
else
|
||
return false;
|
||
|
||
begin_date.m_year = year;
|
||
|
||
if (tokens[1] >= 0)
|
||
return false;
|
||
|
||
int64_t special_index = -tokens[1] - 1;
|
||
if (special_index >= NUM_SPECIAL_PHRASES)
|
||
return false;
|
||
|
||
begin_date.m_prefix = g_special_phrases[special_index].m_date_prefix;
|
||
|
||
return true;
|
||
}
|
||
else if (special_flags & cMonthFlag)
|
||
{
|
||
// Not supporting explicit month
|
||
return false;
|
||
}
|
||
|
||
// No explicit season and no month - handle XX's
|
||
if ((tokens.size() == 2) && (tokens[1] < 0) && (get_special_from_token(tokens[1]) == cSpecialXX))
|
||
{
|
||
if (digits[0] == 4)
|
||
{
|
||
// YYMMXX
|
||
int year = 1900 + (int)(tokens[0] / 100);
|
||
int month = (int)(tokens[0] % 100);
|
||
|
||
if (month > 12)
|
||
return false;
|
||
|
||
begin_date.m_year = year;
|
||
begin_date.m_month = month ? month : -1;
|
||
}
|
||
else if (digits[0] == 6)
|
||
{
|
||
// YYYYMMXX
|
||
|
||
int year = (int)(tokens[0] / 100);
|
||
if ((year < MIN_YEAR) || (year > MAX_YEAR))
|
||
return false;
|
||
|
||
int month = (int)(tokens[0] % 100);
|
||
if (month > 12)
|
||
return false;
|
||
|
||
begin_date.m_year = year;
|
||
begin_date.m_month = month ? month : -1;
|
||
}
|
||
else
|
||
{
|
||
return false;
|
||
}
|
||
|
||
return true;
|
||
}
|
||
else if ((tokens.size() == 3) && (tokens[1] < 0) && (tokens[2] < 0) && (get_special_from_token(tokens[1]) == cSpecialXX) && (get_special_from_token(tokens[2]) == cSpecialXX))
|
||
{
|
||
if (digits[0] == 2)
|
||
{
|
||
// YYXXXX
|
||
begin_date.m_year = (int)tokens[0] + 1900;
|
||
}
|
||
else if (digits[0] == 4)
|
||
{
|
||
// YYYYXXXX
|
||
begin_date.m_year = (int)tokens[0];
|
||
if ((begin_date.m_year < MIN_YEAR) || (begin_date.m_year > MAX_YEAR))
|
||
return false;
|
||
}
|
||
else
|
||
return false;
|
||
|
||
return true;
|
||
}
|
||
|
||
if (special_flags & cXXFlag)
|
||
return false;
|
||
|
||
if (digits[0] == 2)
|
||
{
|
||
// YY
|
||
begin_date.m_year = (int)tokens[0] + 1900;
|
||
}
|
||
else if (digits[0] == 4)
|
||
{
|
||
// YYMM
|
||
begin_date.m_year = (int)(tokens[0] / 100) + 1900;
|
||
|
||
begin_date.m_month = (int)(tokens[0] % 100);
|
||
if (begin_date.m_month > 12)
|
||
return false;
|
||
else if (!begin_date.m_month)
|
||
begin_date.m_month = -1;
|
||
}
|
||
else if (digits[0] == 6)
|
||
{
|
||
// YYMMDD
|
||
begin_date.m_year = (int)(tokens[0] / 10000) + 1900;
|
||
|
||
begin_date.m_month = (int)((tokens[0] / 100) % 100);
|
||
if (begin_date.m_month > 12)
|
||
return false;
|
||
else if (!begin_date.m_month)
|
||
begin_date.m_month = -1;
|
||
|
||
begin_date.m_day = (int)(tokens[0] % 100);
|
||
        if (begin_date.m_day > 31)
|
||
return false;
|
||
|
||
if ((begin_date.m_month == -1) && (begin_date.m_day))
|
||
return false;
|
||
}
|
||
else if (digits[0] == 8)
|
||
{
|
||
// YYYYMMDD
|
||
begin_date.m_year = (int)(tokens[0] / 10000);
|
||
if ((begin_date.m_year < MIN_YEAR) || (begin_date.m_year > MAX_YEAR))
|
||
return false;
|
||
|
||
begin_date.m_month = (int)((tokens[0] / 100) % 100);
|
||
if (begin_date.m_month > 12)
|
||
return false;
|
||
else if (!begin_date.m_month)
|
||
begin_date.m_month = -1;
|
||
|
||
begin_date.m_day = (int)(tokens[0] % 100);
|
||
        if (begin_date.m_day > 31)
|
||
return false;
|
||
|
||
if ((begin_date.m_month == -1) && (begin_date.m_day))
|
||
return false;
|
||
}
|
||
else
|
||
{
|
||
return false;
|
||
}
|
||
|
||
    if ((tokens.size() == 2) && (tokens[1] < 0) &&
        (((get_special_from_token(tokens[1]) >= cSpecialLate) && (get_special_from_token(tokens[1]) <= cSpecialEnd)) ||
        (get_special_from_token(tokens[1]) == cSpecialMid))
        )
|
||
{
|
||
// 2 tokens, ends in "late", "middle", "early" etc.
|
||
if (begin_date.m_day != -1)
|
||
return false;
|
||
|
||
if (tokens[1] >= 0)
|
||
return false;
|
||
|
||
int64_t special_index = -tokens[1] - 1;
|
||
if (special_index >= NUM_SPECIAL_PHRASES)
|
||
return false;
|
||
|
||
begin_date.m_prefix = g_special_phrases[special_index].m_date_prefix;
|
||
|
||
return true;
|
||
}
|
||
|
||
if (special_flags & cBeginToLateFlag)
|
||
return false;
|
||
|
||
if ((tokens.size() == 3) && (tokens[1] < 0) && (get_special_from_token(tokens[1]) == cSpecialDash) && (tokens[2] >= 0))
|
||
{
|
||
if ((digits[0] == 6) && (digits[2] == 2))
|
||
{
|
||
// YYMMDD-DD
|
||
end_date = begin_date;
|
||
|
||
if (tokens[2] > 31)
|
||
return false;
|
||
|
||
end_date.m_day = (int)tokens[2];
|
||
|
||
return true;
|
||
}
|
||
else if ((digits[0] == 4) && (digits[2] == 4))
|
||
{
|
||
// YYMM-YYMM
|
||
|
||
end_date.m_year = (int)(tokens[2] / 100) + 1900;
|
||
|
||
end_date.m_month = (int)(tokens[2] % 100);
|
||
if (end_date.m_month > 12)
|
||
return false;
|
||
else if (!end_date.m_month)
|
||
end_date.m_month = -1;
|
||
|
||
return true;
|
||
}
|
||
else if ((digits[0] == 6) && (digits[2] == 6))
|
||
{
|
||
// YYMMDD-YYMMDD
|
||
end_date.m_year = (int)(tokens[2] / 10000) + 1900;
|
||
|
||
end_date.m_month = (int)((tokens[2] / 100) % 100);
|
||
if (end_date.m_month > 12)
|
||
return false;
|
||
else if (!end_date.m_month)
|
||
end_date.m_month = -1;
|
||
|
||
end_date.m_day = (int)(tokens[2] % 100);
|
||
            if (end_date.m_day > 31)
|
||
return false;
|
||
|
||
return true;
|
||
}
|
||
else if ((digits[0] == 8) && (digits[2] == 8))
|
||
{
|
||
// YYYYMMDD-YYYYMMDD
|
||
end_date.m_year = (int)(tokens[2] / 10000);
|
||
if ((end_date.m_year < MIN_YEAR) || (end_date.m_year > MAX_YEAR))
|
||
return false;
|
||
|
||
end_date.m_month = (int)((tokens[2] / 100) % 100);
|
||
if (end_date.m_month > 12)
|
||
return false;
|
||
else if (!end_date.m_month)
|
||
end_date.m_month = -1;
|
||
|
||
end_date.m_day = (int)(tokens[2] % 100);
|
||
            if (end_date.m_day > 31)
|
||
return false;
|
||
|
||
if ((end_date.m_month == -1) && (end_date.m_day))
|
||
return false;
|
||
|
||
return true;
|
||
}
|
||
else
|
||
{
|
||
return false;
|
||
}
|
||
}
|
||
|
||
if (special_flags & cDashFlag)
|
||
return false;
|
||
|
||
    if ((tokens.size() == 2) && (tokens[1] < 0) && (get_special_from_token(tokens[1]) == cSpecialFuzzy))
|
||
{
|
||
begin_date.m_fuzzy = true;
|
||
return true;
|
||
}
|
||
|
||
if (special_flags & cFuzzyFlag)
|
||
return false;
|
||
|
||
    if ((tokens.size() == 3) && (tokens[1] < 0) && (get_special_from_token(tokens[1]) == cSpecialOr))
|
||
{
|
||
if ((digits[0] == 2) && (digits[2] == 2))
|
||
{
|
||
// YY or YY
|
||
alt_date.m_year = (int)tokens[2] + 1900;
|
||
|
||
return true;
|
||
}
|
||
else if ((digits[0] == 4) && (digits[2] == 4))
|
||
{
|
||
// YYMM or YYMM
|
||
alt_date.m_year = (int)(tokens[2] / 100) + 1900;
|
||
|
||
alt_date.m_month = (int)(tokens[2] % 100);
|
||
if (alt_date.m_month > 12)
|
||
return false;
|
||
else if (!alt_date.m_month)
|
||
alt_date.m_month = -1;
|
||
|
||
return true;
|
||
}
|
||
else if ((digits[0] == 6) && (digits[2] == 2))
|
||
{
|
||
// YYMMDD or DD
|
||
alt_date = begin_date;
|
||
|
||
if (tokens[2] > 31)
|
||
return false;
|
||
|
||
alt_date.m_day = (int)tokens[2];
|
||
|
||
return true;
|
||
}
|
||
else if ((digits[0] == 6) && (digits[2] == 6))
|
||
{
|
||
// YYMMDD or YYMMDD
|
||
alt_date.m_year = (int)(tokens[2] / 10000) + 1900;
|
||
|
||
alt_date.m_month = (int)((tokens[2] / 100) % 100);
|
||
if (alt_date.m_month > 12)
|
||
return false;
|
||
else if (!alt_date.m_month)
|
||
return false;
|
||
|
||
alt_date.m_day = (int)(tokens[2] % 100);
|
||
            if (alt_date.m_day > 31)
|
||
return false;
|
||
|
||
return true;
|
||
}
|
||
else
|
||
{
|
||
return false;
|
||
}
|
||
}
|
||
|
||
if (special_flags & cOrFlag)
|
||
return false;
|
||
|
||
if ((tokens.size() == 2) && ((tokens[1] < 0) && (get_special_from_token(tokens[1]) == cSpecialApprox)))
|
||
{
|
||
begin_date.m_approx = true;
|
||
return true;
|
||
}
|
||
|
||
if (special_flags & cApproxFlag)
|
||
return false;
|
||
|
||
if (tokens.size() > 1)
|
||
return false;
|
||
|
||
return true;
|
||
}
|
||
|
||
const int NICAP_FIRST_CAT = 1;
const int NICAP_LAST_CAT = 11;

static const char* g_nicap_categories[11] =
{
    "01 - Distant Encounters",
    "02 - Close Encounters",
    "03 - EME Cases",
    "04 - Animal Reactions",
    "05 - Medical Incidents",
    "06 - Trace Cases",
    "07 - Entity Cases",
    "08 - Photographic Cases",
    "09 - RADAR Cases",
    "10 - Nuclear Connection",
    "11 - Aviation Cases"
};

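// nicap.org page filenames whose links get rewritten to go through the Wayback
// Machine (https://web.archive.org/web/100/...) in convert_nicap() below.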
static const char* g_nicap_archive_urls[] =
|
||
{
|
||
"040228USS%5FSupplydir.htm",
|
||
"421019guadalcanaldir.htm",
|
||
"4409xxoakridge%5Fdir.htm",
|
||
"470116northseadir.htm",
|
||
"470117northseadir.htm",
|
||
"470828japandir.htm",
|
||
"490723delphidir.htm",
|
||
"500110lasvegasdir.htm",
|
||
"500227coultervilledir.htm",
|
||
"500309selfridgedir.htm",
|
||
"500322%5F4925dir.htm",
|
||
"500529mtvernondir.htm",
|
||
"510709dir.htm",
|
||
"520425dir.htm",
|
||
"520501georgedir.htm",
|
||
"520512roswelldir.htm",
|
||
"520526koreadir.htm",
|
||
"520619goosedir.htm",
|
||
"520719wnsdir.htm",
|
||
"520723jamestown%5F01%5Fdir.htm",
|
||
"520723jamestown%5F02%5Fdir.htm",
|
||
"520729dir.htm",
|
||
"520729langley%5Fdir.htm",
|
||
"5207xxdir.htm",
|
||
"520807sanantondir.htm",
|
||
"530213carswelldir.htm",
|
||
"530416eprairiedir.htm",
|
||
"530618iwodir.htm",
|
||
"531016presqudir.htm",
|
||
"531228marysvilledir.htm",
|
||
"571102level%5Fdir.htm",
|
||
"571106b.htm",
|
||
"571110dir.htm",
|
||
"580505dir.htm",
|
||
"640629dir.htm",
|
||
"640904dir.htm",
|
||
"650916dir.htm",
|
||
"660314dir.htm",
|
||
"730202dir.htm",
|
||
"731120mtvernon%5Fdir.htm",
|
||
"770701avianodir.htm",
|
||
"7710xxfostoriadir.htm",
|
||
"801227rendledir.htm",
|
||
"821004ukraine%5Fdir.htm",
|
||
"861117alaskadir.htm",
|
||
"870823dir.htm",
|
||
"aca%5F540621.htm",
|
||
"adickesdir.htm",
|
||
"alamo670302dir.htm",
|
||
"alamos500225Bdir.htm",
|
||
"alamos520722dir.htm",
|
||
"albany520226dir.htm",
|
||
"albuq490106dir.htm",
|
||
"albuq510825dir.htm",
|
||
"albuq520510dir.htm",
|
||
"albuq520528dir.htm",
|
||
"ar-570730dir.htm",
|
||
"ar-641030dir.htm",
|
||
"ar-770126.htm",
|
||
"ar-hubbarddir.htm",
|
||
"arlington470707dir.htm",
|
||
"ashley480408dir.htm",
|
||
"balt520721dir.htm",
|
||
"barradir.htm",
|
||
"benson520403dir.htm",
|
||
"bermuda490124dir.htm",
|
||
"blackstone480724docs1.htm",
|
||
"bonlee501023dir.htm",
|
||
"candir.htm",
|
||
"canogapark571111dir.htm",
|
||
"cashlandir.htm",
|
||
"cavecreek471014dir.htm",
|
||
"chamble480726docs1.htm",
|
||
"colosprings520709dir.htm",
|
||
"columbus520618dir.htm",
|
||
"compass.htm",
|
||
"contrexeville761214dir.htm",
|
||
"cortez490127dir.htm",
|
||
"costarica1dir.htm",
|
||
"coynedir.htm",
|
||
"daggett.htm",
|
||
"dayton471020dir.htm",
|
||
"dayton520712dir.htm",
|
||
"dillon490403dir.htm",
|
||
"dmonthan490516dir.htm",
|
||
"dodgeville4710xxdir.htm",
|
||
"elko490502dir.htm",
|
||
"ellsworth.htm",
|
||
"en-590626dir.htm",
|
||
"f-86dir.htm",
|
||
"fairchild520120dir.htm",
|
||
"fayette491228dir.htm",
|
||
"fortdixdir.htm",
|
||
"france57.htm",
|
||
"ftbliss490519dir.htm",
|
||
"ftmyers501206dir.htm",
|
||
"ftrich470917dir.htm",
|
||
"ftsumner470710dir.htm",
|
||
"fulda520602dir.htm",
|
||
"george520501dir.htm",
|
||
"glenburnie520329dir.htm",
|
||
"gormandir.htm",
|
||
"greenville480419dir.htm",
|
||
"harmondir.htm",
|
||
"hawaii.htm",
|
||
"hawesville870722dir.htm",
|
||
"hickam490104dir.htm",
|
||
"holloman491213dir.htm",
|
||
"hood490427dir.htm",
|
||
"houston520520dir.htm",
|
||
"hynekrv6.htm",
|
||
"indy480731dir.htm",
|
||
"keesler520507dir.htm",
|
||
"keywest501114dir.htm",
|
||
"kirtland500321dir.htm",
|
||
"kirtland800809dir.htm",
|
||
"knoxville500301dir.htm",
|
||
"lmeade520402dir.htm",
|
||
"longmead520417dir.htm",
|
||
"louisville520615dir.htm",
|
||
"lubbock520802dir.htm",
|
||
"lukedir.htm",
|
||
"lynn520701dir.htm",
|
||
"mainbrace.htm",
|
||
"malmstrom75dir.htm",
|
||
"maple480829dir.htm",
|
||
"marshall520429dir.htm",
|
||
"mcchord520617dir.htm",
|
||
"meromdir.htm",
|
||
"minn.htm",
|
||
"minot681028dir.htm",
|
||
"mobile520722dir.htm",
|
||
"mongodir.htm",
|
||
"monroe480528dir.htm",
|
||
"moses520501dir.htm",
|
||
"mtvernon.htm",
|
||
"mufon%5Farg0809%5F83.htm",
|
||
"mufon%5Fau0513%5F83.htm",
|
||
"mufon%5Fau0520%5F83.htm",
|
||
"mufon%5Fau0722%5F83.htm",
|
||
"mufon%5Fbo0302%5F83.htm",
|
||
"mufon%5Fbr0612%5F83.htm",
|
||
"mufon%5Fgb0119%5F83.htm",
|
||
"mufon%5Fgb0315%5F83.htm",
|
||
"mufon%5Fgb0812%5F83.htm",
|
||
"mufon%5Fmx0000%5F83.htm",
|
||
"mufon%5Fru0714%5F83.htm",
|
||
"mufon%5Fru0826%5F83.htm",
|
||
"musk5307XXdir.htm",
|
||
"naha520422dir.htm",
|
||
"nahant520723dir.htm",
|
||
"natcity520513dir.htm",
|
||
"ncp-oakridge501013.htm",
|
||
"nederland660206dir.htm",
|
||
"nellisdir.htm",
|
||
"nkorea520526dir.htm",
|
||
"norwooddir.htm",
|
||
"oakridge501105dir.htm",
|
||
"oakridge511207dir.htm",
|
||
"ontario520412dir.htm",
|
||
"osceola520725dir.htm",
|
||
"paintsville020114dir.htm",
|
||
"palomar491014dir.htm",
|
||
"palomar491125dir.htm",
|
||
"phoenix520405dir.htm",
|
||
"phoenix520808dir.htm",
|
||
"placerville470814dir.htm",
|
||
"plymouth500427dir.htm",
|
||
"pontiac520427dir.htm",
|
||
"portagedir.htm",
|
||
"portland470704dir.htm",
|
||
"pottsdown520723dir.htm",
|
||
"ramore530630dir.htm",
|
||
"rd-750503dir.htm",
|
||
"reno490905dir.htm",
|
||
"reseda570328dir.htm",
|
||
"rockville520722dir.htm",
|
||
"rogue490524dir.htm",
|
||
"roseville520427dir.htm",
|
||
"roswell520217dir.htm",
|
||
"russia551014dir.htm",
|
||
"sanacacia480717dir.htm",
|
||
"sandiego521217dir.htm",
|
||
"sands480405.htm",
|
||
"sands490424dir.htm",
|
||
"sanmarcos520721dir.htm",
|
||
"savannah520510dir.htm",
|
||
"selfridge511124dir.htm",
|
||
"sharondir.htm",
|
||
"shreve520416dir.htm",
|
||
"springer490425dir.htm",
|
||
"stmaries470703dir.htm",
|
||
"sts80.htm",
|
||
"tehrandir.htm",
|
||
"temple520406dir.htm",
|
||
"terredir.htm",
|
||
"tucson490428dir.htm",
|
||
"ubatubadir.htm",
|
||
"vaughn47latedir.htm",
|
||
"vaughn481103dir.htm",
|
||
"vincennes.htm",
|
||
"walnutlake520525dir.htm",
|
||
"washington520713dir.htm",
|
||
"wsands2dir.htm",
|
||
"yuma520417dir.htm",
|
||
"yuma520427dir.htm",
|
||
"zamoradir.htm",
|
||
"charleston1980.htm"
|
||
};
|
||
|
||
const uint32_t NUM_NICAP_ARCHIVE_URLS = sizeof(g_nicap_archive_urls) / sizeof(g_nicap_archive_urls[0]);
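// Converts the NICAP DB listing (nicap1_150.md and nicap151_334.md) to nicap_db.json.
// Each record is: a date line (optionally a markdown [date](url) link), one or more
// city lines, a state/country line, a category number (1-11), then the description
// lines, which may begin with optional NC/LC flags, a code letter, a Blue Book case
// marker ("BB"/"BBU" or a numeric id), and a 0-5 rating. Consecutive records with the
// same date and description are merged, keeping both category annotations.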
bool convert_nicap(unordered_string_set& unique_urls)
|
||
{
|
||
string_vec lines;
|
||
bool utf8_flag = false;
|
||
|
||
if (!read_text_file("nicap1_150.md", lines, true, &utf8_flag))
|
||
panic("Failed reading input file");
|
||
if (!utf8_flag)
|
||
panic("Expecting utf8 data");
|
||
|
||
if (!read_text_file("nicap151_334.md", lines, true, &utf8_flag))
|
||
panic("Failed reading input file");
|
||
if (!utf8_flag)
|
||
panic("Expecting utf8 data");
|
||
|
||
string_vec new_lines;
|
||
for (std::string& str : lines)
|
||
if (str.size())
|
||
new_lines.push_back(str);
|
||
|
||
lines.swap(new_lines);
|
||
if (!lines.size())
|
||
panic("File empty");
|
||
|
||
json js_doc = json::object();
|
||
js_doc["NICAP Data"] = json::array();
|
||
|
||
auto& js_doc_array = js_doc["NICAP Data"];
|
||
|
||
std::string prev_date, prev_orig_desc;
|
||
|
||
uint32_t num_repeated_recs = 0;
|
||
|
||
uint32_t record_index = 0;
|
||
uint32_t cur_line_index = 0;
|
||
while (cur_line_index < lines.size())
|
||
{
|
||
std::string date(lines[cur_line_index]);
|
||
if (!date.size())
|
||
panic("Invalid date");
|
||
|
||
cur_line_index++;
|
||
|
||
std::string ref;
|
||
|
||
if (date.front() == '[')
|
||
{
|
||
ref = date;
|
||
|
||
markdown_text_processor mtp;
|
||
mtp.init_from_markdown(ref.c_str());
|
||
|
||
mtp.fix_redirect_urls();
|
||
|
||
if (mtp.m_links.size())
|
||
{
|
||
for (uint32_t i = 0; i < mtp.m_links.size(); i++)
|
||
{
|
||
std::string& s = mtp.m_links[i];
|
||
uint32_t j;
|
||
for (j = 0; j < NUM_NICAP_ARCHIVE_URLS; j++)
|
||
if (string_find_first(s, g_nicap_archive_urls[j]) != -1)
|
||
break;
|
||
|
||
if (j < NUM_NICAP_ARCHIVE_URLS)
|
||
mtp.m_links[i] = "https://web.archive.org/web/100/" + mtp.m_links[i];
|
||
}
|
||
}
|
||
|
||
for (uint32_t i = 0; i < mtp.m_links.size(); i++)
|
||
unique_urls.insert(mtp.m_links[i]);
|
||
|
||
for (auto& str : mtp.m_links)
|
||
str = fix_bar_urls(str);
|
||
|
||
ref.clear();
|
||
mtp.convert_to_markdown(ref, true);
|
||
|
||
// 6+2+2+1 chars
|
||
if (date.size() < 11)
|
||
panic("Invalid date");
|
||
|
||
date.erase(0, 1);
|
||
|
||
uint32_t i;
|
||
bool found_end_bracket = false;
|
||
for (i = 0; i < date.size(); i++)
|
||
{
|
||
if (date[i] == ']')
|
||
{
|
||
date.erase(i, date.size() - i);
|
||
found_end_bracket = true;
|
||
break;
|
||
}
|
||
}
|
||
|
||
if (!found_end_bracket)
|
||
panic("Invalid date");
|
||
}
|
||
|
||
event_date begin_date, end_date, alt_date;
|
||
|
||
bool status = convert_nipcap_date(date, begin_date, end_date, alt_date);
|
||
if (!status)
|
||
panic("Failed converting NICAP date");
|
||
|
||
// Get first line of city
|
||
if (cur_line_index == lines.size())
|
||
panic("Out of lines");
|
||
|
||
std::string city(lines[cur_line_index++]);
|
||
if (!city.size() || (city == "* * *"))
|
||
panic("Expected city");
|
||
|
||
if (cur_line_index == lines.size())
|
||
panic("Failed parsing NICAP data");
|
||
|
||
// Look ahead for category, 1-11
|
||
int cat_index = -1;
|
||
int cat_line_index = -1;
|
||
|
||
for (uint32_t i = 0; i < 5; i++)
|
||
{
|
||
if ((cur_line_index + i) >= lines.size())
|
||
break;
|
||
|
||
const std::string& s = lines[cur_line_index + i];
|
||
if ((s.size() <= 2) && (string_is_digits(s)))
|
||
{
|
||
int val = atoi(s.c_str());
|
||
if ((val >= NICAP_FIRST_CAT) && (val <= NICAP_LAST_CAT))
|
||
{
|
||
cat_index = val;
|
||
cat_line_index = cur_line_index + i;
|
||
break;
|
||
}
|
||
}
|
||
}
|
||
if (cat_index == -1)
|
||
panic("Can't find category");
|
||
|
||
for (int i = cur_line_index; i < (cat_line_index - 1); i++)
|
||
city = combine_strings(city, lines[i]);
|
||
|
||
// Get state or country, which is right before the category
|
||
std::string state_or_country(lines[cat_line_index - 1]);
|
||
        if (!state_or_country.size() || (state_or_country == "* * *"))
|
||
panic("Expected state or country");
|
||
|
||
if (cur_line_index == lines.size())
|
||
panic("Failed parsing NICAP data");
|
||
|
||
cur_line_index = cat_line_index + 1;
|
||
|
||
string_vec rec;
|
||
|
||
while (cur_line_index < lines.size())
|
||
{
|
||
const std::string& str = lines[cur_line_index];
|
||
|
||
if (str == "* * *")
|
||
{
|
||
cur_line_index++;
|
||
|
||
break;
|
||
}
|
||
else if (str.size() && str[0] == '[')
|
||
break;
|
||
|
||
if ((str.size() >= 2) && isdigit((uint8_t)str[0]) && isdigit((uint8_t)str[1]))
|
||
{
|
||
event_date next_begin_date, next_end_date, next_alt_date;
|
||
bool spec_status = convert_nipcap_date(str, next_begin_date, next_end_date, next_alt_date);
|
||
if (spec_status)
|
||
break;
|
||
}
|
||
|
||
rec.push_back(str);
|
||
cur_line_index++;
|
||
}
|
||
|
||
if (!rec.size())
|
||
panic("Failed parsing record");
|
||
|
||
std::string code, bb;
|
||
int rating = -1;
|
||
bool nc_flag = false, lc_flag = false;
|
||
|
||
for (int i = 0; i < std::min(5, (int)rec.size()); i++)
|
||
{
|
||
if (rec[i] == "NC")
|
||
{
|
||
nc_flag = true;
|
||
rec.erase(rec.begin() + i);
|
||
break;
|
||
}
|
||
}
|
||
|
||
for (int i = 0; i < std::min(5, (int)rec.size()); i++)
|
||
{
|
||
if (rec[i] == "LC")
|
||
{
|
||
lc_flag = true;
|
||
rec.erase(rec.begin() + i);
|
||
break;
|
||
}
|
||
}
|
||
|
||
if (!rec.size())
|
||
panic("Invalid record");
|
||
|
||
if (string_is_alpha(rec[0]) && (rec[0].size() <= 2))
|
||
{
|
||
if ((rec[0] == "A") || (rec[0] == "R") || (rec[0] == "En") || (rec[0] == "An") || (rec[0] == "C") ||
|
||
(rec[0] == "E") || (rec[0] == "D") || (rec[0] == "M") || (rec[0] == "I") || (rec[0] == "U") || (rec[0] == "N") || (rec[0] == "T") || (rec[0] == "aM"))
|
||
{
|
||
code = rec[0];
|
||
rec.erase(rec.begin());
|
||
|
||
if (!rec.size())
|
||
panic("Invalid record");
|
||
}
|
||
else
|
||
{
|
||
panic("Unknown code");
|
||
}
|
||
}
|
||
|
||
int i, j = std::min(5, (int)rec.size());
|
||
for (i = 0; i < j; i++)
|
||
if ((rec[i] == "BBU") || (rec[i] == "BB"))
|
||
break;
|
||
|
||
bool found_bbu = false;
|
||
if (i < j)
|
||
{
|
||
bb = rec[i];
|
||
rec.erase(rec.begin() + i);
|
||
|
||
if (!rec.size())
|
||
panic("Invalid record");
|
||
|
||
found_bbu = true;
|
||
}
|
||
|
||
if (string_is_digits(rec[0]) && (rec[0].size() == 1))
|
||
{
|
||
int val = atoi(rec[0].c_str());
|
||
if ((val < 0) || (val > 5))
|
||
panic("Invalid rating");
|
||
rating = val;
|
||
|
||
rec.erase(rec.begin());
|
||
|
||
if (!rec.size())
|
||
panic("Invalid record");
|
||
}
|
||
|
||
if (!found_bbu && string_is_digits(rec[0]) && (rec[0].size() >= 2))
|
||
{
|
||
int val = atoi(rec[0].c_str());
|
||
if ((val >= 12) && (val <= 12607))
|
||
{
|
||
bb = rec[0];
|
||
|
||
rec.erase(rec.begin());
|
||
|
||
if (!rec.size())
|
||
panic("Invalid record");
|
||
}
|
||
else
|
||
{
|
||
panic("Bad BB #");
|
||
}
|
||
}
|
||
|
||
json js;
|
||
js["date"] = begin_date.get_string();
|
||
|
||
if (end_date.is_valid())
|
||
js["end_date"] = end_date.get_string();
|
||
|
||
if (alt_date.is_valid())
|
||
js["alt_date"] = alt_date.get_string();
|
||
|
||
js["ref"] = json::array();
|
||
if (ref.size())
|
||
js["ref"].push_back("NICAP: " + ref);
|
||
js["ref"].push_back("[NICAP Listing by Date PDF](http://www.nicap.org/NSID/NSID_DBListingbyDate.pdf)");
|
||
|
||
if (state_or_country.size())
|
||
js["location"] = city + ", " + state_or_country;
|
||
else
|
||
js["location"] = city;
|
||
|
||
std::string desc;
|
||
for (const std::string& s : rec)
|
||
desc = combine_strings(desc, s);
|
||
|
||
assert(cat_index >= 1 && cat_index <= 11);
|
||
|
||
const std::string orig_desc(desc);
|
||
|
||
desc = desc + string_format(" (NICAP: %s", g_nicap_categories[cat_index - 1]);
|
||
if (code.size())
|
||
desc = desc + string_format(", Code: %s", code.c_str());
|
||
if (rating != -1)
|
||
desc = desc + string_format(", Rating: %i", rating);
|
||
if (bb.size())
|
||
desc = desc + string_format(", BB: %s", bb.c_str());
|
||
if (nc_flag)
|
||
desc = desc + string_format(", NC");
|
||
if (lc_flag)
|
||
desc = desc + string_format(", LC");
|
||
desc += ")";
|
||
|
||
if ((desc[0] >= 0) && islower((uint8_t)desc[0]))
|
||
desc[0] = (char)toupper((uint8_t)desc[0]);
|
||
|
||
js["desc"] = desc;
|
||
js["source"] = "NICAP_DB";
|
||
js["source_id"] = "NICAP_DB" + string_format("_%u", record_index);
|
||
|
||
if ((prev_orig_desc.size()) && (orig_desc == prev_orig_desc) && (js["date"] == prev_date))
|
||
{
|
||
// It's a repeated record, with just a different category.
|
||
std::string new_desc(js_doc_array.back()["desc"]);
|
||
|
||
new_desc += string_format(" (NICAP: %s)", g_nicap_categories[cat_index - 1]);
|
||
|
||
js_doc_array.back()["desc"] = new_desc;
|
||
|
||
num_repeated_recs++;
|
||
}
|
||
else
|
||
{
|
||
prev_date = js["date"];
|
||
prev_orig_desc = orig_desc;
|
||
|
||
js_doc_array.push_back(js);
|
||
}
|
||
|
||
#if 0
|
||
uprintf("Record: %u\n", record_index);
|
||
|
||
uprintf("Date string: %s\n", date.c_str());
|
||
|
||
uprintf("Date: %s\n", begin_date.get_string().c_str());
|
||
|
||
if (end_date.is_valid())
|
||
uprintf("End date: %s\n", end_date.get_string().c_str());
|
||
|
||
if (alt_date.is_valid())
|
||
uprintf("Alt date: %s\n", alt_date.get_string().c_str());
|
||
|
||
if (ref.size())
|
||
uprintf("Ref: %s\n", ref.c_str());
|
||
|
||
uprintf("City: %s\n", city.c_str());
|
||
uprintf("State or County: %s\n", state_or_country.c_str());
|
||
|
||
uprintf("Category: %i\n", cat_index);
|
||
|
||
if (code.size())
|
||
uprintf("NICAP code: %s\n", code.c_str());
|
||
if (rating != -1)
|
||
uprintf("NICAP Rating: %u\n", rating);
|
||
if (bb.size())
|
||
uprintf("NICAP BB: %s\n", bb.c_str());
|
||
if (nc_flag || lc_flag)
|
||
uprintf("NICAP nc_flag: %u lc_flag: %u\n", nc_flag, lc_flag);
|
||
|
||
uprintf("Record:\n");
|
||
for (const std::string& s : rec)
|
||
uprintf("%s\n", s.c_str());
|
||
|
||
uprintf("\n");
|
||
#endif
|
||
|
||
record_index++;
|
||
}
|
||
|
||
if (!serialize_to_json_file("nicap_db.json", js_doc, true))
|
||
panic("Failed serializing JSON file");
|
||
|
||
uprintf("Processed %u records, skipping %u repeated records\n", record_index, num_repeated_recs);
|
||
|
||
return true;
|
||
}
|
||
|
||
void converters_init()
{
    converters_test();
}

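// Converts the per-country nuclear test listings (nuktest_*.txt) into the
// "Nuclear Test Timeline" JSON file nuclear_tests.json, one event per test with
// location, yield, type and source metadata.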
bool convert_nuk()
|
||
{
|
||
std::string title;
|
||
string_vec col_titles;
|
||
|
||
std::vector<string_vec> rows;
|
||
|
||
bool success = load_column_text("nuktest_usa.txt", rows, title, col_titles, false, "USA");
|
||
success = success && load_column_text("nuktest_ussr.txt", rows, title, col_titles, false, "USSR");
|
||
success = success && load_column_text("nuktest_britain.txt", rows, title, col_titles, false, "Britain");
|
||
success = success && load_column_text("nuktest_china.txt", rows, title, col_titles, false, "China");
|
||
success = success && load_column_text("nuktest_france.txt", rows, title, col_titles, false, "France");
|
||
success = success && load_column_text("nuktest_india.txt", rows, title, col_titles, false, "India");
|
||
success = success && load_column_text("nuktest_pakistan.txt", rows, title, col_titles, false, "Pakistan");
|
||
success = success && load_column_text("nuktest_unknown.txt", rows, title, col_titles, false, "Unknown");
|
||
if (!success)
|
||
panic("Failed loading nuk column text file");
|
||
|
||
enum { cColDate, cColTime, cColLat, cColLong, cColDepth, cColMb, cColY, cColYMax, cColType, cColName, cColSource, cColCountry, cColTotal };
|
||
|
||
ufo_timeline timeline;
|
||
|
||
uint32_t event_id = 0;
|
||
for (const string_vec& x : rows)
|
||
{
|
||
if (x.size() != cColTotal)
|
||
panic("Invalid # of cols");
|
||
|
||
timeline_event event;
|
||
if (!event.m_begin_date.parse(x[cColDate].c_str(), true))
|
||
panic("Failed parsing date");
|
||
|
||
event.m_date_str = event.m_begin_date.get_string();
|
||
|
||
if (x[cColTime] != "00:00:00.0")
|
||
event.m_time_str = x[cColTime];
|
||
|
||
event.m_type.push_back("atomic");
|
||
event.m_locations.push_back(x[cColLat] + " " + x[cColLong]);
|
||
|
||
std::string attr;
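        // Decode the test type column: either a bare A/W/U (air/water/underground), or an
        // "N" code whose 2nd letter is the medium and 3rd letter C/P (confirmed/presumed),
        // optionally followed by a country letter and an "_SALVO" suffix.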
std::string t(string_upper(x[cColType]));
|
||
|
||
bool salvo = false;
|
||
if (string_ends_in(t, "_SALVO"))
|
||
{
|
||
salvo = true;
|
||
t = string_slice(t, 0, t.size() - 6);
|
||
}
|
||
|
||
if (t.size() == 1)
|
||
{
|
||
if (t[0] == 'A')
|
||
attr += "Air";
|
||
else if (t[0] == 'W')
|
||
attr += "Water";
|
||
else if (t[0] == 'U')
|
||
attr += "Underground";
|
||
else
|
||
panic("Unknown type");
|
||
}
|
||
else if ((t.size() == 4) || (t.size() == 5))
|
||
{
|
||
if (t.size() == 5)
|
||
{
|
||
if (t.back() != 'G')
|
||
panic("Invalid type");
|
||
}
|
||
else
|
||
{
|
||
//US(U), USSR(S), France(F), China(C), India(I), United Kingdom(G), or Pakistan(P)
|
||
|
||
const char c = t.back();
|
||
if ((c != 'U') && (c != 'S') && (c != 'F') && (c != 'C') && (c != 'I') && (c != 'G') && (c != 'P'))
|
||
panic("Invalid type");
|
||
}
|
||
|
||
if (t[0] != 'N')
|
||
panic("Not nuclear");
|
||
|
||
if (t[1] == 'A')
|
||
attr += "Air";
|
||
else if (t[1] == 'W')
|
||
attr += "Water";
|
||
else if (t[1] == 'U')
|
||
attr += "Underground";
|
||
else
|
||
panic("Unknown type");
|
||
|
||
if (t[2] == 'C')
|
||
attr += ", confirmed";
|
||
else if (t[2] == 'P')
|
||
attr += ", presumed";
|
||
else
|
||
panic("Unknown type");
|
||
|
||
if (salvo)
|
||
attr += ", salvo";
|
||
}
|
||
else
|
||
panic("Invalid type");
|
||
|
||
event.m_desc = string_format("Nuclear test: %s. Country: %s", attr.c_str(), x[cColCountry].c_str());
|
||
|
||
if ((x[cColName].size()) && (x[cColName] != "-"))
|
||
event.m_desc += string_format(u8" Name: <20>%s<>", x[cColName].c_str());
|
||
|
||
if (x[cColY].size())
|
||
event.m_desc += string_format(" Yield: %sKT", x[cColY].c_str());
|
||
|
||
if (x[cColYMax].size())
|
||
event.m_desc += string_format(" YieldMax: %sKT", x[cColYMax].c_str());
|
||
|
||
if (x[cColLat].size() && x[cColLong].size())
|
||
{
|
||
event.m_key_value_data.push_back(string_pair("LatLong", x[cColLat] + " " + x[cColLong]));
|
||
|
||
const double lat = atof(x[cColLat].c_str());
|
||
const double lon = atof(x[cColLong].c_str());
|
||
|
||
std::string latitude_dms = get_deg_to_dms(lat) + ((lat <= 0) ? " S" : " N");
|
||
std::string longitude_dms = get_deg_to_dms(lon) + ((lon <= 0) ? " W" : " E");
|
||
|
||
event.m_key_value_data.push_back(string_pair("LatLongDMS", latitude_dms + " " + longitude_dms));
|
||
}
|
||
|
||
if (x[cColDepth].size())
|
||
event.m_key_value_data.push_back(string_pair("NukeDepth", x[cColDepth]));
|
||
|
||
if (x[cColMb].size())
|
||
event.m_key_value_data.push_back(string_pair("NukeMb", x[cColMb]));
|
||
|
||
if (x[cColY].size())
|
||
event.m_key_value_data.push_back(string_pair("NukeY", x[cColY]));
|
||
|
||
if (x[cColYMax].size())
|
||
event.m_key_value_data.push_back(string_pair("NukeYMax", x[cColYMax]));
|
||
|
||
if (x[cColType].size())
|
||
event.m_key_value_data.push_back(string_pair("NukeType", x[cColType]));
|
||
|
||
if (x[cColName] != "-")
|
||
event.m_key_value_data.push_back(string_pair("NukeName", x[cColName]));
|
||
|
||
event.m_key_value_data.push_back(string_pair("NukeSource", x[cColSource]));
|
||
event.m_key_value_data.push_back(string_pair("NukeCountry", x[cColCountry]));
|
||
|
||
if (x[cColLat].size() && x[cColLong].size())
|
||
{
|
||
event.m_key_value_data.push_back(std::make_pair("LocationLink", string_format("[Google Maps](https://www.google.com/maps/place/%s,%s)", x[cColLat].c_str(), x[cColLong].c_str())));
|
||
}
|
||
|
||
event.m_refs.push_back("[\"columbia.edu: Worldwide Nuclear Explositions\", by Yang, North, Romney, and Richards](https://www.ldeo.columbia.edu/~richards/my_papers/WW_nuclear_tests_IASPEI_HB.pdf)");
|
||
event.m_refs.push_back("[\"archive.org: Worldwide Nuclear Explositions\", by Yang, North, Romney, and Richards](https://web.archive.org/web/20220407121213/https://www.ldeo.columbia.edu/~richards/my_papers/WW_nuclear_tests_IASPEI_HB.pdf)");
|
||
|
||
event.m_source = "NukeExplosions";
|
||
event.m_source_id = event.m_source + string_format("_%u", event_id);
|
||
|
||
timeline.get_events().push_back(event);
|
||
|
||
event_id++;
|
||
}
|
||
|
||
if (!timeline.get_events().size())
|
||
panic("Empty timeline)");
|
||
|
||
timeline.set_name("Nuclear Test Timeline");
|
||
|
||
return timeline.write_file("nuclear_tests.json", true);
|
||
}