ufo_data/converters.cpp
Richard Geldreich 650909370f new files
2023-02-20 17:59:08 -05:00

3169 lines
100 KiB
C++
Raw Blame History

// converters.cpp
// Copyright (C) 2023 Richard Geldreich, Jr.
#include "ufojson_core.h"
#include "markdown_proc.h"
#define USE_OPENAI (0)
// Escapes a string so it can be placed between double quotes in a JSON document.
//
// '"' and '\\' are backslash-escaped, the common control characters get their short
// escapes (\n, \r, \t, \b, \f), and every other byte below 0x20 is written as \u00XX,
// because JSON does not allow raw control characters inside strings.
// Bytes >= 0x20 (including multi-byte UTF-8 sequences) are copied through unchanged.
static std::string escape_string_for_json(const std::string& str)
{
    static const char s_hex_digits[] = "0123456789abcdef";

    std::string new_str;
    new_str.reserve(str.size());

    for (const char ch : str)
    {
        // Work on the unsigned value so bytes >= 0x80 are never mistaken for control chars.
        const unsigned char c = static_cast<unsigned char>(ch);

        switch (c)
        {
        case '"':  new_str += "\\\""; break;
        case '\\': new_str += "\\\\"; break;
        case '\n': new_str += "\\n"; break;
        case '\r': new_str += "\\r"; break;
        case '\t': new_str += "\\t"; break;
        case '\b': new_str += "\\b"; break;
        case '\f': new_str += "\\f"; break;
        default:
            if (c < 0x20)
            {
                new_str += "\\u00";
                new_str.push_back(s_hex_digits[c >> 4]);
                new_str.push_back(s_hex_digits[c & 0xF]);
            }
            else
            {
                new_str.push_back(ch);
            }
            break;
        }
    }

    return new_str;
}
// Converts the fixed-column Magnonia text catalog into a JSON timeline file.
//
// Input layout, as parsed below: every record starts with a line holding its sequential
// index (starting at 1). The following lines use the first TOTAL_COLS columns for the date
// (first line) and optionally the time (second line); the rest of each line is description
// text. The description starts with the location, terminated by the first '.', and may end
// with a parenthesized reference.
//
// pSrc_filename:    text file to read.
// pDst_filename:    JSON file to write.
// pSource_override: if non-null, replaces the default timeline name and "source" value.
// pRef_override:    if non-null, appended to each record's reference instead of the default Vallee link.
//
// Returns true on success. I/O and parse failures are reported through panic().
bool convert_magnonia(const char* pSrc_filename, const char* pDst_filename, const char* pSource_override, const char* pRef_override)
{
    string_vec lines;
    if (!read_text_file(pSrc_filename, lines))
        panic("Can't open file %s", pSrc_filename);

    FILE* pOut_file = ufopen(pDst_filename, "w");
    if (!pOut_file)
        panic("Can't open file %s", pDst_filename);

    fprintf(pOut_file, "{\n");
    fprintf(pOut_file, "\"%s Timeline\" : [\n", pSource_override ? pSource_override : "Magnonia");

    //const uint32_t TOTAL_RECS = 923;
    const uint32_t TOTAL_COLS = 15;

    uint32_t cur_line = 0;
    uint32_t rec_index = 1;

    while (cur_line < lines.size())
    {
        // Record header: the record's index, which must match our running counter.
        if (!lines[cur_line].size())
            panic("Line %u is empty", cur_line);

        int index = atoi(lines[cur_line++].c_str());
        if (index != (int)rec_index)
            panic("Unexpected index");

        if (cur_line == lines.size())
            panic("Out of lines");

        // The date lives in the first TOTAL_COLS columns of the next line.
        std::string first_line(lines[cur_line++]);

        std::string date_str(first_line);
        if (date_str.size() > TOTAL_COLS)
            date_str.resize(TOTAL_COLS);
        string_trim(date_str);

        // A date line with nothing past the date columns: the description starts on the next line.
        if (first_line.size() < (TOTAL_COLS + 1))
        {
            if (cur_line == lines.size())
                panic("Out of lines");

            first_line = lines[cur_line++];
            if (first_line.size() < (TOTAL_COLS + 1))
                panic("Line too small");
        }

        string_vec desc_lines;
        first_line.erase(0, TOTAL_COLS);
        string_trim(first_line);
        desc_lines.push_back(first_line);

        std::string time_str;

        // Gather continuation lines. A line shorter than TOTAL_COLS (e.g. the next record's
        // index line) ends the record.
        for (; ; )
        {
            if (cur_line == lines.size())
                break;
            if (lines[cur_line].size() < TOTAL_COLS)
                break;

            std::string buf(lines[cur_line]);

            // Only the second line of a record carries the time in its leading columns.
            // The length check above guarantees at least TOTAL_COLS characters here.
            if (desc_lines.size() == 1)
            {
                time_str = buf;
                time_str.resize(TOTAL_COLS);
                string_trim(time_str);

                buf.erase(0, TOTAL_COLS);
            }

            string_trim(buf);
            desc_lines.push_back(buf);
            cur_line++;
        }

        // Join the description lines, undoing end-of-line hyphenation.
        std::string desc;
        for (uint32_t j = 0; j < desc_lines.size(); j++)
        {
            if (desc.size() && desc.back() == '-')
            {
                // Don't trim '-' char if the previous char is a digit. This is probably imperfect.
                if (!((desc.size() >= 2) && (isdigit((uint8_t)desc[desc.size() - 2]))))
                    desc.resize(desc.size() - 1);
            }
            else if (desc.size())
            {
                desc += " ";
            }

            desc += desc_lines[j];
        }

        // The location is everything up to the first '.'.
        std::string location;
        size_t n = desc.find_first_of('.');
        if (n == std::string::npos)
            panic("Can't find . char");

        location = desc;
        location.resize(n);
        string_trim(location);

        // Rewrite "Place (State)" as "Place, State".
        size_t f = location.find_first_of('(');
        size_t e = location.find_last_of(')');
        if ((f != std::string::npos) && (e == location.size() - 1))
        {
            std::string state(location);
            state.erase(0, f + 1);
            if (state.size())
                state.resize(state.size() - 1);
            string_trim(state);

            location.erase(f, location.size() - f);
            string_trim(location);

            location += ", ";
            location += state;
        }

        desc.erase(0, n + 1);
        string_trim(desc);

        // A trailing "(...)" at the very end of the description is treated as the reference.
        std::string ref;
        f = desc.find_last_of('(');
        e = desc.find_last_of(')');
        if ((f != std::string::npos) && (e != std::string::npos))
        {
            if ((f < e) && (e == desc.size() - 1))
            {
                ref = desc.c_str() + f + 1;
                if (ref.size())
                    ref.pop_back();
                string_trim(ref);

                desc.resize(f);
                string_trim(desc);
            }
        }

        // Parse the date column: [End |Early ][Month [day],] year  or  [Season,] year.
        int year = -1, month = -1, day = -1;
        date_prefix_t date_prefix = cNoPrefix;

        std::string temp_date_str(date_str);

        if (string_begins_with(temp_date_str, "End "))
        {
            date_prefix = cEndOf;
            temp_date_str.erase(0, strlen("End "));
            string_trim(temp_date_str);
        }
        else if (string_begins_with(temp_date_str, "Early "))
        {
            date_prefix = cEarly;
            temp_date_str.erase(0, strlen("Early "));
            string_trim(temp_date_str);
        }

        uint32_t m;
        for (m = 0; m < 12; m++)
            if (string_begins_with(temp_date_str, g_months[m]))
                break;

        if (m != 12)
        {
            month = m + 1;

            temp_date_str.erase(0, strlen(g_months[m]));
            string_trim(temp_date_str);

            if (!temp_date_str.size())
                panic("Failed parsing date");

            f = temp_date_str.find_first_of(',');
            if (f != std::string::npos)
            {
                if (f == 0)
                {
                    // "Month, year": no day.
                    temp_date_str.erase(0, 1);
                    string_trim(temp_date_str);
                }
                else
                {
                    // "Month day, year".
                    day = atoi(temp_date_str.c_str());

                    temp_date_str.erase(0, f + 1);
                    string_trim(temp_date_str);
                }
            }
        }
        else
        {
            // The Magnonia data doesn't use the full range of prefixes we support.
            if (string_begins_with(temp_date_str, "Summer,"))
            {
                date_prefix = cSummer;
                temp_date_str.erase(0, strlen("Summer,"));
                string_trim(temp_date_str);
            }
            else if (string_begins_with(temp_date_str, "Spring,"))
            {
                date_prefix = cSpring;
                temp_date_str.erase(0, strlen("Spring,"));
                string_trim(temp_date_str);
            }
            else if (string_begins_with(temp_date_str, "Fall,"))
            {
                date_prefix = cFall;
                temp_date_str.erase(0, strlen("Fall,"));
                string_trim(temp_date_str);
            }
            else if (string_begins_with(temp_date_str, "End "))
            {
                date_prefix = cEndOf;
                temp_date_str.erase(0, strlen("End "));
                string_trim(temp_date_str);
            }
            else if (string_begins_with(temp_date_str, "Early "))
            {
                date_prefix = cEarly;
                temp_date_str.erase(0, strlen("Early "));
                string_trim(temp_date_str);
            }
        }

        // Whatever is left must start with the year. The cast keeps isdigit() defined for
        // bytes >= 0x80; operator[] on an empty string yields '\0', which also fails the test.
        if (!isdigit((uint8_t)temp_date_str[0]))
            panic("Failed parsing date");

        year = atoi(temp_date_str.c_str());
        if ((year <= 100) || (year > 2050))
            panic("Failed parsing date");

        // Append the source attribution, separated by a space only if the record has its own reference.
        if (ref.size())
            ref += " ";

        if (pRef_override)
            ref += pRef_override;
        else
            ref += "([Vallee](https://web.archive.org/web/20120415100852/http://www.ufoinfo.com/magonia/magonia.shtml))";

        if (rec_index > 1)
            fprintf(pOut_file, ",\n");

        fprintf(pOut_file, "{\n");

        fprintf(pOut_file, " \"date\" : \"");

        // Relies on cNoPrefix being negative.
        if (date_prefix >= 0)
            fprintf(pOut_file, "%s ", g_date_prefix_strings[date_prefix]);

        if (month == -1)
            fprintf(pOut_file, "%i", year);
        else if (day == -1)
            fprintf(pOut_file, "%i/%i", month, year);
        else
            fprintf(pOut_file, "%i/%i/%i", month, day, year);

        fprintf(pOut_file, "\",\n");

        fprintf(pOut_file, " \"desc\": \"%s\",\n", escape_string_for_json(desc).c_str());

        if (location.size())
            fprintf(pOut_file, " \"location\" : \"%s\",\n", escape_string_for_json(location).c_str());

        if (time_str.size())
            fprintf(pOut_file, " \"time\" : \"%s\",\n", escape_string_for_json(time_str).c_str());

        if (ref.size())
            fprintf(pOut_file, " \"ref\": \"%s\",\n", escape_string_for_json(ref).c_str());

        fprintf(pOut_file, " \"source_id\" : \"Magnonia_%u\",\n", rec_index);

        // The e-acute in "Vallee" is spelled as explicit UTF-8 bytes so the output does not depend
        // on the source file's encoding. The literal is split so the hex escape cannot absorb the 'e'.
        fprintf(pOut_file, " \"source\" : \"%s\",\n", pSource_override ? pSource_override : "Vall\xC3\xA9" "eMagnonia");

        fprintf(pOut_file, " \"type\" : \"ufo sighting\"\n");

        fprintf(pOut_file, "}");

        rec_index++;
    }

    fprintf(pOut_file, "\n] }\n");
    fclose(pOut_file);

    return true;
}
// Sends a prompt to the external openai.exe helper and collects its reply.
//
// The prompt is written to "i.txt", "openai.exe i.txt o.txt" is run through system(),
// and the reply is read back from "o.txt". Blank lines at the start of the reply are
// skipped; the remaining lines are concatenated with no separator between them.
//
// prompt: text to send.
// reply:  receives the reply text; cleared on entry.
//
// Returns false if the prompt file can't be written, the helper doesn't exit with
// EXIT_SUCCESS, or the output file can't be read.
static bool invoke_openai(const std::string& prompt, std::string& reply)
{
    reply.clear();

    // Write prompt to i.txt
    FILE* pFile = ufopen("i.txt", "wb");
    if (!pFile)
        return false;

    // fwrite() with a zero-sized element returns 0, so an empty prompt is not treated as a failure.
    const bool write_ok = prompt.empty() || (fwrite(prompt.c_str(), prompt.size(), 1, pFile) == 1);
    const bool close_ok = (fclose(pFile) == 0);
    if (!write_ok || !close_ok)
        return false;

    // Invoke openai.exe
    int status = system("openai.exe i.txt o.txt");
    if (status != EXIT_SUCCESS)
        return false;

    // Read output file.
    string_vec lines;
    if (!read_text_file("o.txt", lines))
    {
        // Wait a bit and try again, rarely needed under Windows.
        Sleep(50);

        if (!read_text_file("o.txt", lines))
            return false;
    }

    // Skip any blank lines at the beginning of the reply.
    uint32_t i;
    for (i = 0; i < lines.size(); i++)
    {
        std::string s(lines[i]);
        string_trim(s);
        if (s.size())
            break;
    }

    for (; i < lines.size(); i++)
        reply += lines[i];

    return true;
}
// Converts "bb_unknowns.txt" into "bb_unknowns.json" (UTF-8 with BOM).
//
// Records in the input are separated by blank lines. Within a record the lines are joined
// (a line ending in '-' is joined without a space), then split as:
//   date ; location (ended by ';' or '.') time . description
//
// Returns true on success. I/O and parse failures are reported through panic().
bool convert_bluebook_unknowns()
{
    string_vec lines;
    if (!read_text_file("bb_unknowns.txt", lines))
        panic("Can't read file bb_unknowns.txt");

    uint32_t cur_line = 0;
    uint32_t total_recs = 0;

    FILE* pOut_file = ufopen("bb_unknowns.json", "w");
    if (!pOut_file)
        panic("Can't open output file bb_unknowns.json");

    fputc(UTF8_BOM0, pOut_file);
    fputc(UTF8_BOM1, pOut_file);
    fputc(UTF8_BOM2, pOut_file);

    fprintf(pOut_file, "{\n");
    fprintf(pOut_file, "\"BlueBookUnknowns Timeline\" : [\n");

    while (cur_line < lines.size())
    {
        // Accumulate one record: all lines up to the next blank line.
        std::string rec;
        while (cur_line < lines.size())
        {
            std::string l(lines[cur_line]);
            cur_line++;

            string_trim(l);
            if (!l.size())
                break;

            if (rec.size() && rec.back() == '-')
                rec += l;
            else
            {
                rec += ' ';
                rec += l;
            }
        }

        // Consecutive blank lines produce empty records; ignore them.
        if (!rec.size())
            continue;

        //printf("%u. %s\n", total_recs + 1, rec.c_str());

        // Date: everything before the first ';'.
        size_t semi_ofs = rec.find_first_of(';');
        if (semi_ofs == std::string::npos)
            panic("Unable to find initial ; char");

        std::string date_str(rec);
        date_str.resize(semi_ofs);
        string_trim(date_str);

        rec.erase(0, semi_ofs + 1);
        string_trim(rec);

        // Location: ends at the next ';', or failing that at the first '.'.
        size_t period_ofs = rec.find_first_of(';');
        if (period_ofs == std::string::npos)
        {
            period_ofs = rec.find_first_of('.');
            if (period_ofs == std::string::npos)
                panic("Unable to find . char");

            // A '.' that closes "Ft." or "Mt." is part of the place name; use the next '.' instead.
            if (((period_ofs >= 2) && (rec[period_ofs - 1] == 't') && (rec[period_ofs - 2] == 'F')) ||
                ((period_ofs >= 2) && (rec[period_ofs - 1] == 't') && (rec[period_ofs - 2] == 'M')))
            {
                period_ofs = rec.find('.', period_ofs + 1);
                if (period_ofs == std::string::npos)
                    panic("Unable to find . char");
            }
        }

        std::string location_str(rec);
        location_str.resize(period_ofs);
        string_trim(location_str);

        rec.erase(0, period_ofs + 1);
        string_trim(rec);

        // Time: up to the next '.'. What remains in rec is the description.
        size_t time_period_ofs = rec.find_first_of('.');
        if (time_period_ofs == std::string::npos)
            panic("Unable to find . char terminating the time");

        std::string time_str(rec);
        time_str.resize(time_period_ofs);
        string_trim(time_str);

        rec.erase(0, time_period_ofs + 1);
        string_trim(rec);

        //printf("Rec: %u\n", total_recs + 1);
        //printf("Location: %s\n", location_str.c_str());
        //printf("Time: %s\n", time_str.c_str());
        //printf("Desc: %s\n", rec.c_str());

        std::string json_date;

        if ((string_begins_with(date_str, "Spring")) || (string_begins_with(date_str, "Summer")))
        {
            // Seasonal dates are passed through as-is.
            json_date = date_str;
        }
        else
        {
            size_t date_space = date_str.find_first_of(' ');
            if (date_space == std::string::npos)
                panic("Unable to find space char");

            size_t date_comma = date_str.find_first_of(',');

            // Zero-based month index.
            int month;
            if (string_begins_with(date_str, "Sept."))
                month = 8;
            else
            {
                for (month = 0; month < 12; month++)
                {
                    if (string_begins_with(date_str, g_months[month]))
                        break;
                    if (string_begins_with(date_str, g_full_months[month]))
                        break;
                }

                if (month == 12)
                    panic("Failed finding month");
            }

            char buf[256];

            if (date_comma == std::string::npos)
            {
                // "Month year"
                const int year = atoi(date_str.c_str() + date_space + 1);
                if ((year < 1900) || (year > 1969))
                    panic("Invalid year");

                sprintf_s(buf, "%i/%i", month + 1, year);
            }
            else
            {
                // "Month day, year"
                const int day = atoi(date_str.c_str() + date_space + 1);
                if ((day < 1) || (day > 31))
                    panic("Invalid day");

                const int year = atoi(date_str.c_str() + date_comma + 1);
                if ((year < 1900) || (year > 1969))
                    panic("Invalid year");

                sprintf_s(buf, "%i/%i/%i", month + 1, day, year);
            }

            json_date = buf;
        }

        //printf("JSON Date: %s\n", date_str.c_str());
        //printf("Date: %s\n", json_date.c_str());

        // Emit the separator before every record but the first, so trailing blank lines in the
        // input can never leave a dangling comma after the last record.
        if (total_recs)
            fprintf(pOut_file, ",\n");

        fprintf(pOut_file, "{\n");
        fprintf(pOut_file, " \"date\" : \"%s\",\n", json_date.c_str());
        fprintf(pOut_file, " \"time\" : \"%s\",\n", escape_string_for_json(time_str).c_str());
        fprintf(pOut_file, " \"location\" : \"%s\",\n", escape_string_for_json(location_str).c_str());
        fprintf(pOut_file, " \"desc\" : \"%s\",\n", escape_string_for_json(rec).c_str());
        fprintf(pOut_file, " \"source_id\" : \"BerlinerBBU_%u\",\n", total_recs);
        fprintf(pOut_file, " \"source\" : \"BerlinerBBUnknowns\",\n");
        fprintf(pOut_file, " \"ref\" : \"[BlueBook Unknowns PDF](https://github.com/richgel999/uap_resources/blob/main/bluebook_uncensored_unknowns_don_berliner.pdf)\",\n");
        fprintf(pOut_file, " \"type\" : \"ufo sighting\"\n");
        fprintf(pOut_file, "}");

        total_recs++;
    }

    if (total_recs)
        fprintf(pOut_file, "\n");

    fprintf(pOut_file, "] }\n");
    fclose(pOut_file);

    return true;
}
static std::string convert_hall_to_json_date(const std::string& date_str)
{
std::string json_date;
if ((string_begins_with(date_str, "Spring")) ||
(string_begins_with(date_str, "Summer")) ||
(string_begins_with(date_str, "Late")) ||
(string_begins_with(date_str, "Early")))
{
json_date = date_str;
}
else
{
size_t date_space = date_str.find_first_of(' ');
if (date_space == std::string::npos)
panic("Unable to find space char");
size_t date_comma = date_str.find_first_of(',');
int month;
if (string_begins_with(date_str, "Sept."))
month = 8;
else
{
for (month = 0; month < 12; month++)
{
if (string_begins_with(date_str, g_months[month]))
break;
if (string_begins_with(date_str, g_full_months[month]))
break;
}
if (month == 12)
panic("Failed finding month");
}
char buf[256];
int day, year;
if (date_comma == std::string::npos)
{
year = atoi(date_str.c_str() + date_space + 1);
if ((year < 1900) || (year > 2000))
panic("Invalid year");
sprintf_s(buf, "%u/%u", month + 1, year);
}
else
{
day = atoi(date_str.c_str() + date_space + 1);
if ((day < 1) || (day > 31))
panic("Invalid day");
year = atoi(date_str.c_str() + date_comma + 1);
if ((year < 1900) || (year > 2000))
panic("Invalid year");
sprintf_s(buf, "%u/%u/%u", month + 1, day, year);
}
json_date = buf;
}
return json_date;
}
// Converts "ufo_evidence_hall.txt" into "ufo_evidence_hall.json" (UTF-8 with BOM).
//
// Each record starts on a line of the form "date; location; description". Following lines
// that contain no ';' are continuation lines and are appended to the record (a line ending
// in '-' is joined without a space). A description starting with "to <date>." yields an
// end_date, and a trailing "(Section ...)" / "(Sections ...)" note is stripped.
// At most MAX_RECS records are written.
//
// Returns true on success. I/O and parse failures are reported through panic().
bool convert_hall()
{
    string_vec lines;
    if (!read_text_file("ufo_evidence_hall.txt", lines))
        panic("Can't read file ufo_evidence_hall.txt");

    uint32_t cur_line = 0;
    uint32_t total_recs = 0;

    FILE* pOut_file = ufopen("ufo_evidence_hall.json", "w");
    if (!pOut_file)
        panic("Can't open output file ufo_evidence_hall.json");

    fputc(UTF8_BOM0, pOut_file);
    fputc(UTF8_BOM1, pOut_file);
    fputc(UTF8_BOM2, pOut_file);

    fprintf(pOut_file, "{\n");
    fprintf(pOut_file, "\"UFOEvidenceHall Timeline\" : [\n");

    const uint32_t MAX_RECS = 2000;

    while (cur_line < lines.size())
    {
        std::string rec(lines[cur_line]);
        cur_line++;

        string_trim(rec);
        if (rec.empty())
            panic("Encountered empty line");

        // The record's first line must hold both separators: "date; location; ...".
        size_t first_semi_ofs = rec.find_first_of(';');
        if (first_semi_ofs == std::string::npos)
            panic("Can't find first semi");

        // Search past the first ';' so this really validates the second one.
        size_t second_semi_ofs = rec.find_first_of(';', first_semi_ofs + 1);
        if (second_semi_ofs == std::string::npos)
            panic("Can't find second semi");

        // Append continuation lines; a line containing ';' starts the next record.
        while (cur_line < lines.size())
        {
            std::string l(lines[cur_line]);
            string_trim(l);

            if (!l.size())
                panic("Encountered empty line");

            size_t semi_ofs = l.find_first_of(';');
            if (semi_ofs != std::string::npos)
                break;

            cur_line++;

            if (rec.size() && rec.back() == '-')
                rec += l;
            else
            {
                rec += ' ';
                rec += l;
            }
        }

        if (rec.size())
        {
            //printf("%u. %s\n", total_recs + 1, rec.c_str());

            std::string date_str(rec);
            date_str.resize(first_semi_ofs);
            string_trim(date_str);

            rec.erase(0, first_semi_ofs + 1);
            string_trim(rec);

            // Offsets changed after the erase/trim above, so locate the second separator again.
            second_semi_ofs = rec.find_first_of(';');
            if (second_semi_ofs == std::string::npos)
                panic("Can't find second semi");

            std::string location_str(rec);
            location_str.resize(second_semi_ofs);
            string_trim(location_str);

            if (location_str.size() && location_str.back() == '.')
                location_str.pop_back();

            rec.erase(0, second_semi_ofs + 1);
            string_trim(rec);

            //uprintf("Rec: %u\n", total_recs + 1);
            //uprintf("Date: %s\n", date_str.c_str());
            //uprintf("Location: %s\n", location_str.c_str());
            //uprintf("Desc: %s\n", rec.c_str());

            std::string json_date(convert_hall_to_json_date(date_str));
            std::string json_end_date;

            //uprintf("JSON Date: %s\n", date_str.c_str());
            //uprintf("Date: %s\n", json_date.c_str());

            // "to <date>. ..." at the start of the description gives the end of a date range.
            if (string_begins_with(rec, "to "))
            {
                size_t dot_ofs = rec.find_first_of('.');
                if (dot_ofs == std::string::npos)
                    panic("Invalid to date");

                std::string to_date(rec);
                to_date.resize(dot_ofs);
                to_date.erase(0, 3);
                string_trim(to_date);

                rec.erase(0, dot_ofs + 1);
                string_trim(rec);

                json_end_date = convert_hall_to_json_date(to_date);
            }

            // Drop a trailing cross-reference such as "(Section IX)".
            size_t k = rec.find_last_of('(');
            if (k != std::string::npos)
            {
                if ((string_begins_with(rec.c_str() + k, "(Section ")) ||
                    (string_begins_with(rec.c_str() + k, "(section ")) ||
                    (string_begins_with(rec.c_str() + k, "(Sections ")) ||
                    (string_begins_with(rec.c_str() + k, "(sections ")))
                {
                    rec.erase(k);
                    string_trim(rec);
                }
            }

#if USE_OPENAI
            std::string prompt_str("Best categorize the following quoted text into one or more of these categories as a json array: sighting, landing, natural phenomenom, newspaper article, report or memo, official, abduction, medical, occupant or alien or creature encounter, or historical event: \"");
            prompt_str += rec;
            prompt_str += "\"";

            std::string type_str;
            bool status = invoke_openai(prompt_str, type_str);
            if (!status)
                panic("invoke_openai failed!\n");

            for (size_t i = 0; i < type_str.size(); i++)
            {
                uint8_t c = type_str[i];
                if ((c >= 32) && (c <= 127))
                    type_str[i] = (char)tolower(c);
            }
#else
            std::string type_str("[\"sighting\"]");
#endif

            // Emit the separator before every record but the first, so stopping at MAX_RECS
            // (or at end of input) never leaves a dangling comma.
            if (total_recs)
                fprintf(pOut_file, ",\n");

            fprintf(pOut_file, "{\n");
            fprintf(pOut_file, " \"date\" : \"%s\",\n", json_date.c_str());
            if (json_end_date.size())
                fprintf(pOut_file, " \"end_date\" : \"%s\",\n", json_end_date.c_str());
            fprintf(pOut_file, " \"location\" : \"%s\",\n", escape_string_for_json(location_str).c_str());
            fprintf(pOut_file, " \"desc\" : \"%s\",\n", escape_string_for_json(rec).c_str());
            fprintf(pOut_file, " \"source_id\" : \"HallUFOE2_%u\",\n", total_recs);
            fprintf(pOut_file, " \"source\" : \"HallUFOEvidence2\",\n");
            fprintf(pOut_file, " \"ref\" : \"[The UFO Evidence by Richard H. Hall](https://www.amazon.com/UFO-Evidence-Richard-Hall/dp/0760706271)\",\n");
            if (type_str.size())
                fprintf(pOut_file, " \"type\" : %s\n", type_str.c_str());
            fprintf(pOut_file, "}");

#if USE_OPENAI
            fflush(pOut_file);
#endif

            total_recs++;
            if (total_recs == MAX_RECS)
                break;
        }
    }

    if (total_recs)
        fprintf(pOut_file, "\n");

    fprintf(pOut_file, "] }\n");
    fclose(pOut_file);

    return true;
}
// URLs that fix_bar_urls() rewrites to go through web.archive.org instead of being used directly.
// Matching is an exact string comparison, so entries must be spelled exactly as they appear in the data.
// NOTE(review): presumably these are dead or unreliable links (per the name) - the reason isn't recorded here.
static const char* g_bad_urls[] =
{
"https://www.thelivingmoon.com/41pegasus/12insiders/McDonnell%5FDouglas%5FUFO%5FStudies.html",
"https://ufo.com.br/artigos/relatos---piloto-se-arrisca-em-prova-de-fogo-no-parana.html",
"http://boblazardebunked.com/investigating-s4-e115/",
"http://boblazardebunked.com/real-area-51-tech/",
"https://history.nebraska.gov/sites/history.nebraska.gov/files/doc/publications/NH1979UFOs.pdf",
"http://alienhunter.org/about/becoming-the-alien-hunter/",
"http://brumac.8k.com/ChristmasTree/",
"http://dl.lilibook.ir/2016/03/Other-Tongues-Other-Flesh.pdf",
"http://documents.irevues.inist.fr/bitstream/handle/2042/51980/meteo%5F1995%5F11%5F8.pdf",
"http://files.afu.se/Downloads/Magazines/United%20Kingdom/%20Merseyside%20UFO%20Bulletin/Merseyside%20UFO%20Bulletin%20-%20Vol%201%20No%201%20-%201968.pdf",
"http://ufologie.patrickgross.org/1954/19oct1954criteuil.htm",
"http://www.bluebookarchive.org/page.aspx?PageCode=NARA-PBB85-812",
"http://www.bluebookarchive.org/page.aspx?PageCode=NARA-PBB85-813",
"http://www.bluebookarchive.org/page.aspx?PageCode=NARA-PBB85-816",
"http://www.bluebookarchive.org/page.aspx?PageCode=NARA-PBB92-607",
"http://www.bluebookarchive.org/page.aspx?pagecode=NARA-PBB90-354",
"http://www.cheniere.org/",
"http://www.historycommons.org/entity.jsp?entity=james%5Fo%5F%5Fconnell",
"http://www.islandone.org/LEOBiblio/SETI1.HTM",
"http://www.nicap.org/france74.gif",
"http://www.phils.com.au/hollanda.htm",
"http://www.spellconsulting.com/reality/Norway%5FSpiral.html",
"http://www.wsmr-history.org/HallOfFame52.htm",
"http://www.zanoverallsongs.com/bio/",
"https://area51specialprojects.com/area51%5Ftimeline.html",
"https://area51specialprojects.com/u2%5Fpilots.html",
"https://cieloinsolito.com/?p=724",
"https://cieloinsolito.com/wp-content/uploads/2019/09/heretheyare.pdf",
"https://clubdeleonescuernavacaac.club/1965-1966-joaquin-diaz-gonzalez/",
"https://dpo.tothestarsacademy.com/blog/crada-faq",
"https://files.afu.se/Downloads/Magazines/Switzerland/Weltraumbote/Weltraumbote%20-%20No%2001%20-%201955-1956.pdf",
"https://files.afu.se/Downloads/UFO%20reports/Scandinavia/GR%20translations/46-02-21%20Finland%20meteor/1946-02-21%20Report%20Meteor%20over%20Finland%20and%20east%20Sweden.docx",
"https://mauriziobaiata.net/2011/11/10/intervista-ad-eugenio-siragusa-1919-2006-la-verita-non-si-vende-e-non-si-compra/",
"https://psi-encyclopedia.spr.ac.uk/articles/helene-smith",
"https://trinitysecret.com/the-other-lessons-of-trinity/",
"https://ufologie.patrickgross.org/1954/17oct1954cier.htm",
"https://ufologie.patrickgross.org/1954/1oct1954bry.htm",
"https://ufologie.patrickgross.org/1954/4oct1954poncey.htm",
"https://ufoscoop.com/richard-c-doty/",
"https://worldhistoryproject.org/1948/12/20/project-twinkle-established-to-monitor-green-fireball-sightings",
"https://www.abqjournal.com/obits/profiles/0722057profiles11-07-09.htm",
"https://www.alternatewars.com/BBOW/ABC%5FWeapons/US%5FNuclear%5FStockpile.htm",
"https://www.amberley-books.com/community-james-p-templeton",
"https://www.barnesandnoble.com/w/creative-realism-rolf-alexander/1132618455",
"https://www.charlotteobserver.com/latest-news/article125658529.html",
"https://www.hotspotsz.com/former-reporter-recounts-ufo-tale/",
"https://www.modrall.com/attorney/r-e-thompson/",
"https://www.mondenouveau.fr/presence-extraterrestre-ummo-une-imposture-ou-pas/",
"https://www.ourstrangeplanet.com/the-san-luis-valley/guest-editorials/mad-cow-disease-and-cattle-mutilations/",
"https://www.realclearscience.com/articles/2017/12/02/how%5F17th%5Fcentury%5Fdreamers%5Fplanned%5Fto%5Freach%5Fthe%5Fmoon%5F110476.html",
"https://www.rhun.co.nz/files/cia/cia1/44%5Fcia%5Fciaall2.pdf",
"https://www.spacelegalissues.com/the-french-anti-ufo-municipal-law-of-1954/",
"https://www.thevoicebeforethevoid.net/incidentcomplaint-report-by-commander-44-missile-security-squadron-ellsworth-air-force-base-south-dakota/",
"https://www.uapsg.com/2020/07/argentina-ufo-declassification.html",
"https://www.webcitation.org/6mx4huFGk",
"https://www.webcitation.org/6mx4rfU20",
"https://www.webcitation.org/6mx5Youbh",
"https://zazenlife.com/2011/12/18/project-mk-ultra-the-c-i-as-experiments-with-mind-control/"
};
// Number of entries in g_bad_urls.
const uint32_t NUM_BAD_URLS = sizeof(g_bad_urls) / sizeof(g_bad_urls[0]);
static std::string fix_bar_urls(const std::string& url)
{
for (uint32_t i = 0; i < NUM_BAD_URLS; i++)
if (url == g_bad_urls[i])
return "https://web.archive.org/web/100/" + url;
return url;
}
// Converts the Eberhart timeline markdown files into "eberhart.json" (UTF-8 with BOM).
//
// Steps:
//  1. Read the five source .md files and drop blank lines.
//  2. Normalize the different year-heading spellings to "# <year>" and drop "* * *" rules.
//  3. Walk the lines: a "# <year>" line sets the current year; any other line starts an event
//     of the form "<date> <dash> <text>". Following lines are appended to the event until the
//     next heading or the next line that parses as a dated entry.
//  4. The last parenthesized group of each event is split off as its attribution ("ref").
//
// unique_urls: receives every link found in the event text (before bad-URL rewriting).
//
// Returns true on success. I/O and parse failures are reported through panic().
bool convert_eberhart(unordered_string_set& unique_urls)
{
    static const char* const s_src_filenames[] =
    {
        "ufo1_199.md",
        "ufo200_399.md",
        "ufo400_599.md",
        "ufo600_906_1.md",
        "ufo600_906_2.md"
    };

    // NOTE(review): read_text_file() is called repeatedly on the same vector, so it presumably
    // appends to `lines` rather than replacing its contents - confirm.
    string_vec lines;
    for (const char* pFilename : s_src_filenames)
    {
        if (!read_text_file(pFilename, lines))
            panic("Can't read file %s", pFilename);
    }

    // Drop blank lines; everything below assumes each line is non-empty.
    string_vec trimmed_lines;
    for (uint32_t i = 0; i < lines.size(); i++)
    {
        std::string s(lines[i]);
        string_trim(s);
        if (s.size())
            trimmed_lines.push_back(s);
    }
    lines.swap(trimmed_lines);

    // Normalize year headings to "# <year>".
    uint32_t cur_line = 0;

    string_vec new_lines;

    while (cur_line < lines.size())
    {
        std::string line(lines[cur_line]);

        bool all_digits = true;
        for (uint32_t i = 0; i < line.size(); i++)
        {
            if (!isdigit((uint8_t)line[i]))
            {
                all_digits = false;
                break;
            }
        }

        if (line == "* * *")
        {
            // Horizontal rule: dropped.
        }
        else if (all_digits && ((line.size() == 3) || (line.size() == 4)))
        {
            // A bare 3 or 4 digit line is a year heading.
            char buf[256];
            sprintf_s(buf, "# %s", line.c_str());
            new_lines.push_back(buf);
        }
        else if (line == "#")
        {
            // A lone "#" with the year on the following line.
            if ((cur_line + 1) >= lines.size())
                panic("Out of lines");

            if (!isdigit((uint8_t)lines[cur_line + 1][0]))
                panic("Can't find year");

            char buf[256];
            sprintf_s(buf, "# %s", lines[cur_line + 1].c_str());
            new_lines.push_back(buf);

            cur_line++;
        }
        else if ((line.size() >= 5) && (line[0] == '*') && (line[1] == '*'))
        {
            // "**<year>..." heading.
            uint32_t year = atoi(line.c_str() + 2);
            if ((year < 800) || (year > 2050))
                panic("Invalid year");

            char buf[256];
            sprintf_s(buf, "# %u", year);
            new_lines.push_back(buf);
        }
        else
            new_lines.push_back(line);

        cur_line++;
    }

    lines.swap(new_lines);

    //write_text_file("temp.txt", lines);

    FILE* pOut_file = ufopen("eberhart.json", "w");
    if (!pOut_file)
        panic("Can't open output file eberhart.json");

    fputc(UTF8_BOM0, pOut_file);
    fputc(UTF8_BOM1, pOut_file);
    fputc(UTF8_BOM2, pOut_file);

    fprintf(pOut_file, "{\n");
    fprintf(pOut_file, "\"Eberhart Timeline\" : [\n");

    int cur_year = -1;
    cur_line = 0;

    uint32_t event_num = 0, total_unattributed = 0;

    while (cur_line < lines.size())
    {
        std::string line(lines[cur_line]);
        cur_line++;

        if (!line.size())
            continue;

        if (line[0] == '#')
        {
            int year = atoi(line.c_str() + 1);
            if ((year < 100) || (year > 2050))
                panic("Invalid year");

            cur_year = year;
            continue;
        }

        // Split "<date> <dash> <text>" at the dash.
        // NOTE(review): this dash literal looks mis-encoded in this copy of the file. The "+ 3"
        // below expects a 3-byte UTF-8 sequence - restore the intended dash character.
        size_t dash_pos = line.find(u8"<EFBFBD>");
        if (dash_pos == std::string::npos)
            panic("Failed finding dash\n");

        std::string date(line);
        date.erase(dash_pos, date.size());
        string_trim(date);

        if (!date.size())
            panic("Date too small");

        line.erase(0, dash_pos + 3);
        string_trim(line);

        if (!line.size())
            panic("Line too small");

        event_date begin_date;
        event_date end_date;
        event_date alt_date;
        bool year_status = event_date::parse_eberhart_date_range(date, begin_date, end_date, alt_date, cur_year);
        if (!year_status)
            panic("Date parse failed");

        // Append following lines until the next heading or the next dated entry.
        std::string desc(line);

        while (cur_line < lines.size())
        {
            std::string temp(lines[cur_line]);

            if (temp[0] == '#')
                break;

            // A dash near the start of the line whose prefix parses as a date starts a new event.
            // NOTE(review): same apparently mis-encoded dash literal as above.
            size_t d = temp.find(u8"<EFBFBD>");

            const uint32_t DASH_THRESH_POS = 42;
            if ((d != std::string::npos) && (d < DASH_THRESH_POS))
            {
                std::string temp_date(temp);
                temp_date.erase(d, temp_date.size());
                string_trim(temp_date);

                event_date b, e, a;
                bool spec_year_status = event_date::parse_eberhart_date_range(temp_date, b, e, a, cur_year);
                if (spec_year_status)
                    break;
            }

            if ((desc.size()) && (desc.back() != '-'))
                desc += ' ';

            if (!lines[cur_line].size())
                panic("Unexpected empty line");

            desc += lines[cur_line];
            cur_line++;
        }

        // Split the trailing parenthesized attribution off the description.
        std::string ref;

        {
            markdown_text_processor mt;
            mt.init_from_markdown(desc.c_str());

            mt.fix_redirect_urls();

            // Record the links as found, then rewrite the known-bad ones.
            for (uint32_t i = 0; i < mt.m_links.size(); i++)
                unique_urls.insert(mt.m_links[i]);

            for (auto& str : mt.m_links)
                str = fix_bar_urls(str);

            markdown_text_processor a, b;
            bool status = mt.split_last_parens(a, b);
            if (!status)
            {
                uprintf("Failing text: %s\n", desc.c_str());

                panic("Unable to find attribution in record %u", event_num);
            }

            desc.resize(0);
            a.convert_to_markdown(desc, true);

            b.convert_to_markdown(ref, true);

            if (ref == "\\(\\)")
            {
                // Empty parentheses: no attribution given.
                ref.clear();
                total_unattributed++;
            }
            else if (ref.size() >= 4)
            {
                // Strip the escaped enclosing parentheses: "\(" ... "\)".
                if ((ref[0] == '\\') &&
                    (ref[1] == '(') &&
                    (ref.back() == ')') &&
                    (ref[ref.size() - 2] == '\\')
                    )
                {
                    ref.erase(0, 2);
                    ref.pop_back();
                    ref.pop_back();
                }
            }
        }

#if 0
        uprintf("## Event %u\n", event_num);

        uprintf("Date: %s \n", date.c_str());

        uprintf("(%i %i/%i/%i)-(%i %i,%i,%i) alt: (%i %i,%i,%i) \n",
            begin_date.m_prefix, begin_date.m_month, begin_date.m_day, begin_date.m_year,
            end_date.m_prefix, end_date.m_month, end_date.m_day, end_date.m_year,
            alt_date.m_prefix, alt_date.m_month, alt_date.m_day, alt_date.m_year);

        uprintf("%s \n\n", desc.c_str());
#endif

        // Emit the separator before every event but the first, so leftover non-event lines at
        // the end of the input can never leave a dangling comma.
        if (event_num)
            fprintf(pOut_file, ",\n");

        fprintf(pOut_file, "{\n");

        std::string json_date(begin_date.get_string()), json_end_date(end_date.get_string()), json_alt_date(alt_date.get_string());

        fprintf(pOut_file, " \"date\" : \"%s\",\n", json_date.c_str());

        if (json_end_date.size())
            fprintf(pOut_file, " \"end_date\" : \"%s\",\n", json_end_date.c_str());

        if (json_alt_date.size())
            fprintf(pOut_file, " \"alt_date\" : \"%s\",\n", json_alt_date.c_str());

        fprintf(pOut_file, " \"desc\" : \"%s\",\n", escape_string_for_json(desc).c_str());
        fprintf(pOut_file, " \"source_id\" : \"Eberhart_%u\",\n", event_num);

        fprintf(pOut_file, " \"source\" : \"EberhartUFOI\",\n");

        // "ref" is a single string when there is no attribution, otherwise a two-element array.
        if (!ref.size())
        {
            fprintf(pOut_file, " \"ref\" : \"[Eberhart](http://www.cufos.org/pdfs/UFOsandIntelligence.pdf)\"\n");
        }
        else
        {
            fprintf(pOut_file, " \"ref\" : [ \"[Eberhart](http://www.cufos.org/pdfs/UFOsandIntelligence.pdf)\", \"%s\" ]\n", escape_string_for_json(ref).c_str());
        }

        fprintf(pOut_file, "}");

        event_num++;
    }

    if (event_num)
        fprintf(pOut_file, "\n");

    fprintf(pOut_file, "] }\n");
    fclose(pOut_file);

    uprintf("Total records: %u\n", event_num);
    uprintf("Total unattributed: %u\n", total_unattributed);

    return true;
}
bool convert_johnson()
{
static const uint32_t days_in_month[] = { 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 };
FILE* pCombined_file = nullptr;
#if 0
pCombined_file = ufopen("combined.txt", "wb");
if (!pCombined_file)
panic("Can't open combined.txt");
fputc(UTF8_BOM0, pCombined_file);
fputc(UTF8_BOM1, pCombined_file);
fputc(UTF8_BOM2, pCombined_file);
#endif
FILE* pOut_file = ufopen("johnson.json", "w");
if (!pOut_file)
panic("Can't open output file johnson.json");
fputc(UTF8_BOM0, pOut_file);
fputc(UTF8_BOM1, pOut_file);
fputc(UTF8_BOM2, pOut_file);
fprintf(pOut_file, "{\n");
fprintf(pOut_file, "\"Johnson Timeline\" : [\n");
string_vec combined_lines;
uint32_t total_recs = 0;
for (uint32_t month = 0; month < 12; month++)
{
for (uint32_t day = 0; day < days_in_month[month]; day++)
{
char buf[256];
sprintf_s(buf, "Johnson/%s%02u.txt", g_full_months[month], day + 1);
string_vec lines;
bool utf8_flag = false;
bool status = read_text_file(buf, lines, true, &utf8_flag);
if (!status)
panic("Can't open file %s\n", buf);
uprintf("Read file %s %u\n", buf, utf8_flag);
string_vec filtered_lines;
bool found_end = false;
for (uint32_t line_index = 0; line_index < lines.size(); line_index++)
{
if ((lines[line_index].size() >= 2) && (lines[line_index][0] == '|') && (lines[line_index].back() == '|'))
{
std::string l(lines[line_index]);
l.erase(0, 1);
l.pop_back();
// Convert non-break spaces to spaces
std::string new_l;
for (uint32_t i = 0; i < l.size(); i++)
{
uint8_t c = (uint8_t)l[i];
if ((c == 0xC2) && ((i + 1) < l.size()) && ((uint8_t)l[i + 1] == 0xA0))
{
new_l.push_back(' ');
i++;
}
// EN DASH
else if ((c == 0xE2) && ((i + 2) < l.size()) && ((uint8_t)l[i + 1] == 0x80) && ((uint8_t)l[i + 2] == 0x93))
{
new_l.push_back('-');
i += 2;
}
else
new_l.push_back(l[i]);
}
l.swap(new_l);
string_trim(l);
// See if we're at the end of the page
if ((string_find_first(l, "Written by Donald A. Johnson") != -1) ||
(string_find_first(l, "Written by Donald Johnson") != -1) ||
(string_find_first(l, "Written by Donald A Johnson") != -1) ||
(string_find_first(l, "Compiled from the UFOCAT computer database") != -1) ||
(string_find_first(l, u8"<EFBFBD> Donald A. Johnson") != -1) ||
(string_begins_with(l, "Themes: ")))
{
found_end = true;
break;
}
filtered_lines.push_back(l);
}
}
lines.swap(filtered_lines);
if (!found_end)
panic("Couldn't find end");
if (lines.size() < 6)
panic("File too small");
if ((lines[0] != "[On This Day]") || (string_find_first(lines[1], "Encounters with Aliens on this Day") == -1))
panic("Couldn't find beginning");
lines.erase(lines.begin(), lines.begin() + 5);
//for (uint32_t i = 0; i < lines.size(); i++)
// uprintf("%04u: \"%s\"\n", i, lines[i].c_str());
//uprintf("\n");
string_vec new_lines;
for (uint32_t i = 0; i < lines.size(); i++)
{
if (lines[i].size() &&
(lines[i].back() == ']') &&
((string_find_first(lines[i], "[") == -1) || (lines[i].front() == '[')))
{
int back_delta_pos = -1;
for (int back_delta = 0; back_delta <= 3; back_delta++)
{
if ((int)i < back_delta)
break;
if (lines[i - back_delta].size() && lines[i - back_delta].front() == '[')
{
back_delta_pos = i - back_delta;
break;
}
}
bool find_extra_text = true;
if (back_delta_pos == -1)
{
for (int back_delta = 0; back_delta <= 3; back_delta++)
{
if ((int)i < back_delta)
break;
if (string_find_first(lines[i - back_delta], "[") != -1)
{
back_delta_pos = i - back_delta;
break;
}
}
if (back_delta_pos == -1)
panic("Can't find back delta");
find_extra_text = false;
}
if ((back_delta_pos < 2) || lines[back_delta_pos - 1].size())
{
new_lines.push_back(lines[i]);
continue;
}
if (find_extra_text)
{
back_delta_pos -= 2;
while (back_delta_pos && lines[back_delta_pos].size())
{
back_delta_pos--;
}
}
uint32_t total_lines_to_erase = (i + 1) - (back_delta_pos + 1);
//for (uint32_t j = 0; j < total_lines_to_erase; j++)
// uprintf("\"%s\"\n", new_lines[(new_lines.size() - total_lines_to_erase) + j].c_str());
new_lines.erase(new_lines.begin() + (new_lines.size() - total_lines_to_erase), new_lines.end());
}
else
{
new_lines.push_back(lines[i]);
}
}
lines.swap(new_lines);
#if 0
for (uint32_t i = 0; i < lines.size(); i++)
{
fwrite(lines[i].c_str(), lines[i].size(), 1, pCombined_file);
fwrite("\r\n", 2, 1, pCombined_file);
}
#endif
// Sanity checks
for (uint32_t i = 0; i < lines.size(); i++)
{
const std::string& line = lines[i];
if (line.size() < 7)
continue;
if (line.find_first_of('|') != std::string::npos)
{
panic("Bad line");
}
if (isdigit((uint8_t)line[0]) &&
isdigit((uint8_t)line[1]) &&
isdigit((uint8_t)line[2]) &&
isdigit((uint8_t)line[3]) &&
(line[4] == ',') && (i >= 1) && (lines[i - 1].size() == 0))
{
panic("Bad line");
}
if (string_begins_with(line, "In ") &&
(i >= 1) && (lines[i - 1].size() == 0))
{
panic("Bad line");
}
if ((string_begins_with(line, "On the ") ||
string_begins_with(line, "On this ") ||
string_begins_with(line, "That same ") ||
string_begins_with(line, "At dusk ") ||
string_begins_with(line, "At dawn ") ||
string_begins_with(line, "There were ") ||
string_begins_with(line, "A UFO was seen ") || string_begins_with(line, "An abduction occurred ") ||
string_begins_with(line, "Also in ") ||
string_begins_with(line, "There were a ") ||
(string_begins_with(line, "At ") && line.size() >= 4 && isdigit((uint8_t)line[3]))) &&
(i >= 1) && (lines[i - 1].size() == 0))
{
panic("Bad line");
}
if (isdigit((uint8_t)line[0]) &&
isdigit((uint8_t)line[1]) &&
isdigit((uint8_t)line[2]) &&
isdigit((uint8_t)line[3]) &&
(line[4] == ' ') &&
(line[5] == '-') &&
(line[6] != ' '))
{
panic("Bad line");
}
if (isdigit((uint8_t)line[0]) &&
isdigit((uint8_t)line[1]) &&
isdigit((uint8_t)line[2]) &&
isdigit((uint8_t)line[3]) &&
(line[4] == '-') &&
(line[5] == ' '))
{
panic("Bad line");
}
}
uint32_t cur_line = 0;
int prev_year = -1;
while (cur_line < lines.size())
{
std::string first_line = lines[cur_line];
if (first_line.size() < 8)
panic("First line in block too small");
if (!isdigit((uint8_t)first_line[0]) ||
!isdigit((uint8_t)first_line[1]) ||
!isdigit((uint8_t)first_line[2]) ||
!isdigit((uint8_t)first_line[3]) ||
(first_line[4] != ' ') ||
(first_line[5] != '-') ||
(first_line[6] != ' '))
{
panic("Bad begin block");
}
int year = atoi(first_line.c_str());
if ((year < 1000) || (year > 2020))
panic("Invalid year");
if (year < prev_year)
panic("Year went backwards");
prev_year = year;
first_line.erase(0, 7);
std::string record_text(first_line);
cur_line++;
while (cur_line < lines.size())
{
const std::string& next_line = lines[cur_line];
if (isdigit((uint8_t)next_line[0]) &&
isdigit((uint8_t)next_line[1]) &&
isdigit((uint8_t)next_line[2]) &&
isdigit((uint8_t)next_line[3]) &&
(next_line[4] == ' ') &&
(next_line[5] == '-') &&
(next_line[6] == ' '))
{
break;
}
if (record_text.size())
{
if (next_line.size() == 0)
record_text += "<br/><br/>";
else if (record_text.back() != '-')
record_text.push_back(' ');
}
record_text += next_line;
cur_line++;
}
if (string_ends_in(record_text, "<br/><br/>"))
{
record_text.erase(record_text.begin() + record_text.size() - 10, record_text.end());
}
string_trim(record_text);
if (pCombined_file)
{
char buf2[256];
sprintf_s(buf2, "Record %u, year %i:\r\n", total_recs, year);
fwrite(buf2, strlen(buf2), 1, pCombined_file);
fwrite(&record_text[0], record_text.size(), 1, pCombined_file);
fwrite("\r\n", 2, 1, pCombined_file);
}
std::string ref;
// Extract reference(s) at end of record.
size_t src_pos = record_text.find("(Source:");
if (src_pos == std::string::npos)
src_pos = record_text.find("(Sources:");
if ((src_pos != std::string::npos) &&
((record_text.back() == ')') || ((record_text.back() == '.') && (record_text[record_text.size() - 2] == ')')))
)
{
ref = record_text;
ref.erase(0, src_pos);
if (ref.back() == '.')
ref.pop_back();
assert(ref.back() == ')');
ref.pop_back();
assert(ref[0] == '(');
ref.erase(0, 1);
if (string_begins_with(ref, "Source:"))
ref.erase(0, 7);
else
ref.erase(0, 8);
string_trim(ref);
record_text.erase(src_pos, record_text.size() - src_pos);
}
else
{
//uprintf("%s\n\n", record_text.c_str());
}
if (total_recs)
fprintf(pOut_file, ",\n");
fprintf(pOut_file, "{\n");
char buf2[256];
sprintf_s(buf2, "%u/%u/%u", month + 1, day + 1, year);
fprintf(pOut_file, " \"date\" : \"%s\",\n", buf2);
fprintf(pOut_file, " \"desc\" : \"%s\",\n", escape_string_for_json(record_text).c_str());
fprintf(pOut_file, " \"source_id\" : \"Johnson_%u\",\n", total_recs);
fprintf(pOut_file, " \"source\" : \"Johnson\",\n");
if (ref.size())
{
fprintf(pOut_file, " \"ref\" : [ \"[Johnson](https://web.archive.org/web/http://www.ufoinfo.com/onthisday/calendar.html)\", \"%s\" ]\n",
escape_string_for_json(ref).c_str());
}
else
fprintf(pOut_file, " \"ref\" : \"[Johnson](https://web.archive.org/web/http://www.ufoinfo.com/onthisday/calendar.html)\"\n");
fprintf(pOut_file, "}");
total_recs++;
}
if (pCombined_file)
fflush(pCombined_file);
}
}
fprintf(pOut_file, "\n] }\n");
fclose(pOut_file);
printf("Total records: %u\n", total_recs);
if (pCombined_file)
fclose(pCombined_file);
return true;
}
static bool test_eberhart_date()
{
event_date b, e, a;
if (!event_date::parse_eberhart_date_range("Late summer", b, e, a, 1970)) return false;
assert(b.m_prefix == cLateSummer && b.m_year == 1970 && b.m_month == -1 && b.m_day == -1);
assert(e.m_prefix == cNoPrefix && e.m_year == -1 && e.m_month == -1 && e.m_day == -1);
assert(a.m_prefix == cNoPrefix && a.m_year == -1 && a.m_month == -1 && a.m_day == -1);
if (!event_date::parse_eberhart_date_range("Early winter", b, e, a, 1970)) return false;
assert(b.m_prefix == cEarlyWinter && b.m_year == 1970 && b.m_month == -1 && b.m_day == -1);
assert(e.m_prefix == cNoPrefix && e.m_year == -1 && e.m_month == -1 && e.m_day == -1);
assert(a.m_prefix == cNoPrefix && a.m_year == -1 && a.m_month == -1 && a.m_day == -1);
if (!event_date::parse_eberhart_date_range("Mid autumn", b, e, a, 1970)) return false;
assert(b.m_prefix == cMidAutumn && b.m_year == 1970 && b.m_month == -1 && b.m_day == -1);
assert(e.m_prefix == cNoPrefix && e.m_year == -1 && e.m_month == -1 && e.m_day == -1);
assert(a.m_prefix == cNoPrefix && a.m_year == -1 && a.m_month == -1 && a.m_day == -1);
if (!event_date::parse_eberhart_date_range("Mid autumn 1970 (or Late summer)", b, e, a, 1970)) return false;
assert(b.m_prefix == cMidAutumn && b.m_year == 1970 && b.m_month == -1 && b.m_day == -1);
assert(e.m_prefix == cNoPrefix && e.m_year == -1 && e.m_month == -1 && e.m_day == -1);
assert(a.m_prefix == cLateSummer && a.m_year == 1970 && a.m_month == -1 && a.m_day == -1);
if (!event_date::parse_eberhart_date_range("Mid-December", b, e, a, 1970)) return false;
assert(b.m_prefix == cMiddleOf && b.m_year == 1970 && b.m_month == 12 && b.m_day == -1);
assert(e.m_prefix == cNoPrefix && e.m_year == -1 && e.m_month == -1 && e.m_day == -1);
assert(a.m_prefix == cNoPrefix && a.m_year == -1 && a.m_month == -1 && a.m_day == -1);
if (!event_date::parse_eberhart_date_range("December 16?", b, e, a, 1970)) return false;
assert(b.m_prefix == cNoPrefix && b.m_year == 1970 && b.m_month == 12 && b.m_day == 16 && b.m_fuzzy);
assert(e.m_prefix == cNoPrefix && e.m_year == -1 && e.m_month == -1 && e.m_day == -1);
assert(a.m_prefix == cNoPrefix && a.m_year == -1 && a.m_month == -1 && a.m_day == -1);
if (!event_date::parse_eberhart_date_range("Fall 1970", b, e, a, 1970)) return false;
assert(b.m_prefix == cFall && b.m_year == 1970 && b.m_month == -1 && b.m_day == -1);
assert(e.m_prefix == cNoPrefix && e.m_year == -1 && e.m_month == -1 && e.m_day == -1);
assert(a.m_prefix == cNoPrefix && a.m_year == -1 && a.m_month == -1 && a.m_day == -1);
if (!event_date::parse_eberhart_date_range("January 27, 1970", b, e, a, 1970)) return false;
assert(b.m_prefix == cNoPrefix && b.m_year == 1970 && b.m_month == 1 && b.m_day == 27);
assert(e.m_prefix == cNoPrefix && e.m_year == -1 && e.m_month == -1 && e.m_day == -1);
assert(a.m_prefix == cNoPrefix && a.m_year == -1 && a.m_month == -1 && a.m_day == -1);
if (!event_date::parse_eberhart_date_range("January 1970", b, e, a, 1970)) return false;
assert(b.m_prefix == cNoPrefix && b.m_year == 1970 && b.m_month == 1 && b.m_day == -1);
assert(e.m_prefix == cNoPrefix && e.m_year == -1 && e.m_month == -1 && e.m_day == -1);
assert(a.m_prefix == cNoPrefix && a.m_year == -1 && a.m_month == -1 && a.m_day == -1);
if (!event_date::parse_eberhart_date_range("March-June", b, e, a, 1970)) return false;
assert(b.m_prefix == cNoPrefix && b.m_year == 1970 && b.m_month == 3 && b.m_day == -1);
assert(e.m_prefix == cNoPrefix && e.m_year == 1970 && e.m_month == 6 && e.m_day == -1);
assert(a.m_prefix == cNoPrefix && a.m_year == -1 && a.m_month == -1 && a.m_day == -1);
if (!event_date::parse_eberhart_date_range("Summer or Fall", b, e, a, 1970)) return false;
assert(b.m_prefix == cSummer && b.m_year == 1970 && b.m_month == -1 && b.m_day == -1);
assert(e.m_prefix == cNoPrefix && e.m_year == -1 && e.m_month == -1 && e.m_day == -1);
assert(a.m_prefix == cFall && a.m_year == 1970 && a.m_month == -1 && a.m_day == -1);
if (!event_date::parse_eberhart_date_range("Summer-Fall or Winter", b, e, a, 1970)) return false;
assert(b.m_prefix == cSummer && b.m_year == 1970 && b.m_month == -1 && b.m_day == -1);
assert(e.m_prefix == cFall && e.m_year == 1970 && e.m_month == -1 && e.m_day == -1);
assert(a.m_prefix == cWinter && a.m_year == 1970 && a.m_month == -1 && a.m_day == -1);
if (!event_date::parse_eberhart_date_range("January 5 or March 6", b, e, a, 1970)) return false;
assert(b.m_prefix == cNoPrefix && b.m_year == 1970 && b.m_month == 1 && b.m_day == 5);
assert(e.m_prefix == cNoPrefix && e.m_year == -1 && e.m_month == -1 && e.m_day == -1);
assert(a.m_prefix == cNoPrefix && a.m_year == 1970 && a.m_month == 3 && a.m_day == 6);
if (!event_date::parse_eberhart_date_range("Late January 1970-Summer 1971", b, e, a, 1970)) return false;
assert(b.m_prefix == cLate && b.m_year == 1970 && b.m_month == 1 && b.m_day == -1);
assert(e.m_prefix == cSummer && e.m_year == 1971 && e.m_month == -1 && e.m_day == -1);
assert(a.m_prefix == cNoPrefix && a.m_year == -1 && a.m_month == -1 && a.m_day == -1);
if (!event_date::parse_eberhart_date_range("January 5 or 6", b, e, a, 1970)) return false;
assert(b.m_prefix == cNoPrefix && b.m_year == 1970 && b.m_month == 1 && b.m_day == 5);
assert(e.m_prefix == cNoPrefix && e.m_year == -1 && e.m_month == -1 && e.m_day == -1);
assert(a.m_prefix == cNoPrefix && a.m_year == 1970 && a.m_month == 1 && a.m_day == 6);
if (!event_date::parse_eberhart_date_range("January 20-25", b, e, a, 1970)) return false;
assert(b.m_prefix == cNoPrefix && b.m_year == 1970 && b.m_month == 1 && b.m_day == 20);
assert(e.m_prefix == cNoPrefix && e.m_year == 1970 && e.m_month == 1 && e.m_day == 25);
assert(a.m_prefix == cNoPrefix && a.m_year == -1 && a.m_month == -1 && a.m_day == -1);
if (!event_date::parse_eberhart_date_range("Summer", b, e, a, 1970)) return false;
assert(b.m_prefix == cSummer && b.m_year == 1970 && b.m_month == -1 && b.m_day == -1);
assert(e.m_prefix == cNoPrefix && e.m_year == -1 && e.m_month == -1 && e.m_day == -1);
assert(a.m_prefix == cNoPrefix && a.m_year == -1 && a.m_month == -1 && a.m_day == -1);
if (!event_date::parse_eberhart_date_range("Summer-Winter", b, e, a, 1970)) return false;
assert(b.m_prefix == cSummer && b.m_year == 1970 && b.m_month == -1 && b.m_day == -1);
assert(e.m_prefix == cWinter && e.m_year == 1970 && e.m_month == -1 && e.m_day == -1);
assert(a.m_prefix == cNoPrefix && a.m_year == -1 && a.m_month == -1 && a.m_day == -1);
if (!event_date::parse_eberhart_date_range("Early", b, e, a, 1970)) return false;
assert(b.m_prefix == cEarly && b.m_year == 1970 && b.m_month == -1 && b.m_day == -1);
assert(e.m_prefix == cNoPrefix && e.m_year == -1 && e.m_month == -1 && e.m_day == -1);
assert(a.m_prefix == cNoPrefix && a.m_year == -1 && a.m_month == -1 && a.m_day == -1);
if (!event_date::parse_eberhart_date_range("1970", b, e, a, 1970)) return false;
assert(b.m_prefix == cNoPrefix && b.m_year == 1970 && b.m_month == -1 && b.m_day == -1);
assert(e.m_prefix == cNoPrefix && e.m_year == -1 && e.m_month == -1 && e.m_day == -1);
assert(a.m_prefix == cNoPrefix && a.m_year == -1 && a.m_month == -1 && a.m_day == -1);
if (!event_date::parse_eberhart_date_range("January 20-February 25", b, e, a, 1970)) return false;
assert(b.m_prefix == cNoPrefix && b.m_year == 1970 && b.m_month == 1 && b.m_day == 20);
assert(e.m_prefix == cNoPrefix && e.m_year == 1970 && e.m_month == 2 && e.m_day == 25);
assert(a.m_prefix == cNoPrefix && a.m_year == -1 && a.m_month == -1 && a.m_day == -1);
if (!event_date::parse_eberhart_date_range("February 25, 1970", b, e, a, 1970)) return false;
assert(b.m_prefix == cNoPrefix && b.m_year == 1970 && b.m_month == 2 && b.m_day == 25);
assert(e.m_prefix == cNoPrefix && e.m_year == -1 && e.m_month == -1 && e.m_day == -1);
assert(a.m_prefix == cNoPrefix && a.m_year == -1 && a.m_month == -1 && a.m_day == -1);
if (!event_date::parse_eberhart_date_range("January 20-February 25, 1971", b, e, a, 1970)) return false;
assert(b.m_prefix == cNoPrefix && b.m_year == 1970 && b.m_month == 1 && b.m_day == 20);
assert(e.m_prefix == cNoPrefix && e.m_year == 1971 && e.m_month == 2 && e.m_day == 25);
assert(a.m_prefix == cNoPrefix && a.m_year == -1 && a.m_month == -1 && a.m_day == -1);
if (!event_date::parse_eberhart_date_range("January 20 1970-February 25, 1971", b, e, a, 1970)) return false;
assert(b.m_prefix == cNoPrefix && b.m_year == 1970 && b.m_month == 1 && b.m_day == 20);
assert(e.m_prefix == cNoPrefix && e.m_year == 1971 && e.m_month == 2 && e.m_day == 25);
assert(a.m_prefix == cNoPrefix && a.m_year == -1 && a.m_month == -1 && a.m_day == -1);
// These should all fail
if (event_date::parse_eberhart_date_range("January 20q 1970-February 25, 1971", b, e, a, 1970)) return false;
if (event_date::parse_eberhart_date_range("Januaryq 20 1970-February 25, 1971", b, e, a, 1970)) return false;
if (event_date::parse_eberhart_date_range("", b, e, a, 1970)) return false;
if (event_date::parse_eberhart_date_range(" ", b, e, a, 1970)) return false;
if (event_date::parse_eberhart_date_range(" , ", b, e, a, 1970)) return false;
return true;
}
// Prints s through uprintf() on a single line: every newline character and every
// byte with value 1 is replaced by a space before printing.
static void print_nocr(const std::string& s)
{
    std::string flattened(s);

    for (char& ch : flattened)
    {
        if ((ch == '\n') || (ch == 1))
            ch = ' ';
    }

    uprintf("%s", flattened.c_str());
}
//-------------------------------------------------------------------
// Self-test for this file's helpers.
//
// Live checks: crc32() is compared against two known values with assert() (so these
// only run in builds where assert is active), then test_eberhart_date() is run and
// panic() is called if it returns false.
//
// Everything inside the "#if 0" regions is disabled debugging scaffolding (a soft-hyphen
// console experiment and a markdown_text_processor round-trip that ends in exit(0)).
static void converters_test()
{
// Only consumed by the disabled soft-hyphen experiment below.
std::string blah;
blah.push_back(ANSI_SOFT_HYPHEN);
#if 0
// should print a dash (code page 1252 - ANSI Latin 1)
putc((char)ANSI_SOFT_HYPHEN, stdout);
// should print a dash
uprintf("%s\n", wchar_to_utf8(utf8_to_wchar(blah, CP_ACP)).c_str());
#endif
//fprintf(u8"<22>frightening vision<6F>");
//ufprintf(stderr, u8"<22>frightening vision<6F>");
// Known-answer checks for crc32().
assert(crc32((const uint8_t*)"TEST", 4) == 0xeeea93b8);
assert(crc32((const uint8_t*)"408tdsfjdsfjsdh893!;", 20) == 0xa044e016);
// "return panic(...)" returns a void expression from this void function.
if (!test_eberhart_date()) return panic("test_eberhart_date failed!");
// rg hack hack
// Disabled markdown experiment: parses one sample string with markdown_text_processor,
// prints its internal text, converts it to plain text and back to markdown, then exits.
#if 0
//const char *p = "_Hello, [world](http://www.google.com)_ <br/><br/>This is a _test._ **COOL**\nBlah Blah\nZA ZB ZC \nZD ZE EF\nHDR\nThis is a test\n\nPara 1\n\nPara 2";
//const char* p = "Hello, [**world**](http://www.google.com). \nThis is a test.\n\nNew \\*paragraph\\*.";
//const char* p = "<br/><br/>[_B_](WWW.A.COM) **[C](WWW.B.COM)**<br/><br/>This is a test<br/><br/>Blah \nBlah \n\nNew (This is a test!).";
//bufprintf(pIn, "A\nB \nC\n_This is a blah_[XXXX](YYYY(S))");
//const char* p = u8R"(Chemist [Gustaf Ljunggren](https://www.google.com/url?q=https://en.wikipedia.org/wiki/Gustaf_Ljunggren_(chemist)&sa=D&source=editors&ust=1674889728009134&usg=AOvVaw2v_Cymx15I5Ic1eNEYeeBr)<29>of the Swedish National Defense Research Institute summarizes for the Swedish Defense staff his analysis of 27 finds of mysterious substances, allegedly from ghost rockets. None are radioactive and all have mundane explanations. (Anders Liljegren and Clas Svahn, <20>The Ghost Rockets,<2C> UFOs 1947<34>1987, Fortean Tomes, 1987, pp. 33<33>34))";
// const char* p = u8R"(Blah
//English clergyman and philosopher [_John Wilkins_](https://www.google.com/url?q=https://en.wikipedia.org/wiki/John_Wilkins&sa=D&source=editors&ust=1674889727243386&usg=AOvVaw1hw56rPPqRvDJzjdV0g8Zb) writes The Discovery of a World in the Moone, in which he highlights the similarities of the Earth and the Moon (seas, mountains, atmosphere) and concludes that the Moon is likely to be inhabited by living beings, whom the calls <20>Selenites.<2E> (Maria Avxentevskaya, <20>[How 17th Century](https://www.google.com/url?q=https://www.realclearscience.com/articles/2017/12/02/how_17th_century_dreamers_planned_to_reach_the_moon_110476.html&sa=D&source=editors&ust=1674889727243765&usg=AOvVaw13_nH4qqo0LYqJqnhq4_eI)<29>[Dreamers Planned to Reach the Moon,](https://www.google.com/url?q=https://www.realclearscience.com/articles/2017/12/02/how_17th_century_dreamers_planned_to_reach_the_moon_110476.html&sa=D&source=editors&ust=1674889727244030&usg=AOvVaw2K5FMN315Pjxq_xO7wp7Ga)<29> <br/><br/>Real Clear Science, December 2, 2017) )";
//const char* p = u8R"(Pierre Lagrange, <20>[_Agobard, la Magonie et les ovnis_,](https://www.google.com/url?q=https://pierrelagrangesociologie.files.wordpress.com/2020/08/lagrange-agobard-magonie-ufologie-lhistoire-440-2017-10-p28-29.pdf&sa=D&source=editors&ust=1674889727239396&usg=AOvVaw1U01Ykx3tRTQS4QKENJuGi)<29> Actualit<69>, no. 440 (October 2017): 28<32>29; Wikipedia, <20>[Magonia (mythology)](https://www.google.com/url?q=https://en.wikipedia.org/wiki/Magonia_(mythology)&sa=D&source=editors&ust=1674889727239728&usg=AOvVaw0JOQanVKKoRClyKQPK5SJi)<29>))";
const char* p = "<br/>blah<br/>_[Agobard,](www.blah.com)_<br/> blah<br/>blah <br/>[_Agobard_,](www.blah.com)<br/>";
//const char* p = "***[sssss](www.dddd.com)*** _Blah_ *Cool*_Zeek_";
//const char* p = "P1\nP2 \nP3\n\nP4\nP5\nP6\n\nP7\nP8 **Blah** _[ZEEK](WWW.Z.COM)_";
uprintf("Original markdown: %s\n", p);
uprintf("----------\n");
markdown_text_processor mt;
mt.init_from_markdown(p);
uprintf("Internal text: %s\n", mt.m_text.c_str());
uprintf("----------\n");
std::string plain;
mt.convert_to_plain(plain, false);
// NOTE(review): this prints mt.m_text, not the "plain" string filled in just above - confirm which was intended.
uprintf("Plain text: %s\n", mt.m_text.c_str());
uprintf("----------\n");
#if 0
markdown_text_processor a, b;
bool status = mt.split_last_parens(a, b);
uprintf("status: %u\n", status);
std::string a_md, b_md;
a.convert_to_markdown(a_md);
b.convert_to_markdown(b_md);
uprintf("A:\n%s\n", a_md.c_str());
uprintf("----------\n");
uprintf("B:\n%s\n", b_md.c_str());
#endif
#if 1
std::string md;
mt.convert_to_markdown(md, false);
#if 0
uprintf("Plain text:\n");
for (uint32_t i = 0; i < mt.m_text.size(); i++)
{
char c = mt.m_text[i];
if (c == ' ')
uprintf(".");
else
uprintf("%c", (uint8_t)c);
}
uprintf("%s\n", mt.m_text.c_str());
uprintf("\n------\n");
#endif
uprintf("Converted back to markdown:\n");
#if 0
for (uint32_t i = 0; i < md.size(); i++)
{
char c = md[i];
if (c == ' ')
uprintf(".");
else
uprintf("%c", (uint8_t)c);
}
#endif
uprintf("\n%s\n", md.c_str());
uprintf("\n------\n");
#endif
exit(0);
#endif
}
// Bit flags describing which kinds of special phrase occur in a date string.
// Every g_special_phrases entry carries exactly one of these in m_flag, and the
// tokenizer ORs together the flags of all phrases it matched.
enum
{
    cMonthFlag       = 1 << 0, // month name or abbreviation
    cBeginToLateFlag = 1 << 1, // "late", "early", "middle", "mid", "end"
    cSeasonFlag      = 1 << 2, // "spring", "summer", "autumn", "fall", "winter", "wint"
    cApproxFlag      = 1 << 3, // "approx"
    cOrFlag          = 1 << 4, // "or"
    cDashFlag        = 1 << 5, // "-"
    cXXFlag          = 1 << 6, // "xx"
    cFuzzyFlag       = 1 << 7, // "?"
    cSlashFlag       = 1 << 8  // "/"
};
// Table of the non-numeric phrases recognized by convert_nipcap_date()'s tokenizer.
//
// The entry ORDER is significant in two ways:
//  - The tokenizer walks the table from the top and takes the first entry for which
//    string_begins_with() succeeds on the remaining (lower-cased) input, so a longer
//    spelling has to come before any shorter one it starts with ("middle" before "mid",
//    "winter" before "wint", "jan." before "jan").
//  - A matched phrase is stored as the token -(index + 1) and later compared against the
//    cSpecial* enum, so the enum must list the same entries in the same order
//    (convert_nipcap_date() asserts cSpecialTotal == NUM_SPECIAL_PHRASES).
//
// Entries that omit m_month and/or m_date_prefix get those members zero-initialized.
static const struct
{
const char* m_pStr; // phrase text, lower case
uint32_t m_flag; // one of the c*Flag bits
uint32_t m_month; // 1-12 for month entries
date_prefix_t m_date_prefix; // prefix copied into event_date::m_prefix for qualifier/season entries
} g_special_phrases[] =
{
// Full month names.
{ "january", cMonthFlag, 1 },
{ "february", cMonthFlag, 2 },
{ "march", cMonthFlag, 3 },
{ "april", cMonthFlag, 4 },
{ "may", cMonthFlag, 5 },
{ "june", cMonthFlag, 6 },
{ "july", cMonthFlag, 7 },
{ "august", cMonthFlag, 8 },
{ "september", cMonthFlag, 9 },
{ "october", cMonthFlag, 10 },
{ "november", cMonthFlag, 11 },
{ "december", cMonthFlag, 12 },
// Month abbreviations with a trailing period.
{ "jan.", cMonthFlag, 1 },
{ "feb.", cMonthFlag, 2 },
{ "mar.", cMonthFlag, 3 },
{ "apr.", cMonthFlag, 4 },
// NOTE(review): presuming string_begins_with() is a plain prefix test, "may." can never be
// selected because the earlier "may" entry matches first - confirm whether that matters.
{ "may.", cMonthFlag, 5 },
{ "jun.", cMonthFlag, 6 },
{ "jul.", cMonthFlag, 7 },
{ "aug.", cMonthFlag, 8 },
{ "sep.", cMonthFlag, 9 },
{ "oct.", cMonthFlag, 10 },
{ "nov.", cMonthFlag, 11 },
{ "dec.", cMonthFlag, 12 },
// Part-of-period qualifiers.
{ "late", cBeginToLateFlag, 0, cLate },
{ "early", cBeginToLateFlag, 0, cEarly },
{ "middle", cBeginToLateFlag, 0, cMiddleOf },
{ "end", cBeginToLateFlag, 0, cEndOf },
// Seasons.
{ "spring", cSeasonFlag, 0, cSpring },
{ "summer", cSeasonFlag, 0, cSummer },
{ "autumn", cSeasonFlag, 0, cAutumn },
{ "fall", cSeasonFlag, 0, cFall },
{ "winter", cSeasonFlag, 0, cWinter },
{ "wint", cSeasonFlag, 0, cWinter },
{ "approx", cApproxFlag },
// Three-letter month abbreviations without a period.
{ "jan", cMonthFlag, 1 },
{ "feb", cMonthFlag, 2 },
{ "mar", cMonthFlag, 3 },
{ "apr", cMonthFlag, 4 },
{ "may", cMonthFlag, 5 },
{ "jun", cMonthFlag, 6 },
{ "jul", cMonthFlag, 7 },
{ "aug", cMonthFlag, 8 },
{ "sep", cMonthFlag, 9 },
{ "oct", cMonthFlag, 10 },
{ "nov", cMonthFlag, 11 },
{ "dec", cMonthFlag, 12 },
// Short form of "middle"; must stay after it.
{ "mid", cBeginToLateFlag, 0, cMiddleOf },
// Connectives and punctuation.
{ "or", cOrFlag },
{ "xx", cXXFlag },
{ "-", cDashFlag },
{ "?", cFuzzyFlag },
{ "/", cSlashFlag }
};
// Number of entries in g_special_phrases.
const uint32_t NUM_SPECIAL_PHRASES = sizeof(g_special_phrases) / sizeof(g_special_phrases[0]);
// Symbolic index of every g_special_phrases entry, listed in table order.
// A phrase token holds -(index + 1); get_special_from_token() turns it back into
// one of these values. Keep this list in step with the table.
enum
{
    // Full month names ("january" ... "december").
    cSpecialJan = 0, cSpecialFeb, cSpecialMar, cSpecialApr, cSpecialMay, cSpecialJun,
    cSpecialJul, cSpecialAug, cSpecialSep, cSpecialOct, cSpecialNov, cSpecialDec,

    // Abbreviations with a trailing period ("jan." ... "dec.").
    cSpecialJan2, cSpecialFeb2, cSpecialMar2, cSpecialApr2, cSpecialMay2, cSpecialJun2,
    cSpecialJul2, cSpecialAug2, cSpecialSep2, cSpecialOct2, cSpecialNov2, cSpecialDec2,

    // Part-of-period qualifiers ("late", "early", "middle", "end").
    cSpecialLate, cSpecialEarly, cSpecialMiddle, cSpecialEnd,

    // Seasons; cSpecialWinter2 is the "wint" spelling.
    cSpecialSpring, cSpecialSummer, cSpecialAutumn, cSpecialFall, cSpecialWinter, cSpecialWinter2,

    // "approx"
    cSpecialApprox,

    // Abbreviations without a period ("jan" ... "dec").
    cSpecialJan3, cSpecialFeb3, cSpecialMar3, cSpecialApr3, cSpecialMay3, cSpecialJun3,
    cSpecialJul3, cSpecialAug3, cSpecialSep3, cSpecialOct3, cSpecialNov3, cSpecialDec3,

    // "mid", "or", "xx", "-", "?", "/"
    cSpecialMid, cSpecialOr, cSpecialXX, cSpecialDash, cSpecialFuzzy, cSpecialSlash,

    // Number of entries above.
    cSpecialTotal
};
// Converts a special-phrase token back into its cSpecial* index.
// Phrase tokens are stored as -(index + 1), so a valid one is always negative;
// panic() is called for a non-negative token or for an index past the end of the enum.
static int get_special_from_token(int64_t tok)
{
    if (!(tok < 0))
        panic("Invalid token");

    const int64_t phrase_index = -tok - 1;

    if (!(phrase_index < cSpecialTotal))
        panic("Invalid token");

    return static_cast<int>(phrase_index);
}
static bool convert_nipcap_date(std::string date, event_date& begin_date, event_date& end_date, event_date& alt_date)
{
assert(cSpecialTotal == NUM_SPECIAL_PHRASES);
const uint32_t MIN_YEAR = 1860;
const uint32_t MAX_YEAR = 2012;
string_trim(date);
bool is_all_digits = true;
for (char& c : date)
{
if (c < 0)
return false;
c = (char)tolower((uint8_t)c);
if (!isdigit(c))
is_all_digits = false;
}
if (is_all_digits)
{
// Handle most common, simplest cases.
if (date.size() == 6)
{
// YYMMDD
int year = convert_hex_digit(date[0]) * 10 + convert_hex_digit(date[1]);
int month = convert_hex_digit(date[2]) * 10 + convert_hex_digit(date[3]);
int day = convert_hex_digit(date[4]) * 10 + convert_hex_digit(date[5]);
if (year <= 8)
year += 2000;
else
year += 1900;
if (month > 12)
return false;
if (day > 31)
return false;
begin_date.m_year = year;
if (month != 0)
{
begin_date.m_month = month ? month : -1;
if (day != 0)
begin_date.m_day = day;
}
else
{
if (day != 0)
return false;
}
return true;
}
else if (date.size() == 8)
{
// YYYYMMDD
int year = convert_hex_digit(date[0]) * 1000 + convert_hex_digit(date[1]) * 100 + convert_hex_digit(date[2]) * 10 + convert_hex_digit(date[3]);
int month = convert_hex_digit(date[4]) * 10 + convert_hex_digit(date[5]);
int day = convert_hex_digit(date[6]) * 10 + convert_hex_digit(date[7]);
if ((year < MIN_YEAR) || (year > MAX_YEAR))
return false;
if (month > 12)
return false;
if (day > 31)
return false;
if (month == 0)
{
if (day != 0)
return false;
begin_date.m_year = year;
}
else if (day == 0)
{
begin_date.m_year = year;
begin_date.m_month = month ? month : -1;
}
else
{
begin_date.m_year = year;
begin_date.m_month = month ? month : -1;
begin_date.m_day = day;
}
return true;
}
else
return false;
}
// Tokenize the input then only parse those cases we explictly support. Everything else is an error.
std::vector<int64_t> tokens;
std::vector<int> digits;
uint32_t special_flags = 0, cur_ofs = 0;
while (cur_ofs < date.size())
{
if (isdigit(date[cur_ofs]))
{
int64_t val = 0;
int num_digits = 0;
do
{
if (!isdigit(date[cur_ofs]))
break;
val = val * 10 + convert_hex_digit(date[cur_ofs++]);
if (val < 0)
return false;
num_digits++;
} while (cur_ofs < date.size());
tokens.push_back(val);
digits.push_back(num_digits);
}
else if (date[cur_ofs] == ' ')
{
cur_ofs++;
}
else
{
std::string cur_str(date.c_str() + cur_ofs);
int phrase_index;
for (phrase_index = 0; phrase_index < NUM_SPECIAL_PHRASES; phrase_index++)
if (string_begins_with(cur_str, g_special_phrases[phrase_index].m_pStr))
break;
if (phrase_index == NUM_SPECIAL_PHRASES)
return false;
tokens.push_back(-(phrase_index + 1));
digits.push_back(0);
cur_ofs += (uint32_t)strlen(g_special_phrases[phrase_index].m_pStr);
special_flags |= g_special_phrases[phrase_index].m_flag;
}
}
assert(tokens.size() == digits.size());
// Just not supporting slashes in here.
if (special_flags & cSlashFlag)
return false;
if (!tokens.size())
return false;
// First token must be a number
if ((digits[0] != 2) && (digits[0] != 4) && (digits[0] != 6) && (digits[0] != 8))
return false;
if (special_flags & cSeasonFlag)
{
// Either YYSeason or YYYYSeason
if (tokens.size() != 2)
return false;
int year = 0;
if (digits[0] == 2)
year = (int)tokens[0] + 1900;
else if (digits[0] == 4)
{
year = (int)tokens[0];
if ((year < MIN_YEAR) || (year > MAX_YEAR))
return false;
}
else
return false;
begin_date.m_year = year;
if (tokens[1] >= 0)
return false;
int64_t special_index = -tokens[1] - 1;
if (special_index >= NUM_SPECIAL_PHRASES)
return false;
begin_date.m_prefix = g_special_phrases[special_index].m_date_prefix;
return true;
}
else if (special_flags & cMonthFlag)
{
// Not supporting explicit month
return false;
}
// No explicit season and no month - handle XX's
if ((tokens.size() == 2) && (tokens[1] < 0) && (get_special_from_token(tokens[1]) == cSpecialXX))
{
if (digits[0] == 4)
{
// YYMMXX
int year = 1900 + (int)(tokens[0] / 100);
int month = (int)(tokens[0] % 100);
if (month > 12)
return false;
begin_date.m_year = year;
begin_date.m_month = month ? month : -1;
}
else if (digits[0] == 6)
{
// YYYYMMXX
int year = (int)(tokens[0] / 100);
if ((year < MIN_YEAR) || (year > MAX_YEAR))
return false;
int month = (int)(tokens[0] % 100);
if (month > 12)
return false;
begin_date.m_year = year;
begin_date.m_month = month ? month : -1;
}
else
{
return false;
}
return true;
}
else if ((tokens.size() == 3) && (tokens[1] < 0) && (tokens[2] < 0) && (get_special_from_token(tokens[1]) == cSpecialXX) && (get_special_from_token(tokens[2]) == cSpecialXX))
{
if (digits[0] == 2)
{
// YYXXXX
begin_date.m_year = (int)tokens[0] + 1900;
}
else if (digits[0] == 4)
{
// YYYYXXXX
begin_date.m_year = (int)tokens[0];
if ((begin_date.m_year < MIN_YEAR) || (begin_date.m_year > MAX_YEAR))
return false;
}
else
return false;
return true;
}
if (special_flags & cXXFlag)
return false;
if (digits[0] == 2)
{
// YY
begin_date.m_year = (int)tokens[0] + 1900;
}
else if (digits[0] == 4)
{
// YYMM
begin_date.m_year = (int)(tokens[0] / 100) + 1900;
begin_date.m_month = (int)(tokens[0] % 100);
if (begin_date.m_month > 12)
return false;
else if (!begin_date.m_month)
begin_date.m_month = -1;
}
else if (digits[0] == 6)
{
// YYMMDD
begin_date.m_year = (int)(tokens[0] / 10000) + 1900;
begin_date.m_month = (int)((tokens[0] / 100) % 100);
if (begin_date.m_month > 12)
return false;
else if (!begin_date.m_month)
begin_date.m_month = -1;
begin_date.m_day = (int)(tokens[0] % 100);
if (begin_date.m_day >= 31)
return false;
if ((begin_date.m_month == -1) && (begin_date.m_day))
return false;
}
else if (digits[0] == 8)
{
// YYYYMMDD
begin_date.m_year = (int)(tokens[0] / 10000);
if ((begin_date.m_year < MIN_YEAR) || (begin_date.m_year > MAX_YEAR))
return false;
begin_date.m_month = (int)((tokens[0] / 100) % 100);
if (begin_date.m_month > 12)
return false;
else if (!begin_date.m_month)
begin_date.m_month = -1;
begin_date.m_day = (int)(tokens[0] % 100);
if (begin_date.m_day >= 31)
return false;
if ((begin_date.m_month == -1) && (begin_date.m_day))
return false;
}
else
{
return false;
}
if ((tokens.size() == 2) && (tokens[1] < 0) &&
((get_special_from_token(tokens[1]) >= cSpecialLate) && (get_special_from_token(tokens[1]) <= cSpecialEnd) ||
(get_special_from_token(tokens[1]) == cSpecialMid))
)
{
// 2 tokens, ends in "late", "middle", "early" etc.
if (begin_date.m_day != -1)
return false;
if (tokens[1] >= 0)
return false;
int64_t special_index = -tokens[1] - 1;
if (special_index >= NUM_SPECIAL_PHRASES)
return false;
begin_date.m_prefix = g_special_phrases[special_index].m_date_prefix;
return true;
}
if (special_flags & cBeginToLateFlag)
return false;
if ((tokens.size() == 3) && (tokens[1] < 0) && (get_special_from_token(tokens[1]) == cSpecialDash) && (tokens[2] >= 0))
{
if ((digits[0] == 6) && (digits[2] == 2))
{
// YYMMDD-DD
end_date = begin_date;
if (tokens[2] > 31)
return false;
end_date.m_day = (int)tokens[2];
return true;
}
else if ((digits[0] == 4) && (digits[2] == 4))
{
// YYMM-YYMM
end_date.m_year = (int)(tokens[2] / 100) + 1900;
end_date.m_month = (int)(tokens[2] % 100);
if (end_date.m_month > 12)
return false;
else if (!end_date.m_month)
end_date.m_month = -1;
return true;
}
else if ((digits[0] == 6) && (digits[2] == 6))
{
// YYMMDD-YYMMDD
end_date.m_year = (int)(tokens[2] / 10000) + 1900;
end_date.m_month = (int)((tokens[2] / 100) % 100);
if (end_date.m_month > 12)
return false;
else if (!end_date.m_month)
end_date.m_month = -1;
end_date.m_day = (int)(tokens[2] % 100);
if (end_date.m_day >= 31)
return false;
return true;
}
else if ((digits[0] == 8) && (digits[2] == 8))
{
// YYYYMMDD-YYYYMMDD
end_date.m_year = (int)(tokens[2] / 10000);
if ((end_date.m_year < MIN_YEAR) || (end_date.m_year > MAX_YEAR))
return false;
end_date.m_month = (int)((tokens[2] / 100) % 100);
if (end_date.m_month > 12)
return false;
else if (!end_date.m_month)
end_date.m_month = -1;
end_date.m_day = (int)(tokens[2] % 100);
if (end_date.m_day >= 31)
return false;
if ((end_date.m_month == -1) && (end_date.m_day))
return false;
return true;
}
else
{
return false;
}
}
if (special_flags & cDashFlag)
return false;
if ((tokens.size() == 2) && (get_special_from_token(tokens[1]) == cSpecialFuzzy))
{
begin_date.m_fuzzy = true;
return true;
}
if (special_flags & cFuzzyFlag)
return false;
if ((tokens.size() == 3) && (get_special_from_token(tokens[1]) == cSpecialOr))
{
if ((digits[0] == 2) && (digits[2] == 2))
{
// YY or YY
alt_date.m_year = (int)tokens[2] + 1900;
return true;
}
else if ((digits[0] == 4) && (digits[2] == 4))
{
// YYMM or YYMM
alt_date.m_year = (int)(tokens[2] / 100) + 1900;
alt_date.m_month = (int)(tokens[2] % 100);
if (alt_date.m_month > 12)
return false;
else if (!alt_date.m_month)
alt_date.m_month = -1;
return true;
}
else if ((digits[0] == 6) && (digits[2] == 2))
{
// YYMMDD or DD
alt_date = begin_date;
if (tokens[2] > 31)
return false;
alt_date.m_day = (int)tokens[2];
return true;
}
else if ((digits[0] == 6) && (digits[2] == 6))
{
// YYMMDD or YYMMDD
alt_date.m_year = (int)(tokens[2] / 10000) + 1900;
alt_date.m_month = (int)((tokens[2] / 100) % 100);
if (alt_date.m_month > 12)
return false;
else if (!alt_date.m_month)
return false;
alt_date.m_day = (int)(tokens[2] % 100);
if (alt_date.m_day >= 31)
return false;
return true;
}
else
{
return false;
}
}
if (special_flags & cOrFlag)
return false;
if ((tokens.size() == 2) && ((tokens[1] < 0) && (get_special_from_token(tokens[1]) == cSpecialApprox)))
{
begin_date.m_approx = true;
return true;
}
if (special_flags & cApproxFlag)
return false;
if (tokens.size() > 1)
return false;
return true;
}
// Inclusive range of valid NICAP category numbers.
const int NICAP_FIRST_CAT = 1;
const int NICAP_LAST_CAT = 11;

// Display names of the NICAP categories. Category N is stored at index (N - NICAP_FIRST_CAT),
// so the table holds one entry per category in [NICAP_FIRST_CAT, NICAP_LAST_CAT].
static const char* g_nicap_categories[NICAP_LAST_CAT - NICAP_FIRST_CAT + 1] =
{
    "01 - Distant Encounters",
    "02 - Close Encounters",
    "03 - EME Cases",
    "04 - Animal Reactions",
    "05 - Medical Incidents",
    "06 - Trace Cases",
    "07 - Entity Cases",
    "08 - Photographic Cases",
    "09 - RADAR Cases",
    "10 - Nuclear Connection",
    "11 - Aviation Cases"
};
// URL fragments (page file names) that convert_nicap() searches for, as substrings, inside each
// link of a NICAP reference. A link containing any of these fragments is rewritten to go through
// "https://web.archive.org/web/100/". "%5F" is the URL-encoded form of '_'.
// NOTE(review): presumably these pages are no longer reachable at their original location - confirm.
static const char* g_nicap_archive_urls[] =
{
"040228USS%5FSupplydir.htm",
"421019guadalcanaldir.htm",
"4409xxoakridge%5Fdir.htm",
"470116northseadir.htm",
"470117northseadir.htm",
"470828japandir.htm",
"490723delphidir.htm",
"500110lasvegasdir.htm",
"500227coultervilledir.htm",
"500309selfridgedir.htm",
"500322%5F4925dir.htm",
"500529mtvernondir.htm",
"510709dir.htm",
"520425dir.htm",
"520501georgedir.htm",
"520512roswelldir.htm",
"520526koreadir.htm",
"520619goosedir.htm",
"520719wnsdir.htm",
"520723jamestown%5F01%5Fdir.htm",
"520723jamestown%5F02%5Fdir.htm",
"520729dir.htm",
"520729langley%5Fdir.htm",
"5207xxdir.htm",
"520807sanantondir.htm",
"530213carswelldir.htm",
"530416eprairiedir.htm",
"530618iwodir.htm",
"531016presqudir.htm",
"531228marysvilledir.htm",
"571102level%5Fdir.htm",
"571106b.htm",
"571110dir.htm",
"580505dir.htm",
"640629dir.htm",
"640904dir.htm",
"650916dir.htm",
"660314dir.htm",
"730202dir.htm",
"731120mtvernon%5Fdir.htm",
"770701avianodir.htm",
"7710xxfostoriadir.htm",
"801227rendledir.htm",
"821004ukraine%5Fdir.htm",
"861117alaskadir.htm",
"870823dir.htm",
"aca%5F540621.htm",
"adickesdir.htm",
"alamo670302dir.htm",
"alamos500225Bdir.htm",
"alamos520722dir.htm",
"albany520226dir.htm",
"albuq490106dir.htm",
"albuq510825dir.htm",
"albuq520510dir.htm",
"albuq520528dir.htm",
"ar-570730dir.htm",
"ar-641030dir.htm",
"ar-770126.htm",
"ar-hubbarddir.htm",
"arlington470707dir.htm",
"ashley480408dir.htm",
"balt520721dir.htm",
"barradir.htm",
"benson520403dir.htm",
"bermuda490124dir.htm",
"blackstone480724docs1.htm",
"bonlee501023dir.htm",
"candir.htm",
"canogapark571111dir.htm",
"cashlandir.htm",
"cavecreek471014dir.htm",
"chamble480726docs1.htm",
"colosprings520709dir.htm",
"columbus520618dir.htm",
"compass.htm",
"contrexeville761214dir.htm",
"cortez490127dir.htm",
"costarica1dir.htm",
"coynedir.htm",
"daggett.htm",
"dayton471020dir.htm",
"dayton520712dir.htm",
"dillon490403dir.htm",
"dmonthan490516dir.htm",
"dodgeville4710xxdir.htm",
"elko490502dir.htm",
"ellsworth.htm",
"en-590626dir.htm",
"f-86dir.htm",
"fairchild520120dir.htm",
"fayette491228dir.htm",
"fortdixdir.htm",
"france57.htm",
"ftbliss490519dir.htm",
"ftmyers501206dir.htm",
"ftrich470917dir.htm",
"ftsumner470710dir.htm",
"fulda520602dir.htm",
"george520501dir.htm",
"glenburnie520329dir.htm",
"gormandir.htm",
"greenville480419dir.htm",
"harmondir.htm",
"hawaii.htm",
"hawesville870722dir.htm",
"hickam490104dir.htm",
"holloman491213dir.htm",
"hood490427dir.htm",
"houston520520dir.htm",
"hynekrv6.htm",
"indy480731dir.htm",
"keesler520507dir.htm",
"keywest501114dir.htm",
"kirtland500321dir.htm",
"kirtland800809dir.htm",
"knoxville500301dir.htm",
"lmeade520402dir.htm",
"longmead520417dir.htm",
"louisville520615dir.htm",
"lubbock520802dir.htm",
"lukedir.htm",
"lynn520701dir.htm",
"mainbrace.htm",
"malmstrom75dir.htm",
"maple480829dir.htm",
"marshall520429dir.htm",
"mcchord520617dir.htm",
"meromdir.htm",
"minn.htm",
"minot681028dir.htm",
"mobile520722dir.htm",
"mongodir.htm",
"monroe480528dir.htm",
"moses520501dir.htm",
"mtvernon.htm",
"mufon%5Farg0809%5F83.htm",
"mufon%5Fau0513%5F83.htm",
"mufon%5Fau0520%5F83.htm",
"mufon%5Fau0722%5F83.htm",
"mufon%5Fbo0302%5F83.htm",
"mufon%5Fbr0612%5F83.htm",
"mufon%5Fgb0119%5F83.htm",
"mufon%5Fgb0315%5F83.htm",
"mufon%5Fgb0812%5F83.htm",
"mufon%5Fmx0000%5F83.htm",
"mufon%5Fru0714%5F83.htm",
"mufon%5Fru0826%5F83.htm",
"musk5307XXdir.htm",
"naha520422dir.htm",
"nahant520723dir.htm",
"natcity520513dir.htm",
"ncp-oakridge501013.htm",
"nederland660206dir.htm",
"nellisdir.htm",
"nkorea520526dir.htm",
"norwooddir.htm",
"oakridge501105dir.htm",
"oakridge511207dir.htm",
"ontario520412dir.htm",
"osceola520725dir.htm",
"paintsville020114dir.htm",
"palomar491014dir.htm",
"palomar491125dir.htm",
"phoenix520405dir.htm",
"phoenix520808dir.htm",
"placerville470814dir.htm",
"plymouth500427dir.htm",
"pontiac520427dir.htm",
"portagedir.htm",
"portland470704dir.htm",
"pottsdown520723dir.htm",
"ramore530630dir.htm",
"rd-750503dir.htm",
"reno490905dir.htm",
"reseda570328dir.htm",
"rockville520722dir.htm",
"rogue490524dir.htm",
"roseville520427dir.htm",
"roswell520217dir.htm",
"russia551014dir.htm",
"sanacacia480717dir.htm",
"sandiego521217dir.htm",
"sands480405.htm",
"sands490424dir.htm",
"sanmarcos520721dir.htm",
"savannah520510dir.htm",
"selfridge511124dir.htm",
"sharondir.htm",
"shreve520416dir.htm",
"springer490425dir.htm",
"stmaries470703dir.htm",
"sts80.htm",
"tehrandir.htm",
"temple520406dir.htm",
"terredir.htm",
"tucson490428dir.htm",
"ubatubadir.htm",
"vaughn47latedir.htm",
"vaughn481103dir.htm",
"vincennes.htm",
"walnutlake520525dir.htm",
"washington520713dir.htm",
"wsands2dir.htm",
"yuma520417dir.htm",
"yuma520427dir.htm",
"zamoradir.htm",
"charleston1980.htm"
};
// Number of entries in g_nicap_archive_urls, computed from the array so it tracks additions.
const uint32_t NUM_NICAP_ARCHIVE_URLS = sizeof(g_nicap_archive_urls) / sizeof(g_nicap_archive_urls[0]);
// Converts the NICAP listing stored in "nicap1_150.md" and "nicap151_334.md" into "nicap_db.json".
//
// Blank lines are dropped first. Each record is then parsed, in order, as:
//   date          - a NICAP date string, or a markdown link "[date](url)" (line starts with '[').
//   city          - one or more lines.
//   state/country - the single line directly before the category line.
//   category      - a 1-2 digit line in [NICAP_FIRST_CAT, NICAP_LAST_CAT], searched for in the
//                   5 lines that follow the first city line.
//   body          - every following line up to a "* * *" separator, a line starting with '[',
//                   or a line starting with two digits that parses as a NICAP date.
// The first few body lines may carry markers that are stripped from the description and
// reported in a trailing "(NICAP: ...)" suffix instead: "NC", "LC", a 1-2 letter code,
// "BB"/"BBU" or a numeric BB case number, and a single digit rating (0-5).
//
// A record whose date and description equal those of the previously emitted record is not
// emitted again; only its category is appended to the previous record's description.
//
// unique_urls receives every link found in a date line (after the web.archive.org rewrite).
// Any malformed input ends in panic(); the function returns true otherwise.
bool convert_nicap(unordered_string_set& unique_urls)
{
    // Line that terminates a record in the input.
    static const char* const pRecord_separator = "* * *";

    // Codes that may appear as the first body line of a record.
    static const char* const s_known_codes[] =
    {
        "A", "R", "En", "An", "C", "E", "D", "M", "I", "U", "N", "T", "aM"
    };

    // NOTE(review): the second read presumably appends to the lines of the first - confirm
    // against read_text_file().
    string_vec lines;
    bool utf8_flag = false;

    if (!read_text_file("nicap1_150.md", lines, true, &utf8_flag))
        panic("Failed reading input file");
    if (!utf8_flag)
        panic("Expecting utf8 data");

    if (!read_text_file("nicap151_334.md", lines, true, &utf8_flag))
        panic("Failed reading input file");
    if (!utf8_flag)
        panic("Expecting utf8 data");

    // Drop empty lines so the parser below only ever sees non-empty ones.
    string_vec new_lines;
    for (std::string& str : lines)
        if (str.size())
            new_lines.push_back(str);
    lines.swap(new_lines);

    if (!lines.size())
        panic("File empty");

    json js_doc = json::object();
    js_doc["NICAP Data"] = json::array();
    auto& js_doc_array = js_doc["NICAP Data"];

    // Date and undecorated description of the last emitted record, used to spot repeats.
    std::string prev_date, prev_orig_desc;

    uint32_t num_repeated_recs = 0;
    uint32_t record_index = 0;
    uint32_t cur_line_index = 0;

    while (cur_line_index < lines.size())
    {
        // ---- Date line, optionally a markdown link.
        std::string date(lines[cur_line_index]);
        if (!date.size())
            panic("Invalid date");
        cur_line_index++;

        std::string ref;
        if (date.front() == '[')
        {
            ref = date;

            markdown_text_processor mtp;
            mtp.init_from_markdown(ref.c_str());
            mtp.fix_redirect_urls();

            // Route links to the listed pages through web.archive.org.
            for (uint32_t i = 0; i < mtp.m_links.size(); i++)
            {
                std::string& s = mtp.m_links[i];

                uint32_t j;
                for (j = 0; j < NUM_NICAP_ARCHIVE_URLS; j++)
                    if (string_find_first(s, g_nicap_archive_urls[j]) != -1)
                        break;

                if (j < NUM_NICAP_ARCHIVE_URLS)
                    mtp.m_links[i] = "https://web.archive.org/web/100/" + mtp.m_links[i];
            }

            // URLs are recorded before fix_bar_urls() is applied to them.
            for (uint32_t i = 0; i < mtp.m_links.size(); i++)
                unique_urls.insert(mtp.m_links[i]);

            for (auto& str : mtp.m_links)
                str = fix_bar_urls(str);

            ref.clear();
            mtp.convert_to_markdown(ref, true);

            // 6+2+2+1 chars
            if (date.size() < 11)
                panic("Invalid date");

            // Reduce "[date](url)" to just "date".
            date.erase(0, 1);

            uint32_t i;
            bool found_end_bracket = false;
            for (i = 0; i < date.size(); i++)
            {
                if (date[i] == ']')
                {
                    date.erase(i, date.size() - i);
                    found_end_bracket = true;
                    break;
                }
            }

            if (!found_end_bracket)
                panic("Invalid date");
        }

        event_date begin_date, end_date, alt_date;
        bool status = convert_nipcap_date(date, begin_date, end_date, alt_date);
        if (!status)
            panic("Failed converting NICAP date");

        // ---- City (first line).
        if (cur_line_index == lines.size())
            panic("Out of lines");

        std::string city(lines[cur_line_index++]);
        if (!city.size() || (city == pRecord_separator))
            panic("Expected city");

        if (cur_line_index == lines.size())
            panic("Failed parsing NICAP data");

        // ---- Look ahead for category, 1-11.
        int cat_index = -1;
        int cat_line_index = -1;
        for (uint32_t i = 0; i < 5; i++)
        {
            if ((cur_line_index + i) >= lines.size())
                break;

            const std::string& s = lines[cur_line_index + i];
            if ((s.size() <= 2) && (string_is_digits(s)))
            {
                int val = atoi(s.c_str());
                if ((val >= NICAP_FIRST_CAT) && (val <= NICAP_LAST_CAT))
                {
                    cat_index = val;
                    cat_line_index = cur_line_index + i;
                    break;
                }
            }
        }

        if (cat_index == -1)
            panic("Can't find category");

        // Every line between the first city line and the state/country line extends the city.
        for (int i = cur_line_index; i < (cat_line_index - 1); i++)
            city = combine_strings(city, lines[i]);

        // ---- State or country, which is right before the category.
        std::string state_or_country(lines[cat_line_index - 1]);
        if (!state_or_country.size() || (state_or_country == pRecord_separator))
            panic("Expected state or country");

        cur_line_index = cat_line_index + 1;

        // ---- Record body.
        string_vec rec;
        while (cur_line_index < lines.size())
        {
            const std::string& str = lines[cur_line_index];

            if (str == pRecord_separator)
            {
                cur_line_index++;
                break;
            }
            else if (str.size() && str[0] == '[')
            {
                // A linked date line: the next record starts here.
                break;
            }

            if ((str.size() >= 2) && isdigit((uint8_t)str[0]) && isdigit((uint8_t)str[1]))
            {
                // A line that parses as a date also starts the next record.
                event_date next_begin_date, next_end_date, next_alt_date;
                bool spec_status = convert_nipcap_date(str, next_begin_date, next_end_date, next_alt_date);
                if (spec_status)
                    break;
            }

            rec.push_back(str);
            cur_line_index++;
        }

        if (!rec.size())
            panic("Failed parsing record");

        // ---- Strip the optional markers from the head of the body.
        std::string code, bb;
        int rating = -1;
        bool nc_flag = false, lc_flag = false;

        for (int i = 0; i < std::min(5, (int)rec.size()); i++)
        {
            if (rec[i] == "NC")
            {
                nc_flag = true;
                rec.erase(rec.begin() + i);
                break;
            }
        }

        for (int i = 0; i < std::min(5, (int)rec.size()); i++)
        {
            if (rec[i] == "LC")
            {
                lc_flag = true;
                rec.erase(rec.begin() + i);
                break;
            }
        }

        if (!rec.size())
            panic("Invalid record");

        // A purely alphabetic first line of at most 2 chars must be one of the known codes.
        if (string_is_alpha(rec[0]) && (rec[0].size() <= 2))
        {
            bool is_known_code = false;
            for (const char* pKnown_code : s_known_codes)
            {
                if (rec[0] == pKnown_code)
                {
                    is_known_code = true;
                    break;
                }
            }

            if (!is_known_code)
                panic("Unknown code");

            code = rec[0];
            rec.erase(rec.begin());
            if (!rec.size())
                panic("Invalid record");
        }

        // "BBU"/"BB" marker anywhere in the first 5 remaining lines.
        const int bb_search_limit = std::min(5, (int)rec.size());
        int bb_line = 0;
        for (; bb_line < bb_search_limit; bb_line++)
            if ((rec[bb_line] == "BBU") || (rec[bb_line] == "BB"))
                break;

        bool found_bbu = false;
        if (bb_line < bb_search_limit)
        {
            bb = rec[bb_line];
            rec.erase(rec.begin() + bb_line);
            if (!rec.size())
                panic("Invalid record");
            found_bbu = true;
        }

        // Single digit: rating.
        if (string_is_digits(rec[0]) && (rec[0].size() == 1))
        {
            int val = atoi(rec[0].c_str());
            if ((val < 0) || (val > 5))
                panic("Invalid rating");

            rating = val;
            rec.erase(rec.begin());
            if (!rec.size())
                panic("Invalid record");
        }

        // Two or more digits (and no BB/BBU marker seen): numeric BB case number.
        if (!found_bbu && string_is_digits(rec[0]) && (rec[0].size() >= 2))
        {
            int val = atoi(rec[0].c_str());
            if ((val >= 12) && (val <= 12607))
            {
                bb = rec[0];
                rec.erase(rec.begin());
                if (!rec.size())
                    panic("Invalid record");
            }
            else
            {
                panic("Bad BB #");
            }
        }

        // ---- Build the JSON record.
        json js;
        js["date"] = begin_date.get_string();
        if (end_date.is_valid())
            js["end_date"] = end_date.get_string();
        if (alt_date.is_valid())
            js["alt_date"] = alt_date.get_string();

        js["ref"] = json::array();
        if (ref.size())
            js["ref"].push_back("NICAP: " + ref);
        js["ref"].push_back("[NICAP Listing by Date PDF](http://www.nicap.org/NSID/NSID_DBListingbyDate.pdf)");

        if (state_or_country.size())
            js["location"] = city + ", " + state_or_country;
        else
            js["location"] = city;

        std::string desc;
        for (const std::string& s : rec)
            desc = combine_strings(desc, s);

        assert(cat_index >= 1 && cat_index <= 11);

        // Kept without the suffix so repeated records can be detected below.
        const std::string orig_desc(desc);

        desc = desc + string_format(" (NICAP: %s", g_nicap_categories[cat_index - 1]);
        if (code.size())
            desc = desc + string_format(", Code: %s", code.c_str());
        if (rating != -1)
            desc = desc + string_format(", Rating: %i", rating);
        if (bb.size())
            desc = desc + string_format(", BB: %s", bb.c_str());
        if (nc_flag)
            desc = desc + string_format(", NC");
        if (lc_flag)
            desc = desc + string_format(", LC");
        desc += ")";

        // Capitalize the first character; the >= 0 test skips bytes with the high bit set where char is signed.
        if ((desc[0] >= 0) && islower((uint8_t)desc[0]))
            desc[0] = (char)toupper((uint8_t)desc[0]);

        js["desc"] = desc;
        js["source"] = "NICAP_DB";
        js["source_id"] = "NICAP_DB" + string_format("_%u", record_index);

        if ((prev_orig_desc.size()) && (orig_desc == prev_orig_desc) && (js["date"] == prev_date))
        {
            // It's a repeated record, with just a different category.
            std::string new_desc(js_doc_array.back()["desc"]);
            new_desc += string_format(" (NICAP: %s)", g_nicap_categories[cat_index - 1]);
            js_doc_array.back()["desc"] = new_desc;

            num_repeated_recs++;
        }
        else
        {
            prev_date = js["date"];
            prev_orig_desc = orig_desc;

            js_doc_array.push_back(js);
        }

#if 0
        uprintf("Record: %u\n", record_index);
        uprintf("Date string: %s\n", date.c_str());
        uprintf("Date: %s\n", begin_date.get_string().c_str());
        if (end_date.is_valid())
            uprintf("End date: %s\n", end_date.get_string().c_str());
        if (alt_date.is_valid())
            uprintf("Alt date: %s\n", alt_date.get_string().c_str());
        if (ref.size())
            uprintf("Ref: %s\n", ref.c_str());
        uprintf("City: %s\n", city.c_str());
        uprintf("State or County: %s\n", state_or_country.c_str());
        uprintf("Category: %i\n", cat_index);
        if (code.size())
            uprintf("NICAP code: %s\n", code.c_str());
        if (rating != -1)
            uprintf("NICAP Rating: %u\n", rating);
        if (bb.size())
            uprintf("NICAP BB: %s\n", bb.c_str());
        if (nc_flag || lc_flag)
            uprintf("NICAP nc_flag: %u lc_flag: %u\n", nc_flag, lc_flag);
        uprintf("Record:\n");
        for (const std::string& s : rec)
            uprintf("%s\n", s.c_str());
        uprintf("\n");
#endif

        // Counts every parsed record, repeated ones included.
        record_index++;
    }

    if (!serialize_to_json_file("nicap_db.json", js_doc, true))
        panic("Failed serializing JSON file");

    uprintf("Processed %u records, skipping %u repeated records\n", record_index, num_repeated_recs);

    return true;
}
// Initialization entry point for this module: its only action is to call converters_test(),
// which is defined outside this section of the file.
// NOTE(review): presumably converters_test() is a self-test of the converter helpers - confirm.
void converters_init()
{
converters_test();
}