ufo_data/utils.cpp
2023-10-05 14:07:39 -04:00

1726 lines
44 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// utils.cpp
// Copyright (C) 2023 Richard Geldreich, Jr.
#include "utils.h"
#include "utf8.h"
#include "stem.h"
std::string combine_strings(std::string a, const std::string& b)
{
if (!a.size())
return b;
if (!b.size())
return a;
if (a.back() == '-')
{
if ((a.size() >= 2) && isdigit((uint8_t)a[a.size() - 2]))
{
}
else
{
a.pop_back();
a += b;
}
}
else
{
if (a.back() != ' ')
a += " ";
a += b;
}
return a;
}
std::wstring utf8_to_wchar(const std::string& str, UINT code_page)
{
if (str.empty())
return std::wstring();
int size_needed = MultiByteToWideChar(code_page, 0, &str[0], (int)str.size(), NULL, 0);
if (!size_needed)
return std::wstring();
std::wstring wstrTo(size_needed, 0);
int res = MultiByteToWideChar(code_page, 0, &str[0], (int)str.size(), &wstrTo[0], size_needed);
if (!res)
return std::wstring();
return wstrTo;
}
std::string wchar_to_utf8(const std::wstring& wstr, UINT code_page)
{
if (wstr.empty())
return std::string();
int size_needed = WideCharToMultiByte(code_page, 0, &wstr[0], (int)wstr.size(), NULL, 0, NULL, NULL);
if (!size_needed)
return std::string();
std::string strTo(size_needed, 0);
int res = WideCharToMultiByte(code_page, 0, &wstr[0], (int)wstr.size(), &strTo[0], size_needed, NULL, NULL);
if (!res)
return std::string();
return strTo;
}
static uint16_t g_codepage_437_to_unicode_0_31[32] =
{
' ', 0x263A, 0x263B, 0x2665, 0x2666, 0x2663, 0x2660, 0x2022,
0x25D8, 0x25CB, 0x25D9, 0x2642, 0x2640, 0x266A, 0x266B, 0x263C,
0x25BA, 0x25C4, 0x2195, 0x203C, 0x00B6, 0x00A7, 0x25AC, 0x21A8,
0x2191, 0x2193, 0x2192, 0x2190, 0x221F, 0x2194, 0x25B2, 0x25BC
};
static uint16_t g_codepage_437_to_unicode_128_255[129] =
{
0x2302,
0x00C7, 0x00FC, 0x00E9, 0x00E2, 0x00E4, 0x00E0, 0x00E5, 0x00E7,
0x00EA, 0x00EB, 0x00E8, 0x00EF, 0x00EE, 0x00EC, 0x00C4, 0x00C5,
0x00C9, 0x00E6, 0x00C6, 0x00F4, 0x00F6, 0x00F2, 0x00FB, 0x00F9,
0x00FF, 0x00D6, 0x00DC, 0x00A2, 0x00A3, 0x00A5, 0x20A7, 0x0192,
0x00C9, 0x00E6, 0x00C6, 0x00F4, 0x00F6, 0x00F2, 0x00FB, 0x00F9,
0x00FF, 0x00D6, 0x00DC, 0x00A2, 0x00A3, 0x00A5, 0x20A7, 0x0192,
0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556,
0x2555, 0x2563, 0x2551, 0x2557, 0x255D, 0x255C, 0x255B, 0x2510,
0x2514, 0x2534, 0x252C, 0x251C, 0x2500, 0x253C, 0x255E, 0x255F,
0x255A, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256C, 0x2567,
0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256B,
0x256A, 0x2518, 0x250C, 0x2588, 0x2584, 0x258C, 0x2590, 0x2580,
0x03B1, 0x00DF, 0x0393, 0x03C0, 0x03A3, 0x03C3, 0x00B5, 0x03C4,
0x03A6, 0x0398, 0x03A9, 0x03B4, 0x221E, 0x03C6, 0x03B5, 0x2229,
0x2261, 0x00B1, 0x2265, 0x2264, 0x2320, 0x2321, 0x00F7, 0x2248,
0x00B0, 0x2219, 0x00B7, 0x221A, 0x207F, 0x00B2, 0x25A0, 0x00A0
};
// Code page 437 to utf8. WideCharToMultiByte etc. doesn't do the expecting thing for chars<32, and we need them.
std::string dos_to_utf8(const std::string& str)
{
std::wstring wstr;
for (uint8_t c : str)
{
if (c < 32)
wstr.push_back(g_codepage_437_to_unicode_0_31[c]);
else if (c >= 127)
wstr.push_back(g_codepage_437_to_unicode_128_255[c - 127]);
else
wstr.push_back(c);
}
return wchar_to_utf8(wstr);
}
bool vformat(std::vector<char>& buf, const char* pFmt, va_list args)
{
uint32_t buf_size = 8192;
for (; ; )
{
buf.resize(buf_size);
int res = vsnprintf(&buf[0], buf.size(), pFmt, args);
if (res == -1)
{
assert(false);
return false;
}
if (res <= buf.size() - 1)
break;
buf_size *= 2;
if (buf_size > 16 * 1024 * 1024)
{
assert(false);
return false;
}
}
return true;
}
void ufprintf(FILE* pFile, const char* pFmt, ...)
{
std::vector<char> buf;
va_list args;
va_start(args, pFmt);
if (!vformat(buf, pFmt, args))
return;
va_end(args);
std::wstring wbuf(utf8_to_wchar(std::string(&buf[0])));
// Not thread safe, but we don't care
_setmode(_fileno(pFile), _O_U16TEXT);
fputws(&wbuf[0], pFile);
_setmode(_fileno(pFile), _O_TEXT);
}
void uprintf(const char* pFmt, ...)
{
std::vector<char> buf;
va_list args;
va_start(args, pFmt);
if (!vformat(buf, pFmt, args))
return;
va_end(args);
std::wstring wbuf(utf8_to_wchar(std::string(&buf[0])));
// Not thread safe, but we don't care
_setmode(_fileno(stdout), _O_U16TEXT);
fputws(&wbuf[0], stdout);
_setmode(_fileno(stdout), _O_TEXT);
}
std::string string_format(const char* pMsg, ...)
{
std::vector<char> buf;
va_list args;
va_start(args, pMsg);
if (!vformat(buf, pMsg, args))
return "";
va_end(args);
std::string res;
if (buf.size())
res.assign(&buf[0]);
return res;
}
void panic(const char* pMsg, ...)
{
char buf[4096];
va_list args;
va_start(args, pMsg);
vsnprintf(buf, sizeof(buf), pMsg, args);
va_end(args);
ufprintf(stderr, "%s", buf);
exit(EXIT_FAILURE);
}
FILE* ufopen(const char* pFilename, const char* pMode)
{
std::wstring wfilename(utf8_to_wchar(pFilename));
std::wstring wmode(utf8_to_wchar(pMode));
if (!wfilename.size() || !wmode.size())
return nullptr;
FILE* pRes = nullptr;
_wfopen_s(&pRes, &wfilename[0], &wmode[0]);
return pRes;
}
std::string& string_trim(std::string& str)
{
while (str.size() && isspace((uint8_t)str.back()))
str.pop_back();
while (str.size() && isspace((uint8_t)str[0]))
str.erase(0, 1);
return str;
}
std::string& string_trim_end(std::string& str)
{
while (str.size() && isspace((uint8_t)str.back()))
str.pop_back();
return str;
}
// Case sensitive, returns -1 if can't find
int string_find_first(const std::string& str, const char* pPhrase)
{
size_t res = str.find(pPhrase, 0);
if (res == std::string::npos)
return -1;
return (int)res;
}
// Case insensitive, returns -1 if can't find
int string_ifind_first(const std::string& str, const char* pPhrase)
{
const size_t str_size = str.size();
const size_t phrase_size = strlen(pPhrase);
assert((int)str_size == str_size);
assert((int)phrase_size == phrase_size);
assert(phrase_size);
if ((!str_size) || (!phrase_size) || (phrase_size > str_size))
return -1;
const size_t end_ofs = str_size - phrase_size;
for (size_t ofs = 0; ofs <= end_ofs; ofs++)
{
assert(ofs + phrase_size <= str_size);
if (_strnicmp(str.c_str() + ofs, pPhrase, phrase_size) == 0)
return (int)ofs;
}
return -1;
}
int string_icompare(const std::string& a, const char* pB)
{
const size_t a_len = a.size();
const size_t b_len = strlen(pB);
const size_t min_len = std::min(a_len, b_len);
for (size_t i = 0; i < min_len; i++)
{
const int ac = (uint8_t)utolower(a[i]);
const int bc = (uint8_t)utolower(pB[i]);
if (ac != bc)
return (ac < bc) ? -1 : 1;
}
if (a_len == b_len)
return 0;
return (a_len < b_len) ? -1 : 1;
}
bool string_begins_with(const std::string& str, const char* pPhrase)
{
const size_t str_len = str.size();
const size_t phrase_len = strlen(pPhrase);
assert(phrase_len);
if (str_len >= phrase_len)
{
if (_strnicmp(pPhrase, str.c_str(), phrase_len) == 0)
return true;
}
return false;
}
bool string_ends_in(const std::string& str, const char* pPhrase)
{
const size_t str_len = str.size();
const size_t phrase_len = strlen(pPhrase);
assert(phrase_len);
if (str_len >= phrase_len)
{
if (_stricmp(pPhrase, str.c_str() + str_len - phrase_len) == 0)
return true;
}
return false;
}
std::string encode_url(const std::string& url)
{
//const char* pValid_chars = ";,/?:@&=+$-_.!~*'()#";
//const size_t valid_chars_len = strlen(pValid_chars);
std::string res;
for (uint32_t i = 0; i < url.size(); i++)
{
uint8_t c = (uint8_t)url[i];
//const bool is_digit = (c >= 0) && (c <= '9');
//const bool is_upper = (c >= 'A') && (c <= 'Z');
//const bool is_lower = (c >= 'a') && (c <= 'z');
// Escape some problematic charactes that confuse some Markdown parsers (even after using Markdown '\' escapes)
if ((c == ')') || (c == '(') || (c == '_') || (c == '*'))
{
res.push_back('%');
res.push_back(to_hex(c / 16));
res.push_back(to_hex(c % 16));
continue;
}
res.push_back(c);
}
return res;
}
// TODO
uint32_t crc32(const uint8_t* pBuf, size_t size, uint32_t init_crc)
{
uint32_t crc = ~init_crc;
for (size_t i = 0; i < size; i++)
{
const uint32_t byte = pBuf[i];
crc = crc ^ byte;
for (int j = 7; j >= 0; j--)
{
uint32_t mask = -((int)(crc & 1));
crc = (crc >> 1) ^ (0xEDB88320 & mask);
}
}
return ~crc;
}
uint32_t hash_hsieh(const uint8_t* pBuf, size_t len)
{
if (!pBuf || !len)
return 0;
uint32_t h = static_cast<uint32_t>(len);
const uint32_t bytes_left = len & 3;
len >>= 2;
while (len--)
{
const uint16_t* pWords = reinterpret_cast<const uint16_t*>(pBuf);
h += pWords[0];
const uint32_t t = (pWords[1] << 11) ^ h;
h = (h << 16) ^ t;
pBuf += sizeof(uint32_t);
h += h >> 11;
}
switch (bytes_left)
{
case 1:
h += *reinterpret_cast<const signed char*>(pBuf);
h ^= h << 10;
h += h >> 1;
break;
case 2:
h += *reinterpret_cast<const uint16_t*>(pBuf);
h ^= h << 11;
h += h >> 17;
break;
case 3:
h += *reinterpret_cast<const uint16_t*>(pBuf);
h ^= h << 16;
h ^= (static_cast<signed char>(pBuf[sizeof(uint16_t)])) << 18;
h += h >> 11;
break;
default:
break;
}
h ^= h << 3;
h += h >> 5;
h ^= h << 4;
h += h >> 17;
h ^= h << 25;
h += h >> 6;
return h;
}
bool read_binary_file(const char* pFilename, uint8_vec& buf)
{
const uint64_t MAX_BINARY_FILE_LEN = 168ULL * 1024ULL * (1024ULL * 1024ULL);
FILE* pFile = ufopen(pFilename, "rb");
if (!pFile)
return false;
_fseeki64(pFile, 0, SEEK_END);
int64_t len = _ftelli64(pFile);
if (len < 0)
{
fclose(pFile);
return false;
}
_fseeki64(pFile, 0, SEEK_SET);
if (len > MAX_BINARY_FILE_LEN)
return false;
buf.resize(len);
if (fread(&buf[0], len, 1, pFile) != 1)
{
fclose(pFile);
return false;
}
fclose(pFile);
return true;
}
bool read_text_file(const char* pFilename, string_vec& lines, bool trim_lines, bool* pUTF8_flag)
{
FILE* pFile = ufopen(pFilename, "r");
if (!pFile)
return false;
bool first_line = true;
if (pUTF8_flag)
*pUTF8_flag = false;
while (!feof(pFile))
{
char buf[16384];
char* p = fgets(buf, sizeof(buf), pFile);
if (!p)
{
if (feof(pFile))
break;
fclose(pFile);
return false;
}
std::string str(p);
if (first_line)
{
first_line = false;
if ((str.size() >= 3) &&
((uint8_t)str[0] == UTF8_BOM0) &&
((uint8_t)str[1] == UTF8_BOM1) &&
((uint8_t)str[2] == UTF8_BOM2))
{
if (pUTF8_flag)
*pUTF8_flag = true;
str.erase(0, 3);
}
}
while (str.size() && ((str.back() == '\n') || (str.back() == '\r')))
str.pop_back();
if (trim_lines)
string_trim_end(str);
lines.push_back(str);
}
fclose(pFile);
return true;
}
bool read_text_file(const char* pFilename, std::vector<uint8_t>& buf, bool *pUTF8_flag)
{
if (pUTF8_flag)
*pUTF8_flag = false;
FILE* pFile = ufopen(pFilename, "rb");
if (!pFile)
{
ufprintf(stderr, "Failed reading file %s!\n", pFilename);
return false;
}
fseek(pFile, 0, SEEK_END);
uint64_t filesize = _ftelli64(pFile);
fseek(pFile, 0, SEEK_SET);
buf.resize(filesize + 1);
fread(&buf[0], 1, filesize, pFile);
fclose(pFile);
if ((buf.size() >= 3) &&
((uint8_t)buf[0] == UTF8_BOM0) &&
((uint8_t)buf[1] == UTF8_BOM1) &&
((uint8_t)buf[2] == UTF8_BOM2))
{
if (pUTF8_flag)
*pUTF8_flag = true;
buf.erase(buf.begin(), buf.begin() + 3);
}
return true;
}
bool write_text_file(const char* pFilename, const string_vec& lines, bool utf8_bom)
{
FILE* pFile = ufopen(pFilename, "wb");
if (!pFile)
return false;
if (utf8_bom)
{
if ((fputc(UTF8_BOM0, pFile) == EOF) || (fputc(UTF8_BOM1, pFile) == EOF) || (fputc(UTF8_BOM2, pFile) == EOF))
{
fclose(pFile);
return false;
}
}
for (uint32_t i = 0; i < lines.size(); i++)
{
if (lines[i].size())
{
if (fwrite(lines[i].c_str(), lines[i].size(), 1, pFile) != 1)
{
fclose(pFile);
return false;
}
}
if (fwrite("\r\n", 2, 1, pFile) != 1)
{
fclose(pFile);
return false;
}
}
if (fclose(pFile) == EOF)
return false;
return true;
}
bool serialize_to_json_file(const char* pFilename, const json& j, bool utf8_bom)
{
FILE* pFile = ufopen(pFilename, "wb");
if (!pFile)
return false;
if (utf8_bom)
{
if ((fputc(UTF8_BOM0, pFile) == EOF) || (fputc(UTF8_BOM1, pFile) == EOF) || (fputc(UTF8_BOM2, pFile) == EOF))
{
fclose(pFile);
return false;
}
}
std::string d(j.dump(2));
if (d.size())
{
if (fwrite(&d[0], d.size(), 1, pFile) != 1)
{
fclose(pFile);
return false;
}
}
fclose(pFile);
return true;
}
// Note: This doesn't actually handle utf8. It assumes ANSI (code page 252) text input.
static std::string extract_column_text(const std::string& str, uint32_t ofs, uint32_t len)
{
if (ofs >= str.size())
return "";
const uint32_t max_len = std::min((uint32_t)str.size() - ofs, len);
std::string res(str);
if (ofs)
res.erase(0, ofs);
if (max_len < res.size())
res.erase(max_len, res.size());
string_trim(res);
return res;
}
// Note: This doesn't actually handle utf8. It assumes ANSI (code page 252) text input.
bool load_column_text(const char* pFilename, std::vector<string_vec>& rows, std::string& title, string_vec& col_titles, bool empty_line_seps, const char* pExtra_col_text)
{
string_vec lines;
bool utf8_flag = false;
if (!read_text_file(pFilename, lines, true, &utf8_flag))
panic("Failed reading text file %s", pFilename);
if (utf8_flag)
panic("load_column_text() doesn't support utf8 yet");
if (!lines.size() || !lines[0].size())
panic("Expected title");
if (lines.size() < 5)
panic("File too small");
for (uint32_t i = 0; i < lines.size(); i++)
{
if (lines[i].find_first_of(9) != std::string::npos)
panic("Tab in file");
string_trim(lines[i]);
}
title = lines[0];
if (lines[1].size())
panic("Expected empty line");
std::string col_line = lines[2];
std::string col_seps = lines[3];
if ((!col_seps.size()) || (col_seps[0] != '-') || (col_seps.back() != '-'))
panic("Invalid column seperator line");
for (uint32_t i = 0; i < col_seps.size(); i++)
{
const uint8_t c = col_seps[i];
if ((c != ' ') && (c != '-'))
panic("Invalid column separator line");
}
int col_sep_start = 0;
std::vector< std::pair<uint32_t, uint32_t> > column_info; // start offset and len of each column in chars
for (uint32_t i = 1; i < col_seps.size(); i++)
{
const uint8_t c = col_seps[i];
if (c == ' ')
{
if (col_sep_start != -1)
{
column_info.push_back(std::make_pair(col_sep_start, i - col_sep_start));
col_sep_start = -1;
}
}
else
{
if (col_sep_start == -1)
col_sep_start = i;
}
}
if (col_sep_start != -1)
{
column_info.push_back(std::make_pair(col_sep_start, (uint32_t)col_seps.size() - col_sep_start));
col_sep_start = -1;
}
if (!column_info.size())
panic("No columns found");
col_titles.resize(column_info.size());
for (uint32_t i = 0; i < column_info.size(); i++)
{
col_titles[i] = col_line;
if (column_info[i].first)
col_titles[i].erase(0, column_info[i].first);
if (column_info[i].second > col_titles[i].size())
panic("invalid columns");
col_titles[i].erase(column_info[i].second, col_titles[i].size() - column_info[i].second);
string_trim(col_titles[i]);
}
for (uint32_t i = 0; i < column_info.size() - 1; i++)
column_info[i].second = column_info[i + 1].first - column_info[i].first;
column_info.back().second = 32000;
uint32_t cur_line = 4;
uint32_t cur_record_index = 0;
while (cur_line < lines.size())
{
string_vec rec_lines;
rec_lines.push_back(lines[cur_line++]);
if (empty_line_seps)
{
while (cur_line < lines.size())
{
if (!lines[cur_line].size())
break;
rec_lines.push_back(lines[cur_line++]);
}
// cur_line should be blank, or we're at the end of the file
if (cur_line < lines.size())
{
cur_line++;
if (cur_line < lines.size())
{
if (!lines[cur_line].size())
panic("Expected non-empty line");
}
}
}
//uprintf("%u:\n", cur_record_index);
//for (uint32_t i = 0; i < rec_lines.size(); i++)
// uprintf("%s\n", rec_lines[i].c_str());
string_vec col_lines(column_info.size());
for (uint32_t col_index = 0; col_index < column_info.size(); col_index++)
{
for (uint32_t l = 0; l < rec_lines.size(); l++)
{
std::string col_text(extract_column_text(rec_lines[l], column_info[col_index].first, column_info[col_index].second));
if (col_text.size())
{
if (col_lines[col_index].size())
{
if ((col_lines[col_index].back() != '-') && ((uint8_t)col_lines[col_index].back() != ANSI_SOFT_HYPHEN))
col_lines[col_index].push_back(' ');
else
{
if ((col_lines[col_index].size() >= 2) && (!isdigit((uint8_t)col_lines[col_index][col_lines[col_index].size() - 2])))
col_lines[col_index].pop_back();
}
}
col_lines[col_index] += col_text;
}
}
}
if (pExtra_col_text)
col_lines.push_back(pExtra_col_text);
// Convert from ANSI (code page 252) to UTF8.
for (auto& l : col_lines)
l = ansi_to_utf8(l);
rows.push_back(col_lines);
cur_record_index++;
}
return true;
}
bool invoke_curl(const std::string& args, string_vec& reply)
{
reply.clear();
remove("__temp.html");
// Invoke curl.exe
std::string cmd(string_format("curl.exe \"%s\" -o __temp.html", args.c_str()));
uprintf("Command: %s\n", cmd.c_str());
int status = system(cmd.c_str());
uprintf("curl returned status %i\n", status);
if (status != EXIT_SUCCESS)
return false;
// Read output file.
FILE* pFile = ufopen("__temp.html", "rb");
if (!pFile)
{
Sleep(50);
pFile = ufopen("__temp.html", "rb");
if (!pFile)
return false;
}
uint8_t buf[6] = { 0,0,0,0,0,0 };
fread(buf, 5, 1, pFile);
fclose(pFile);
// Try to detect some common binary file types
// PDF
if (memcmp(buf, "%PDF-", 5) == 0)
{
uprintf("PDF file detected\n");
std::string filename(args);
for (size_t i = filename.size() - 1; i >= 0; i--)
{
if (filename[i] == '/')
{
filename.erase(0, i + 1);
break;
}
}
std::string new_link_deescaped;
for (uint32_t i = 0; i < filename.size(); i++)
{
uint8_t c = filename[i];
if ((c == '%') && ((i + 2) < filename.size()))
{
int da = convert_hex_digit(filename[i + 1]);
int db = convert_hex_digit(filename[i + 2]);
if (da >= 0 && db >= 0)
{
int val = da * 16 + db;
new_link_deescaped.push_back((uint8_t)val);
}
i += 2;
}
else
new_link_deescaped.push_back(c);
}
rename("__temp.html", new_link_deescaped.c_str());
uprintf("Renamed __temp.html to %s\n", new_link_deescaped.c_str());
return true;
}
// JPEG
if (memcmp(buf, "\xFF\xD8\xFF\xE0", 4) == 0)
{
uprintf("JPEG file detected\n");
return true;
}
if (!read_text_file("__temp.html", reply, true, nullptr))
{
// Wait a bit and try again, rarely needed under Windows.
Sleep(50);
if (!read_text_file("__temp.html", reply, true, nullptr))
return false;
}
return true;
}
void convert_args_to_utf8(string_vec& args, int argc, wchar_t* argv[])
{
args.resize(argc);
for (int i = 0; i < argc; i++)
{
args[i] = wchar_to_utf8(argv[i]);
if (args[i].size() >= 2)
{
if ((args[i][0] == '\"') && (args[i].back() == '\"'))
{
args[i].pop_back();
args[i].erase(0, 1);
}
}
}
}
std::string string_slice(const std::string& str, size_t ofs, size_t len)
{
if (!len)
return "";
if (ofs > str.size())
{
assert(0);
return "";
}
const size_t max_len = str.size() - ofs;
len = std::min(len, max_len);
std::string res(str);
if (ofs)
res.erase(0, ofs);
if (len)
res.resize(len);
return res;
}
bool invoke_openai(const std::string& prompt, std::string& reply)
{
reply.clear();
// Write prompt to i.txt
FILE* pFile = ufopen("i.txt", "wb");
fwrite(prompt.c_str(), prompt.size(), 1, pFile);
fclose(pFile);
// Invoke openai.exe
int status = system("openai.exe i.txt o.txt");
if (status != EXIT_SUCCESS)
return false;
// Read output file.
string_vec lines;
if (!read_text_file("o.txt", lines, true, nullptr))
{
// Wait a bit and try again, rarely needed under Windows.
Sleep(50);
if (!read_text_file("o.txt", lines, true, nullptr))
return false;
}
// Skip any blank lines at the beginning of the reply.
uint32_t i;
for (i = 0; i < lines.size(); i++)
{
std::string s(lines[i]);
string_trim(s);
if (s.size())
break;
}
for (; i < lines.size(); i++)
reply += lines[i];
return true;
}
bool invoke_openai(const string_vec &prompt, string_vec &reply)
{
reply.clear();
if (!write_text_file("i.txt", prompt, true))
return false;
// Invoke openai.exe
const uint32_t MAX_TRIES = 3;
uint32_t num_tries;
for (num_tries = 0; num_tries < MAX_TRIES; ++num_tries)
{
if (num_tries)
uprintf("openai.exe failed - retrying\n");
int status = system("openai.exe i.txt o.txt");
if (status == EXIT_SUCCESS)
break;
Sleep(2000);
}
if (num_tries == MAX_TRIES)
return false;
// Read output file.
if (!read_text_file("o.txt", reply, true, nullptr))
{
// Wait a bit and try again, rarely needed under Windows.
Sleep(50);
if (!read_text_file("o.txt", reply, true, nullptr))
return false;
}
return true;
}
std::string get_deg_to_dms(double deg)
{
deg = std::round(fabs(deg) * 3600.0f);
int min_secs = (int)fmod(deg, 3600.0f);
deg = std::floor((deg - (double)min_secs) / 3600.0f);
int minutes = min_secs / 60;
int secs = min_secs % 60;
return string_format("%02i%:%02i:%02i", (int)deg, minutes, secs);
}
bool load_json_object(const char* pFilename, bool& utf8_flag, json &result_obj)
{
std::vector<uint8_t> buf;
if (!read_text_file(pFilename, buf, &utf8_flag))
return false;
if (!buf.size())
return false;
bool success = false;
try
{
result_obj = json::parse(buf.begin(), buf.end());
success = true;
}
catch (json::exception& e)
{
fprintf(stderr, "load_json_object: Parse of file \"%s\" failed (id %i): %s", pFilename, e.id, e.what());
return false;
}
if (!result_obj.is_object() && !result_obj.is_array())
return false;
return true;
}
void string_tokenize(
const std::string &str,
const std::string &whitespace,
const std::string &break_chars,
string_vec &tokens,
uint_vec *pOffsets_vec)
{
tokens.resize(0);
if (pOffsets_vec)
pOffsets_vec->resize(0);
std::string cur_token;
uint32_t cur_ofs = 0;
for (uint32_t i = 0; i < str.size(); i++)
{
uint8_t c = str[i];
if (whitespace.find_first_of(c) != std::string::npos)
{
if (cur_token.size())
{
tokens.push_back(cur_token);
if (pOffsets_vec)
pOffsets_vec->push_back(cur_ofs);
cur_token.clear();
}
}
else if (break_chars.find_first_of(c) != std::string::npos)
{
if (cur_token.size())
{
tokens.push_back(cur_token);
if (pOffsets_vec)
pOffsets_vec->push_back(cur_ofs);
cur_token.clear();
}
std::string s;
s.push_back(c);
tokens.push_back(s);
if (pOffsets_vec)
pOffsets_vec->push_back(i);
}
else
{
if (!cur_token.size())
cur_ofs = i;
cur_token.push_back(c);
}
}
if (cur_token.size())
{
tokens.push_back(cur_token);
if (pOffsets_vec)
pOffsets_vec->push_back(cur_ofs);
}
}
const double PI = 3.141592653589793238463;
double deg2rad(double deg)
{
return (deg * PI / 180.0);
}
double rad2deg(double rad)
{
return (rad * 180.0 / PI);
}
// input in degrees
double geo_distance(double lat1, double lon1, double lat2, double lon2, int unit)
{
if ((lat1 == lat2) && (lon1 == lon2))
return 0;
double theta = lon1 - lon2;
double dist = cos(deg2rad(lat1)) * cos(deg2rad(lat2)) * cos(deg2rad(theta)) + sin(deg2rad(lat1)) * sin(deg2rad(lat2));
dist = acos(dist);
dist = rad2deg(dist);
dist = dist * 60 * 1.1515;
switch (unit)
{
case 'M':
break;
case 'K':
dist = dist * 1.609344;
break;
case 'N':
dist = dist * 0.8684;
break;
default:
assert(0);
break;
}
return (dist);
}
std::string remove_bom(std::string str)
{
if (str.size() >= 3)
{
if (((uint8_t)str[0] == UTF8_BOM0) && ((uint8_t)str[1] == UTF8_BOM1) && ((uint8_t)str[2] == UTF8_BOM2))
{
str.erase(0, 3);
}
}
return str;
}
int get_next_utf8_code_point_len(const uint8_t* pStr)
{
if (pStr == nullptr || *pStr == 0)
{
// Return 0 if the input is null or points to a null terminator
return 0;
}
const uint8_t firstByte = *pStr;
if ((firstByte & 0x80) == 0)
{
// Starts with 0, ASCII character
return 1;
}
else if ((firstByte & 0xE0) == 0xC0)
{
// Starts with 110
return 2;
}
else if ((firstByte & 0xF0) == 0xE0)
{
// Starts with 1110
return 3;
}
else if ((firstByte & 0xF8) == 0xF0)
{
// Starts with 11110
return 4;
}
else
{
// Invalid UTF-8 byte sequence
return -1;
}
}
void get_string_words(
const std::string& str,
string_vec& words,
uint_vec* pOffsets_vec,
const char* pAdditional_whitespace)
{
const uint8_t* pStr = (const uint8_t *)str.c_str();
words.resize(0);
if (pOffsets_vec)
pOffsets_vec->resize(0);
std::string cur_token;
std::string whitespace(" \t\n\r,;:.!?()[]*/\"");
if (pAdditional_whitespace)
whitespace += std::string(pAdditional_whitespace);
int word_start_ofs = -1;
uint32_t cur_ofs = 0;
while ((cur_ofs < str.size()) && (pStr[cur_ofs]))
{
int l = get_next_utf8_code_point_len(pStr + cur_ofs);
const uint8_t c = pStr[cur_ofs];
if (l <= 0)
{
assert(0);
l = 1;
}
bool is_whitespace = (whitespace.find_first_of(c) != std::string::npos);
if ((l == 2) && (c == 0xc2))
{
// NO-BREAK SPACE
if (pStr[cur_ofs + 1] == 0xa0)
is_whitespace = true;
}
if ((l == 2) && (c == 0xCA))
{
// single left quote
if (pStr[cur_ofs + 1] == 0xBB)
is_whitespace = true;
}
if ((l == 3) && (c == 0xE2) && (pStr[cur_ofs + 1] == 0x80))
{
// dash
if (pStr[cur_ofs + 2] == 0x93)
is_whitespace = true;
// dash
else if (pStr[cur_ofs + 2] == 0x94)
is_whitespace = true;
// left quote
else if (pStr[cur_ofs + 2] == 0x9C)
is_whitespace = true;
// right quote
else if (pStr[cur_ofs + 2] == 0x9D)
is_whitespace = true;
// ellipsis (three dots)
else if (pStr[cur_ofs + 2] == 0xA)
is_whitespace = true;
// ellipsis (three dots)
else if (pStr[cur_ofs + 2] == 0xA6)
is_whitespace = true;
// long dash
else if (pStr[cur_ofs + 2] == 9)
is_whitespace = true;
// left single quote
else if (pStr[cur_ofs + 2] == 0x98)
is_whitespace = true;
// right single quote
else if (pStr[cur_ofs + 2] == 0x99)
is_whitespace = true;
// right double quote
else if (pStr[cur_ofs + 2] == 0x9D)
is_whitespace = true;
}
if (is_whitespace)
{
if (cur_token.size())
{
words.push_back(cur_token);
if (pOffsets_vec)
pOffsets_vec->push_back(word_start_ofs);
cur_token.clear();
word_start_ofs = -1;
}
}
else
{
if (word_start_ofs < 0)
word_start_ofs = cur_ofs;
if (l == 1)
{
cur_token.push_back(utolower(c));
}
else
{
for (int i = 0; i < l; i++)
cur_token.push_back(pStr[cur_ofs + i]);
}
}
cur_ofs += l;
}
if (cur_token.size())
{
words.push_back(cur_token);
if (pOffsets_vec)
pOffsets_vec->push_back(word_start_ofs);
}
}
void get_utf8_code_point_offsets(const char* pStr, int_vec& offsets)
{
uint32_t cur_ofs = 0;
offsets.resize(0);
while (pStr[cur_ofs])
{
offsets.push_back(cur_ofs);
cur_ofs += std::max<int>(1, get_next_utf8_code_point_len((const uint8_t*)pStr + cur_ofs));
}
}
struct char_map
{
const char32_t* m_pFrom;
const char m_to;
};
static const char_map g_char_norm_up[] =
{
{ U"ÁĂẮẶẰẲẴǍÂẤẬẦẨẪÄǞȦǠẠȀÀẢȂĀĄÅǺḀÃǼǢȺΆ", 'A' },
{ U"ḂḄḆƁƂƄ", 'B' },
{ U"ĆČÇḈĈĊƇȻƆ", 'C' },
{ U"ĎḐḒḊḌḎĐƉƊƋDZDzDŽ", 'D' },
{ U"ÉĔĚȨḜÊẾỆỀỂỄḘËĖẸȄÈẺȆĒḖḔĘẼḚÈÊËĒĔĖĘĚƐƎƏȄȆȨΈΉΕƐƐ", 'E' },
{ U"ḞƑ", 'F' },
{ U"ǴĞǦĢĜĠḠĜĞĠĢƓǤǦǴƔ", 'G' },
{ U"ḪȞḨĤḦḢḤĤĦǶȞΗǶ", 'H' },
{ U"ÍĬǏÎÏḮİỊȈÌỈȊĪĮĨḬÌÍÎÏĨĪĬĮİƗǏȈȊ", 'I' },
{ U"ĴĴ", 'J' },
{ U"ḰǨĶḲḴĶƘǨΚ", 'K' },
{ U"ĹĽĻḼḶḸḺĹĻĽĿŁΛ", 'L' },
{ U"ḾṀṂƜ", 'M' },
{ U"ŃŇŅṊṄṆǸṈÑÑŃŅŇŊƝǸΝ", 'N' },
{ U"ÓŎǑÔỐỘỒỔỖÖȪȮȰỌŐȌÒỎƠỚỢỜỞỠȎŌṒṐǪǬÕṌṎȬǾØÒÓÔÕÖØŌŎŐƟƠǑǪǬǾȌȎȪȬȮȰΌΟΩ", 'O' },
{ U"ṔṖΠΡΦ", 'P' },
{ U"ŔŘŖṘṚṜȐȒṞŔŖŘƦȐȒ", 'R' },
{ U"ŚṤŠṦŞŜȘṠṢṨߌŜŞŠƩȘΣ", 'S' },
{ U"ŤŢṰȚṪṬṮŢŤŦƬƮȚΤ", 'T' },
{ U"ÚŬǓÛṶÜǗǙǛǕṲỤŰȔÙỦƯỨỰỪỬỮȖŪṺŲŮŨṸṴÙÚÛÜŨŪŬŮŰŲƯǓǕǗǙǛȔȖ", 'U' },
{ U"ṾṼƲ", 'V' },
{ U"ẂŴẄẆẈẀŴ", 'W' },
{ U"ẌẊΧΞ", 'X' },
{ U"ÝŶŸẎỴỲỶȲỸÝŶŸƳȲΥΎΫ", 'Y' },
{ U"ŹŽẐŻẒẔŹŻŽƵƷǮȤΖ", 'Z' },
};
static const char_map g_char_norm_lower[] =
{
{ U"áăắặằẳẵǎâấậầẩẫäǟȧǡạȁàảȃāąåǻḁãǽǣⱥάàáâãäåāăąǎǟǡǻȁȃȧάα", 'a' },
{ U"ḃḅḇɓƃƅƀƃβƀƃƅ", 'b' },
{ U"ćčçḉĉċƈȼɔƈçćĉċčƈȼ", 'c' },
{ U"ďḑḓḋḍḏđɖɗƌdzdzdžƌďđƌdzdžȡďđƌdzdžȡ", 'd' },
{ U"éĕěȩḝêếệềểễḙëėẹȅèẻȇēḗḕęẽḛèêëēĕėęěɛǝəȅȇȩέήεɛɛèéêëēĕėęěȅȇȩε", 'e' },
{ U"ḟƒ", 'f' },
{ U"ǵğǧģĝġḡĝğġģɠǥǧǵɣĝğġģǧǵ", 'g' },
{ U"ḫȟḩĥḧḣḥẖĥħƕƕȟƕĥħȟ", 'h' },
{ U"íĭǐîïḯiịȉìỉȋīįĩḭìíîïĩīĭįiɨǐȉȋìíîïĩīĭįǐȉȋι", 'i' },
{ U"ǰĵĵǰĵǰ", 'j' },
{ U"ḱǩķḳḵķƙǩκƙķƙǩκ", 'k' },
{ U"ĺľļḽḷḹḻĺļľŀłƚƛλƚĺļľŀłƚλƚ", 'l' },
{ U"ḿṁṃɯ", 'm' },
{ U"ńňņṋṅṇǹṉññńņňŋɲǹνƞñńņňʼnŋƞǹη", 'n' },
{ U"óŏǒôốộồổỗöȫȯȱọőȍòỏơớợờởỡȏōṓṑǫǭõṍṏȭǿøòóôõöøōŏőɵơǒǫǭǿȍȏȫȭȯȱόοòóôõöøōŏőơǒǫǭǿȍȏȫȭȯȱοσ", 'o' },
{ U"ṕṗπφƥ", 'p' },
{ U"ŕřŗṙṛṝȑȓṟŕŗřʀȑȓρŕŗřȑȓρ", 'r' },
{ U"śṥšṧşŝșṡẛṣṩśŝşšʃșƨśŝşšșƨȿ", 's' },
{ U"ťţṱțẗṫṭṯţťŧƭʈțτƫţťŧƭțτ", 't' },
{ U"úŭǔûṷüǘǚǜǖṳụűȕùủưứựừửữȗūṻųůũṹṵùúûüũūŭůűųưǔǖǘǚǜȕȗưùúûüũūŭůűųưǔǖǘǚǜȕȗμ", 'u' },
{ U"ṿṽʋ", 'v' },
{ U"ẃŵẅẇẉẁẘŵŵω", 'w' },
{ U"ẍẋχξχξ", 'x' },
{ U"ýŷÿẏỵỳỷȳẙỹýŷÿƴȳυύϋƴýÿŷƴȳγψ", 'y' },
{ U"źžẑżẓẕźżžƶʒǯȥζƶźżžƶƹȥζ", 'z' },
};
std::map<int, int> g_upper_trans;
std::map<int, int> g_lower_trans;
static const char* g_stop_words[] =
{
"a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as",
"at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "can",
"could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from",
"further", "had", "has", "have", "having", "he", "her", "here", "hers", "herself", "him", "himself",
"his", "how", "i", "if", "in", "into", "is", "it", "its", "itself", "just", "me", "more", "most",
"my", "myself", "no", "nor", "not", "now", "of", "off", "on", "once", "only", "or", "other", "our",
"ours", "ourselves", "out", "over", "own", "re", "same", "she", "should", "so", "some", "such",
"than", "that", "the", "their", "theirs", "them", "themselves", "then", "there", "these", "they",
"this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "were", "what",
"when", "where", "which", "while", "who", "whom", "why", "will", "with", "you", "your", "yours",
"yourself", "yourselves", "although", "also", "already", "another", "seemed", "seem", "seems"
};
static const uint32_t NUM_STOP_WORDS = (uint32_t)std::size(g_stop_words);
std::set<std::string> g_stop_words_set;
void init_norm()
{
g_stop_words_set.clear();
for (const auto& str : g_stop_words)
g_stop_words_set.insert(str);
for (uint32_t i = 0; i < std::size(g_char_norm_up); i++)
{
const char32_t* pFrom = g_char_norm_up[i].m_pFrom;
char to_char = g_char_norm_up[i].m_to;
while (*pFrom)
{
char32_t fc = *pFrom++;
auto f = g_upper_trans.find(fc);
if (f != g_upper_trans.end())
{
if (f->second != to_char)
{
uprintf("Upper char %u 0x%x is redundant\n", fc, fc);
exit(1);
}
}
g_upper_trans[fc] = to_char;
}
}
for (uint32_t i = 0; i < std::size(g_char_norm_lower); i++)
{
const char32_t* pFrom = g_char_norm_lower[i].m_pFrom;
char to_char = g_char_norm_lower[i].m_to;
while (*pFrom)
{
char32_t fc = *pFrom++;
auto f = g_upper_trans.find(fc);
if (f != g_upper_trans.end())
{
uprintf("Lower char %u 0x%x is in the upper table\n", fc, fc);
if (utolower((uint8_t)f->second) != to_char)
uprintf("Conversion mismatch %u 0x%x\n", fc, fc);
//exit(1);
}
f = g_lower_trans.find(fc);
if (f != g_lower_trans.end())
{
if (f->second != to_char)
{
uprintf("Lower char %u 0x%x is redundant\n", fc, fc);
exit(1);
}
}
g_lower_trans[fc] = to_char;
}
}
}
// Resulting characters are guaranteed to be <128 - useful for searching purposes.
// Unrecognized Unicode characters are deleted.
void normalize_diacritics(const char* pStr, std::string& res)
{
assert(g_stop_words_set.size());
res.resize(0);
while (*pStr)
{
int l = get_next_utf8_code_point_len((const uint8_t*)pStr);
const uint8_t c = *pStr;
utf8_int32_t cp;
char* pStr_next = utf8codepoint(pStr, &cp);
assert((pStr_next - pStr) == l);
if (cp < 128)
{
res.push_back((char)cp);
pStr = pStr_next;
continue;
}
int new_char = -1;
auto u_it = g_upper_trans.find(cp);
auto l_it = g_lower_trans.find(cp);
if (u_it != g_upper_trans.end())
new_char = u_it->second;
else if (l_it != g_lower_trans.end())
new_char = l_it->second;
else
{
// FIXME: this is lame, it parses the utf8 directly.
if ((l == 2) && (c == 0xc2))
{
// NO-BREAK SPACE
if ((uint8_t)pStr[1] == 0xa0)
new_char = ' ';
}
if ((l == 2) && (c == 0xCA))
{
// single left quote
if ((uint8_t)pStr[1] == 0xBB)
new_char = '\'';
}
if ((l == 3) && (c == 0xE2) && ((uint8_t)pStr[1] == 0x80))
{
// dash
if ((uint8_t)pStr[2] == 0x93)
new_char = '-';
// dash
else if ((uint8_t)pStr[2] == 0x94)
new_char = '-';
// left quote
else if ((uint8_t)pStr[2] == 0x9C)
new_char = '"';
// right quote
else if ((uint8_t)pStr[2] == 0x9D)
new_char = '"';
// ellipsis (three dots)
else if ((uint8_t)pStr[2] == 0xA)
new_char = '.';
// ellipsis (three dots)
else if ((uint8_t)pStr[2] == 0xA6)
new_char = '.';
// long dash
else if ((uint8_t)pStr[2] == 9)
new_char = '-';
// left single quote
else if ((uint8_t)pStr[2] == 0x98)
new_char = '\'';
// right single quote
else if ((uint8_t)pStr[2] == 0x99)
new_char = '\'';
// right double quote
else if ((uint8_t)pStr[2] == 0x9D)
new_char = '"';
}
}
// TODO: Do something smarter?
if (new_char != -1)
res.push_back((char)new_char);
pStr = pStr_next;
}
}
std::string normalize_word(const std::string& str)
{
assert(g_stop_words_set.size());
const uint32_t MAX_STRING_SIZE = 4096;
if (str.size() > MAX_STRING_SIZE)
panic("String too long");
char buf[MAX_STRING_SIZE + 1];
strcpy_s(buf, sizeof(buf), str.c_str());
// Convert utf8 string to lower
utf8lwr(buf);
// Remove diacritics and some specials from utf8, this preserves all 1-127 chars
std::string norm;
norm.reserve(strlen(buf));
normalize_diacritics(buf, norm);
// Remove any non-letter or non-digit characters (we assume this is a word, so whitespace gets removed too)
std::string temp;
temp.reserve(norm.size());
for (uint32_t i = 0; i < norm.size(); i++)
{
uint8_t c = norm[i];
c = utolower(c);
if (uislower(c) || uisdigit(c))
temp.push_back(c);
}
// Stem word
strcpy_s(buf, sizeof(buf), temp.c_str());
if (buf[0])
{
int32_t new_len = stem(buf, 0, (int)strlen(buf) - 1);
buf[new_len + 1] = '\0';
}
return buf;
}
// Assumes word is plain ASCII lowercase
bool is_stop_word(const std::string &word)
{
assert(g_stop_words_set.size());
return g_stop_words_set.count(word) != 0;
}
std::string ustrlwr(const std::string& s)
{
const size_t l = s.size();
std::vector<uint8_t> temp;
temp.resize(l + 1);
memcpy(&temp[0], s.c_str(), l);
temp[l] = '\0';
utf8lwr((char *)&temp[0]);
return (char *)&temp[0];
}
std::string string_replace(const std::string& str, const std::string& find, const std::string& repl)
{
assert(find.size());
if (!find.size() || !str.size())
return str;
const uint8_t* pStr = (const uint8_t *)str.c_str();
const size_t str_size = str.size();
const uint8_t* pFind = (const uint8_t*)find.c_str();
const size_t find_size = find.size();
std::string res;
res.reserve(str.size());
size_t str_ofs = 0;
while (str_ofs < str.size())
{
int str_char_size = get_next_utf8_code_point_len(pStr + str_ofs);
if (str_char_size < 0)
{
assert(0);
str_char_size = 1;
}
const size_t str_remaining = str_size - str_ofs;
if ((str_remaining >= find_size) && (memcmp(pStr + str_ofs, pFind, find_size) == 0))
{
res += repl;
str_ofs += find_size;
}
else
{
for (int i = 0; i < str_char_size; i++)
res.push_back((char)pStr[str_ofs + i]);
str_ofs += str_char_size;
}
}
return res;
}
bool does_file_exist(const char* pFilename)
{
FILE* pFile = ufopen(pFilename, "rb");
if (!pFile)
return false;
fclose(pFile);
return true;
}