// utils.cpp // Copyright (C) 2023 Richard Geldreich, Jr. #include "utils.h" #include "utf8.h" #include "stem.h" std::string combine_strings(std::string a, const std::string& b) { if (!a.size()) return b; if (!b.size()) return a; if (a.back() == '-') { if ((a.size() >= 2) && isdigit((uint8_t)a[a.size() - 2])) { } else { a.pop_back(); a += b; } } else { if (a.back() != ' ') a += " "; a += b; } return a; } std::wstring utf8_to_wchar(const std::string& str, UINT code_page) { if (str.empty()) return std::wstring(); int size_needed = MultiByteToWideChar(code_page, 0, &str[0], (int)str.size(), NULL, 0); if (!size_needed) return std::wstring(); std::wstring wstrTo(size_needed, 0); int res = MultiByteToWideChar(code_page, 0, &str[0], (int)str.size(), &wstrTo[0], size_needed); if (!res) return std::wstring(); return wstrTo; } std::string wchar_to_utf8(const std::wstring& wstr, UINT code_page) { if (wstr.empty()) return std::string(); int size_needed = WideCharToMultiByte(code_page, 0, &wstr[0], (int)wstr.size(), NULL, 0, NULL, NULL); if (!size_needed) return std::string(); std::string strTo(size_needed, 0); int res = WideCharToMultiByte(code_page, 0, &wstr[0], (int)wstr.size(), &strTo[0], size_needed, NULL, NULL); if (!res) return std::string(); return strTo; } static uint16_t g_codepage_437_to_unicode_0_31[32] = { ' ', 0x263A, 0x263B, 0x2665, 0x2666, 0x2663, 0x2660, 0x2022, 0x25D8, 0x25CB, 0x25D9, 0x2642, 0x2640, 0x266A, 0x266B, 0x263C, 0x25BA, 0x25C4, 0x2195, 0x203C, 0x00B6, 0x00A7, 0x25AC, 0x21A8, 0x2191, 0x2193, 0x2192, 0x2190, 0x221F, 0x2194, 0x25B2, 0x25BC }; static uint16_t g_codepage_437_to_unicode_128_255[129] = { 0x2302, 0x00C7, 0x00FC, 0x00E9, 0x00E2, 0x00E4, 0x00E0, 0x00E5, 0x00E7, 0x00EA, 0x00EB, 0x00E8, 0x00EF, 0x00EE, 0x00EC, 0x00C4, 0x00C5, 0x00C9, 0x00E6, 0x00C6, 0x00F4, 0x00F6, 0x00F2, 0x00FB, 0x00F9, 0x00FF, 0x00D6, 0x00DC, 0x00A2, 0x00A3, 0x00A5, 0x20A7, 0x0192, 0x00C9, 0x00E6, 0x00C6, 0x00F4, 0x00F6, 0x00F2, 0x00FB, 0x00F9, 0x00FF, 0x00D6, 0x00DC, 0x00A2, 0x00A3, 0x00A5, 0x20A7, 0x0192, 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556, 0x2555, 0x2563, 0x2551, 0x2557, 0x255D, 0x255C, 0x255B, 0x2510, 0x2514, 0x2534, 0x252C, 0x251C, 0x2500, 0x253C, 0x255E, 0x255F, 0x255A, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256C, 0x2567, 0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256B, 0x256A, 0x2518, 0x250C, 0x2588, 0x2584, 0x258C, 0x2590, 0x2580, 0x03B1, 0x00DF, 0x0393, 0x03C0, 0x03A3, 0x03C3, 0x00B5, 0x03C4, 0x03A6, 0x0398, 0x03A9, 0x03B4, 0x221E, 0x03C6, 0x03B5, 0x2229, 0x2261, 0x00B1, 0x2265, 0x2264, 0x2320, 0x2321, 0x00F7, 0x2248, 0x00B0, 0x2219, 0x00B7, 0x221A, 0x207F, 0x00B2, 0x25A0, 0x00A0 }; // Code page 437 to utf8. WideCharToMultiByte etc. doesn't do the expecting thing for chars<32, and we need them. std::string dos_to_utf8(const std::string& str) { std::wstring wstr; for (uint8_t c : str) { if (c < 32) wstr.push_back(g_codepage_437_to_unicode_0_31[c]); else if (c >= 127) wstr.push_back(g_codepage_437_to_unicode_128_255[c - 127]); else wstr.push_back(c); } return wchar_to_utf8(wstr); } bool vformat(std::vector& buf, const char* pFmt, va_list args) { uint32_t buf_size = 8192; for (; ; ) { buf.resize(buf_size); int res = vsnprintf(&buf[0], buf.size(), pFmt, args); if (res == -1) { assert(false); return false; } if (res <= buf.size() - 1) break; buf_size *= 2; if (buf_size > 16 * 1024 * 1024) { assert(false); return false; } } return true; } void ufprintf(FILE* pFile, const char* pFmt, ...) { std::vector buf; va_list args; va_start(args, pFmt); if (!vformat(buf, pFmt, args)) return; va_end(args); std::wstring wbuf(utf8_to_wchar(std::string(&buf[0]))); // Not thread safe, but we don't care _setmode(_fileno(pFile), _O_U16TEXT); fputws(&wbuf[0], pFile); _setmode(_fileno(pFile), _O_TEXT); } void uprintf(const char* pFmt, ...) { std::vector buf; va_list args; va_start(args, pFmt); if (!vformat(buf, pFmt, args)) return; va_end(args); std::wstring wbuf(utf8_to_wchar(std::string(&buf[0]))); // Not thread safe, but we don't care _setmode(_fileno(stdout), _O_U16TEXT); fputws(&wbuf[0], stdout); _setmode(_fileno(stdout), _O_TEXT); } std::string string_format(const char* pMsg, ...) { std::vector buf; va_list args; va_start(args, pMsg); if (!vformat(buf, pMsg, args)) return ""; va_end(args); std::string res; if (buf.size()) res.assign(&buf[0]); return res; } void panic(const char* pMsg, ...) { char buf[4096]; va_list args; va_start(args, pMsg); vsnprintf(buf, sizeof(buf), pMsg, args); va_end(args); ufprintf(stderr, "%s", buf); exit(EXIT_FAILURE); } FILE* ufopen(const char* pFilename, const char* pMode) { std::wstring wfilename(utf8_to_wchar(pFilename)); std::wstring wmode(utf8_to_wchar(pMode)); if (!wfilename.size() || !wmode.size()) return nullptr; FILE* pRes = nullptr; _wfopen_s(&pRes, &wfilename[0], &wmode[0]); return pRes; } std::string& string_trim(std::string& str) { while (str.size() && isspace((uint8_t)str.back())) str.pop_back(); while (str.size() && isspace((uint8_t)str[0])) str.erase(0, 1); return str; } std::string& string_trim_end(std::string& str) { while (str.size() && isspace((uint8_t)str.back())) str.pop_back(); return str; } // Case sensitive, returns -1 if can't find int string_find_first(const std::string& str, const char* pPhrase) { size_t res = str.find(pPhrase, 0); if (res == std::string::npos) return -1; return (int)res; } // Case insensitive, returns -1 if can't find int string_ifind_first(const std::string& str, const char* pPhrase) { const size_t str_size = str.size(); const size_t phrase_size = strlen(pPhrase); assert((int)str_size == str_size); assert((int)phrase_size == phrase_size); assert(phrase_size); if ((!str_size) || (!phrase_size) || (phrase_size > str_size)) return -1; const size_t end_ofs = str_size - phrase_size; for (size_t ofs = 0; ofs <= end_ofs; ofs++) { assert(ofs + phrase_size <= str_size); if (_strnicmp(str.c_str() + ofs, pPhrase, phrase_size) == 0) return (int)ofs; } return -1; } int string_icompare(const std::string& a, const char* pB) { const size_t a_len = a.size(); const size_t b_len = strlen(pB); const size_t min_len = std::min(a_len, b_len); for (size_t i = 0; i < min_len; i++) { const int ac = (uint8_t)utolower(a[i]); const int bc = (uint8_t)utolower(pB[i]); if (ac != bc) return (ac < bc) ? -1 : 1; } if (a_len == b_len) return 0; return (a_len < b_len) ? -1 : 1; } bool string_begins_with(const std::string& str, const char* pPhrase) { const size_t str_len = str.size(); const size_t phrase_len = strlen(pPhrase); assert(phrase_len); if (str_len >= phrase_len) { if (_strnicmp(pPhrase, str.c_str(), phrase_len) == 0) return true; } return false; } bool string_ends_in(const std::string& str, const char* pPhrase) { const size_t str_len = str.size(); const size_t phrase_len = strlen(pPhrase); assert(phrase_len); if (str_len >= phrase_len) { if (_stricmp(pPhrase, str.c_str() + str_len - phrase_len) == 0) return true; } return false; } std::string encode_url(const std::string& url) { //const char* pValid_chars = ";,/?:@&=+$-_.!~*'()#"; //const size_t valid_chars_len = strlen(pValid_chars); std::string res; for (uint32_t i = 0; i < url.size(); i++) { uint8_t c = (uint8_t)url[i]; //const bool is_digit = (c >= 0) && (c <= '9'); //const bool is_upper = (c >= 'A') && (c <= 'Z'); //const bool is_lower = (c >= 'a') && (c <= 'z'); // Escape some problematic charactes that confuse some Markdown parsers (even after using Markdown '\' escapes) if ((c == ')') || (c == '(') || (c == '_') || (c == '*')) { res.push_back('%'); res.push_back(to_hex(c / 16)); res.push_back(to_hex(c % 16)); continue; } res.push_back(c); } return res; } // TODO uint32_t crc32(const uint8_t* pBuf, size_t size, uint32_t init_crc) { uint32_t crc = ~init_crc; for (size_t i = 0; i < size; i++) { const uint32_t byte = pBuf[i]; crc = crc ^ byte; for (int j = 7; j >= 0; j--) { uint32_t mask = -((int)(crc & 1)); crc = (crc >> 1) ^ (0xEDB88320 & mask); } } return ~crc; } uint32_t hash_hsieh(const uint8_t* pBuf, size_t len) { if (!pBuf || !len) return 0; uint32_t h = static_cast(len); const uint32_t bytes_left = len & 3; len >>= 2; while (len--) { const uint16_t* pWords = reinterpret_cast(pBuf); h += pWords[0]; const uint32_t t = (pWords[1] << 11) ^ h; h = (h << 16) ^ t; pBuf += sizeof(uint32_t); h += h >> 11; } switch (bytes_left) { case 1: h += *reinterpret_cast(pBuf); h ^= h << 10; h += h >> 1; break; case 2: h += *reinterpret_cast(pBuf); h ^= h << 11; h += h >> 17; break; case 3: h += *reinterpret_cast(pBuf); h ^= h << 16; h ^= (static_cast(pBuf[sizeof(uint16_t)])) << 18; h += h >> 11; break; default: break; } h ^= h << 3; h += h >> 5; h ^= h << 4; h += h >> 17; h ^= h << 25; h += h >> 6; return h; } bool read_binary_file(const char* pFilename, uint8_vec& buf) { const uint64_t MAX_BINARY_FILE_LEN = 168ULL * 1024ULL * (1024ULL * 1024ULL); FILE* pFile = ufopen(pFilename, "rb"); if (!pFile) return false; _fseeki64(pFile, 0, SEEK_END); int64_t len = _ftelli64(pFile); if (len < 0) { fclose(pFile); return false; } _fseeki64(pFile, 0, SEEK_SET); if (len > MAX_BINARY_FILE_LEN) return false; buf.resize(len); if (fread(&buf[0], len, 1, pFile) != 1) { fclose(pFile); return false; } fclose(pFile); return true; } bool read_text_file(const char* pFilename, string_vec& lines, bool trim_lines, bool* pUTF8_flag) { FILE* pFile = ufopen(pFilename, "r"); if (!pFile) return false; bool first_line = true; if (pUTF8_flag) *pUTF8_flag = false; while (!feof(pFile)) { char buf[16384]; char* p = fgets(buf, sizeof(buf), pFile); if (!p) { if (feof(pFile)) break; fclose(pFile); return false; } std::string str(p); if (first_line) { first_line = false; if ((str.size() >= 3) && ((uint8_t)str[0] == UTF8_BOM0) && ((uint8_t)str[1] == UTF8_BOM1) && ((uint8_t)str[2] == UTF8_BOM2)) { if (pUTF8_flag) *pUTF8_flag = true; str.erase(0, 3); } } while (str.size() && ((str.back() == '\n') || (str.back() == '\r'))) str.pop_back(); if (trim_lines) string_trim_end(str); lines.push_back(str); } fclose(pFile); return true; } bool read_text_file(const char* pFilename, std::vector& buf, bool *pUTF8_flag) { if (pUTF8_flag) *pUTF8_flag = false; FILE* pFile = ufopen(pFilename, "rb"); if (!pFile) { ufprintf(stderr, "Failed reading file %s!\n", pFilename); return false; } fseek(pFile, 0, SEEK_END); uint64_t filesize = _ftelli64(pFile); fseek(pFile, 0, SEEK_SET); buf.resize(filesize + 1); fread(&buf[0], 1, filesize, pFile); fclose(pFile); if ((buf.size() >= 3) && ((uint8_t)buf[0] == UTF8_BOM0) && ((uint8_t)buf[1] == UTF8_BOM1) && ((uint8_t)buf[2] == UTF8_BOM2)) { if (pUTF8_flag) *pUTF8_flag = true; buf.erase(buf.begin(), buf.begin() + 3); } return true; } bool write_text_file(const char* pFilename, const string_vec& lines, bool utf8_bom) { FILE* pFile = ufopen(pFilename, "wb"); if (!pFile) return false; if (utf8_bom) { if ((fputc(UTF8_BOM0, pFile) == EOF) || (fputc(UTF8_BOM1, pFile) == EOF) || (fputc(UTF8_BOM2, pFile) == EOF)) { fclose(pFile); return false; } } for (uint32_t i = 0; i < lines.size(); i++) { if (lines[i].size()) { if (fwrite(lines[i].c_str(), lines[i].size(), 1, pFile) != 1) { fclose(pFile); return false; } } if (fwrite("\r\n", 2, 1, pFile) != 1) { fclose(pFile); return false; } } if (fclose(pFile) == EOF) return false; return true; } bool serialize_to_json_file(const char* pFilename, const json& j, bool utf8_bom) { FILE* pFile = ufopen(pFilename, "wb"); if (!pFile) return false; if (utf8_bom) { if ((fputc(UTF8_BOM0, pFile) == EOF) || (fputc(UTF8_BOM1, pFile) == EOF) || (fputc(UTF8_BOM2, pFile) == EOF)) { fclose(pFile); return false; } } std::string d(j.dump(2)); if (d.size()) { if (fwrite(&d[0], d.size(), 1, pFile) != 1) { fclose(pFile); return false; } } fclose(pFile); return true; } // Note: This doesn't actually handle utf8. It assumes ANSI (code page 252) text input. static std::string extract_column_text(const std::string& str, uint32_t ofs, uint32_t len) { if (ofs >= str.size()) return ""; const uint32_t max_len = std::min((uint32_t)str.size() - ofs, len); std::string res(str); if (ofs) res.erase(0, ofs); if (max_len < res.size()) res.erase(max_len, res.size()); string_trim(res); return res; } // Note: This doesn't actually handle utf8. It assumes ANSI (code page 252) text input. bool load_column_text(const char* pFilename, std::vector& rows, std::string& title, string_vec& col_titles, bool empty_line_seps, const char* pExtra_col_text) { string_vec lines; bool utf8_flag = false; if (!read_text_file(pFilename, lines, true, &utf8_flag)) panic("Failed reading text file %s", pFilename); if (utf8_flag) panic("load_column_text() doesn't support utf8 yet"); if (!lines.size() || !lines[0].size()) panic("Expected title"); if (lines.size() < 5) panic("File too small"); for (uint32_t i = 0; i < lines.size(); i++) { if (lines[i].find_first_of(9) != std::string::npos) panic("Tab in file"); string_trim(lines[i]); } title = lines[0]; if (lines[1].size()) panic("Expected empty line"); std::string col_line = lines[2]; std::string col_seps = lines[3]; if ((!col_seps.size()) || (col_seps[0] != '-') || (col_seps.back() != '-')) panic("Invalid column seperator line"); for (uint32_t i = 0; i < col_seps.size(); i++) { const uint8_t c = col_seps[i]; if ((c != ' ') && (c != '-')) panic("Invalid column separator line"); } int col_sep_start = 0; std::vector< std::pair > column_info; // start offset and len of each column in chars for (uint32_t i = 1; i < col_seps.size(); i++) { const uint8_t c = col_seps[i]; if (c == ' ') { if (col_sep_start != -1) { column_info.push_back(std::make_pair(col_sep_start, i - col_sep_start)); col_sep_start = -1; } } else { if (col_sep_start == -1) col_sep_start = i; } } if (col_sep_start != -1) { column_info.push_back(std::make_pair(col_sep_start, (uint32_t)col_seps.size() - col_sep_start)); col_sep_start = -1; } if (!column_info.size()) panic("No columns found"); col_titles.resize(column_info.size()); for (uint32_t i = 0; i < column_info.size(); i++) { col_titles[i] = col_line; if (column_info[i].first) col_titles[i].erase(0, column_info[i].first); if (column_info[i].second > col_titles[i].size()) panic("invalid columns"); col_titles[i].erase(column_info[i].second, col_titles[i].size() - column_info[i].second); string_trim(col_titles[i]); } for (uint32_t i = 0; i < column_info.size() - 1; i++) column_info[i].second = column_info[i + 1].first - column_info[i].first; column_info.back().second = 32000; uint32_t cur_line = 4; uint32_t cur_record_index = 0; while (cur_line < lines.size()) { string_vec rec_lines; rec_lines.push_back(lines[cur_line++]); if (empty_line_seps) { while (cur_line < lines.size()) { if (!lines[cur_line].size()) break; rec_lines.push_back(lines[cur_line++]); } // cur_line should be blank, or we're at the end of the file if (cur_line < lines.size()) { cur_line++; if (cur_line < lines.size()) { if (!lines[cur_line].size()) panic("Expected non-empty line"); } } } //uprintf("%u:\n", cur_record_index); //for (uint32_t i = 0; i < rec_lines.size(); i++) // uprintf("%s\n", rec_lines[i].c_str()); string_vec col_lines(column_info.size()); for (uint32_t col_index = 0; col_index < column_info.size(); col_index++) { for (uint32_t l = 0; l < rec_lines.size(); l++) { std::string col_text(extract_column_text(rec_lines[l], column_info[col_index].first, column_info[col_index].second)); if (col_text.size()) { if (col_lines[col_index].size()) { if ((col_lines[col_index].back() != '-') && ((uint8_t)col_lines[col_index].back() != ANSI_SOFT_HYPHEN)) col_lines[col_index].push_back(' '); else { if ((col_lines[col_index].size() >= 2) && (!isdigit((uint8_t)col_lines[col_index][col_lines[col_index].size() - 2]))) col_lines[col_index].pop_back(); } } col_lines[col_index] += col_text; } } } if (pExtra_col_text) col_lines.push_back(pExtra_col_text); // Convert from ANSI (code page 252) to UTF8. for (auto& l : col_lines) l = ansi_to_utf8(l); rows.push_back(col_lines); cur_record_index++; } return true; } bool invoke_curl(const std::string& args, string_vec& reply) { reply.clear(); remove("__temp.html"); // Invoke curl.exe std::string cmd(string_format("curl.exe \"%s\" -o __temp.html", args.c_str())); uprintf("Command: %s\n", cmd.c_str()); int status = system(cmd.c_str()); uprintf("curl returned status %i\n", status); if (status != EXIT_SUCCESS) return false; // Read output file. FILE* pFile = ufopen("__temp.html", "rb"); if (!pFile) { Sleep(50); pFile = ufopen("__temp.html", "rb"); if (!pFile) return false; } uint8_t buf[6] = { 0,0,0,0,0,0 }; fread(buf, 5, 1, pFile); fclose(pFile); // Try to detect some common binary file types // PDF if (memcmp(buf, "%PDF-", 5) == 0) { uprintf("PDF file detected\n"); std::string filename(args); for (size_t i = filename.size() - 1; i >= 0; i--) { if (filename[i] == '/') { filename.erase(0, i + 1); break; } } std::string new_link_deescaped; for (uint32_t i = 0; i < filename.size(); i++) { uint8_t c = filename[i]; if ((c == '%') && ((i + 2) < filename.size())) { int da = convert_hex_digit(filename[i + 1]); int db = convert_hex_digit(filename[i + 2]); if (da >= 0 && db >= 0) { int val = da * 16 + db; new_link_deescaped.push_back((uint8_t)val); } i += 2; } else new_link_deescaped.push_back(c); } rename("__temp.html", new_link_deescaped.c_str()); uprintf("Renamed __temp.html to %s\n", new_link_deescaped.c_str()); return true; } // JPEG if (memcmp(buf, "\xFF\xD8\xFF\xE0", 4) == 0) { uprintf("JPEG file detected\n"); return true; } if (!read_text_file("__temp.html", reply, true, nullptr)) { // Wait a bit and try again, rarely needed under Windows. Sleep(50); if (!read_text_file("__temp.html", reply, true, nullptr)) return false; } return true; } void convert_args_to_utf8(string_vec& args, int argc, wchar_t* argv[]) { args.resize(argc); for (int i = 0; i < argc; i++) { args[i] = wchar_to_utf8(argv[i]); if (args[i].size() >= 2) { if ((args[i][0] == '\"') && (args[i].back() == '\"')) { args[i].pop_back(); args[i].erase(0, 1); } } } } std::string string_slice(const std::string& str, size_t ofs, size_t len) { if (!len) return ""; if (ofs > str.size()) { assert(0); return ""; } const size_t max_len = str.size() - ofs; len = std::min(len, max_len); std::string res(str); if (ofs) res.erase(0, ofs); if (len) res.resize(len); return res; } bool invoke_openai(const std::string& prompt, std::string& reply) { reply.clear(); // Write prompt to i.txt FILE* pFile = ufopen("i.txt", "wb"); fwrite(prompt.c_str(), prompt.size(), 1, pFile); fclose(pFile); // Invoke openai.exe int status = system("openai.exe i.txt o.txt"); if (status != EXIT_SUCCESS) return false; // Read output file. string_vec lines; if (!read_text_file("o.txt", lines, true, nullptr)) { // Wait a bit and try again, rarely needed under Windows. Sleep(50); if (!read_text_file("o.txt", lines, true, nullptr)) return false; } // Skip any blank lines at the beginning of the reply. uint32_t i; for (i = 0; i < lines.size(); i++) { std::string s(lines[i]); string_trim(s); if (s.size()) break; } for (; i < lines.size(); i++) reply += lines[i]; return true; } bool invoke_openai(const string_vec &prompt, string_vec &reply) { reply.clear(); if (!write_text_file("i.txt", prompt, true)) return false; // Invoke openai.exe const uint32_t MAX_TRIES = 3; uint32_t num_tries; for (num_tries = 0; num_tries < MAX_TRIES; ++num_tries) { if (num_tries) uprintf("openai.exe failed - retrying\n"); int status = system("openai.exe i.txt o.txt"); if (status == EXIT_SUCCESS) break; Sleep(2000); } if (num_tries == MAX_TRIES) return false; // Read output file. if (!read_text_file("o.txt", reply, true, nullptr)) { // Wait a bit and try again, rarely needed under Windows. Sleep(50); if (!read_text_file("o.txt", reply, true, nullptr)) return false; } return true; } std::string get_deg_to_dms(double deg) { deg = std::round(fabs(deg) * 3600.0f); int min_secs = (int)fmod(deg, 3600.0f); deg = std::floor((deg - (double)min_secs) / 3600.0f); int minutes = min_secs / 60; int secs = min_secs % 60; return string_format("%02i%:%02i:%02i", (int)deg, minutes, secs); } bool load_json_object(const char* pFilename, bool& utf8_flag, json &result_obj) { std::vector buf; if (!read_text_file(pFilename, buf, &utf8_flag)) return false; if (!buf.size()) return false; bool success = false; try { result_obj = json::parse(buf.begin(), buf.end()); success = true; } catch (json::exception& e) { fprintf(stderr, "load_json_object: Parse of file \"%s\" failed (id %i): %s", pFilename, e.id, e.what()); return false; } if (!result_obj.is_object() && !result_obj.is_array()) return false; return true; } void string_tokenize( const std::string &str, const std::string &whitespace, const std::string &break_chars, string_vec &tokens, uint_vec *pOffsets_vec) { tokens.resize(0); if (pOffsets_vec) pOffsets_vec->resize(0); std::string cur_token; uint32_t cur_ofs = 0; for (uint32_t i = 0; i < str.size(); i++) { uint8_t c = str[i]; if (whitespace.find_first_of(c) != std::string::npos) { if (cur_token.size()) { tokens.push_back(cur_token); if (pOffsets_vec) pOffsets_vec->push_back(cur_ofs); cur_token.clear(); } } else if (break_chars.find_first_of(c) != std::string::npos) { if (cur_token.size()) { tokens.push_back(cur_token); if (pOffsets_vec) pOffsets_vec->push_back(cur_ofs); cur_token.clear(); } std::string s; s.push_back(c); tokens.push_back(s); if (pOffsets_vec) pOffsets_vec->push_back(i); } else { if (!cur_token.size()) cur_ofs = i; cur_token.push_back(c); } } if (cur_token.size()) { tokens.push_back(cur_token); if (pOffsets_vec) pOffsets_vec->push_back(cur_ofs); } } const double PI = 3.141592653589793238463; double deg2rad(double deg) { return (deg * PI / 180.0); } double rad2deg(double rad) { return (rad * 180.0 / PI); } // input in degrees double geo_distance(double lat1, double lon1, double lat2, double lon2, int unit) { if ((lat1 == lat2) && (lon1 == lon2)) return 0; double theta = lon1 - lon2; double dist = cos(deg2rad(lat1)) * cos(deg2rad(lat2)) * cos(deg2rad(theta)) + sin(deg2rad(lat1)) * sin(deg2rad(lat2)); dist = acos(dist); dist = rad2deg(dist); dist = dist * 60 * 1.1515; switch (unit) { case 'M': break; case 'K': dist = dist * 1.609344; break; case 'N': dist = dist * 0.8684; break; default: assert(0); break; } return (dist); } std::string remove_bom(std::string str) { if (str.size() >= 3) { if (((uint8_t)str[0] == UTF8_BOM0) && ((uint8_t)str[1] == UTF8_BOM1) && ((uint8_t)str[2] == UTF8_BOM2)) { str.erase(0, 3); } } return str; } int get_next_utf8_code_point_len(const uint8_t* pStr) { if (pStr == nullptr || *pStr == 0) { // Return 0 if the input is null or points to a null terminator return 0; } const uint8_t firstByte = *pStr; if ((firstByte & 0x80) == 0) { // Starts with 0, ASCII character return 1; } else if ((firstByte & 0xE0) == 0xC0) { // Starts with 110 return 2; } else if ((firstByte & 0xF0) == 0xE0) { // Starts with 1110 return 3; } else if ((firstByte & 0xF8) == 0xF0) { // Starts with 11110 return 4; } else { // Invalid UTF-8 byte sequence return -1; } } void get_string_words( const std::string& str, string_vec& words, uint_vec* pOffsets_vec, const char* pAdditional_whitespace) { const uint8_t* pStr = (const uint8_t *)str.c_str(); words.resize(0); if (pOffsets_vec) pOffsets_vec->resize(0); std::string cur_token; std::string whitespace(" \t\n\r,;:.!?()[]*/\""); if (pAdditional_whitespace) whitespace += std::string(pAdditional_whitespace); int word_start_ofs = -1; uint32_t cur_ofs = 0; while ((cur_ofs < str.size()) && (pStr[cur_ofs])) { int l = get_next_utf8_code_point_len(pStr + cur_ofs); const uint8_t c = pStr[cur_ofs]; if (l <= 0) { assert(0); l = 1; } bool is_whitespace = (whitespace.find_first_of(c) != std::string::npos); if ((l == 2) && (c == 0xc2)) { // NO-BREAK SPACE if (pStr[cur_ofs + 1] == 0xa0) is_whitespace = true; } if ((l == 2) && (c == 0xCA)) { // single left quote if (pStr[cur_ofs + 1] == 0xBB) is_whitespace = true; } if ((l == 3) && (c == 0xE2) && (pStr[cur_ofs + 1] == 0x80)) { // dash if (pStr[cur_ofs + 2] == 0x93) is_whitespace = true; // dash else if (pStr[cur_ofs + 2] == 0x94) is_whitespace = true; // left quote else if (pStr[cur_ofs + 2] == 0x9C) is_whitespace = true; // right quote else if (pStr[cur_ofs + 2] == 0x9D) is_whitespace = true; // ellipsis (three dots) else if (pStr[cur_ofs + 2] == 0xA) is_whitespace = true; // ellipsis (three dots) else if (pStr[cur_ofs + 2] == 0xA6) is_whitespace = true; // long dash else if (pStr[cur_ofs + 2] == 9) is_whitespace = true; // left single quote else if (pStr[cur_ofs + 2] == 0x98) is_whitespace = true; // right single quote else if (pStr[cur_ofs + 2] == 0x99) is_whitespace = true; // right double quote else if (pStr[cur_ofs + 2] == 0x9D) is_whitespace = true; } if (is_whitespace) { if (cur_token.size()) { words.push_back(cur_token); if (pOffsets_vec) pOffsets_vec->push_back(word_start_ofs); cur_token.clear(); word_start_ofs = -1; } } else { if (word_start_ofs < 0) word_start_ofs = cur_ofs; if (l == 1) { cur_token.push_back(utolower(c)); } else { for (int i = 0; i < l; i++) cur_token.push_back(pStr[cur_ofs + i]); } } cur_ofs += l; } if (cur_token.size()) { words.push_back(cur_token); if (pOffsets_vec) pOffsets_vec->push_back(word_start_ofs); } } void get_utf8_code_point_offsets(const char* pStr, int_vec& offsets) { uint32_t cur_ofs = 0; offsets.resize(0); while (pStr[cur_ofs]) { offsets.push_back(cur_ofs); cur_ofs += std::max(1, get_next_utf8_code_point_len((const uint8_t*)pStr + cur_ofs)); } } struct char_map { const char32_t* m_pFrom; const char m_to; }; static const char_map g_char_norm_up[] = { { U"ÁĂẮẶẰẲẴǍÂẤẬẦẨẪÄǞȦǠẠȀÀẢȂĀĄÅǺḀÃǼǢȺΆ", 'A' }, { U"ḂḄḆƁƂƄ", 'B' }, { U"ĆČÇḈĈĊƇȻƆ", 'C' }, { U"ĎḐḒḊḌḎĐƉƊƋǱǲǄ", 'D' }, { U"ÉĔĚȨḜÊẾỆỀỂỄḘËĖẸȄÈẺȆĒḖḔĘẼḚÈÊËĒĔĖĘĚƐƎƏȄȆȨΈΉΕƐƐ", 'E' }, { U"ḞƑ", 'F' }, { U"ǴĞǦĢĜĠḠĜĞĠĢƓǤǦǴƔ", 'G' }, { U"ḪȞḨĤḦḢḤĤĦǶȞΗǶ", 'H' }, { U"ÍĬǏÎÏḮİỊȈÌỈȊĪĮĨḬÌÍÎÏĨĪĬĮİƗǏȈȊ", 'I' }, { U"ĴĴ", 'J' }, { U"ḰǨĶḲḴĶƘǨΚ", 'K' }, { U"ĹĽĻḼḶḸḺĹĻĽĿŁΛ", 'L' }, { U"ḾṀṂƜ", 'M' }, { U"ŃŇŅṊṄṆǸṈÑÑŃŅŇŊƝǸΝ", 'N' }, { U"ÓŎǑÔỐỘỒỔỖÖȪȮȰỌŐȌÒỎƠỚỢỜỞỠȎŌṒṐǪǬÕṌṎȬǾØÒÓÔÕÖØŌŎŐƟƠǑǪǬǾȌȎȪȬȮȰΌΟΩ", 'O' }, { U"ṔṖΠΡΦ", 'P' }, { U"ŔŘŖṘṚṜȐȒṞŔŖŘƦȐȒ", 'R' }, { U"ŚṤŠṦŞŜȘṠṢṨßŚŜŞŠƩȘΣ", 'S' }, { U"ŤŢṰȚṪṬṮŢŤŦƬƮȚΤ", 'T' }, { U"ÚŬǓÛṶÜǗǙǛǕṲỤŰȔÙỦƯỨỰỪỬỮȖŪṺŲŮŨṸṴÙÚÛÜŨŪŬŮŰŲƯǓǕǗǙǛȔȖ", 'U' }, { U"ṾṼƲ", 'V' }, { U"ẂŴẄẆẈẀŴ", 'W' }, { U"ẌẊΧΞ", 'X' }, { U"ÝŶŸẎỴỲỶȲỸÝŶŸƳȲΥΎΫ", 'Y' }, { U"ŹŽẐŻẒẔŹŻŽƵƷǮȤΖ", 'Z' }, }; static const char_map g_char_norm_lower[] = { { U"áăắặằẳẵǎâấậầẩẫäǟȧǡạȁàảȃāąåǻḁãǽǣⱥάàáâãäåāăąǎǟǡǻȁȃȧάα", 'a' }, { U"ḃḅḇɓƃƅƀƃβƀƃƅ", 'b' }, { U"ćčçḉĉċƈȼɔƈçćĉċčƈȼ", 'c' }, { U"ďḑḓḋḍḏđɖɗƌǳǳǆƌďđƌǳǆȡďđƌǳǆȡ", 'd' }, { U"éĕěȩḝêếệềểễḙëėẹȅèẻȇēḗḕęẽḛèêëēĕėęěɛǝəȅȇȩέήεɛɛèéêëēĕėęěȅȇȩε", 'e' }, { U"ḟƒ", 'f' }, { U"ǵğǧģĝġḡĝğġģɠǥǧǵɣĝğġģǧǵ", 'g' }, { U"ḫȟḩĥḧḣḥẖĥħƕƕȟƕĥħȟ", 'h' }, { U"íĭǐîïḯiịȉìỉȋīįĩḭìíîïĩīĭįiɨǐȉȋìíîïĩīĭįǐȉȋι", 'i' }, { U"ǰĵĵǰĵǰ", 'j' }, { U"ḱǩķḳḵķƙǩκƙķƙǩκ", 'k' }, { U"ĺľļḽḷḹḻĺļľŀłƚƛλƚĺļľŀłƚλƚ", 'l' }, { U"ḿṁṃɯ", 'm' }, { U"ńňņṋṅṇǹṉññńņňŋɲǹνƞñńņňŉŋƞǹη", 'n' }, { U"óŏǒôốộồổỗöȫȯȱọőȍòỏơớợờởỡȏōṓṑǫǭõṍṏȭǿøòóôõöøōŏőɵơǒǫǭǿȍȏȫȭȯȱόοòóôõöøōŏőơǒǫǭǿȍȏȫȭȯȱοσ", 'o' }, { U"ṕṗπφƥ", 'p' }, { U"ŕřŗṙṛṝȑȓṟŕŗřʀȑȓρŕŗřȑȓρ", 'r' }, { U"śṥšṧşŝșṡẛṣṩśŝşšʃșƨśŝşšșƨȿ", 's' }, { U"ťţṱțẗṫṭṯţťŧƭʈțτƫţťŧƭțτ", 't' }, { U"úŭǔûṷüǘǚǜǖṳụűȕùủưứựừửữȗūṻųůũṹṵùúûüũūŭůűųưǔǖǘǚǜȕȗưùúûüũūŭůűųưǔǖǘǚǜȕȗμ", 'u' }, { U"ṿṽʋ", 'v' }, { U"ẃŵẅẇẉẁẘŵŵω", 'w' }, { U"ẍẋχξχξ", 'x' }, { U"ýŷÿẏỵỳỷȳẙỹýŷÿƴȳυύϋƴýÿŷƴȳγψ", 'y' }, { U"źžẑżẓẕźżžƶʒǯȥζƶźżžƶƹȥζ", 'z' }, }; std::map g_upper_trans; std::map g_lower_trans; static const char* g_stop_words[] = { "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "can", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "her", "here", "hers", "herself", "him", "himself", "his", "how", "i", "if", "in", "into", "is", "it", "its", "itself", "just", "me", "more", "most", "my", "myself", "no", "nor", "not", "now", "of", "off", "on", "once", "only", "or", "other", "our", "ours", "ourselves", "out", "over", "own", "re", "same", "she", "should", "so", "some", "such", "than", "that", "the", "their", "theirs", "them", "themselves", "then", "there", "these", "they", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "were", "what", "when", "where", "which", "while", "who", "whom", "why", "will", "with", "you", "your", "yours", "yourself", "yourselves", "although", "also", "already", "another", "seemed", "seem", "seems" }; static const uint32_t NUM_STOP_WORDS = (uint32_t)std::size(g_stop_words); std::set g_stop_words_set; void init_norm() { g_stop_words_set.clear(); for (const auto& str : g_stop_words) g_stop_words_set.insert(str); for (uint32_t i = 0; i < std::size(g_char_norm_up); i++) { const char32_t* pFrom = g_char_norm_up[i].m_pFrom; char to_char = g_char_norm_up[i].m_to; while (*pFrom) { char32_t fc = *pFrom++; auto f = g_upper_trans.find(fc); if (f != g_upper_trans.end()) { if (f->second != to_char) { uprintf("Upper char %u 0x%x is redundant\n", fc, fc); exit(1); } } g_upper_trans[fc] = to_char; } } for (uint32_t i = 0; i < std::size(g_char_norm_lower); i++) { const char32_t* pFrom = g_char_norm_lower[i].m_pFrom; char to_char = g_char_norm_lower[i].m_to; while (*pFrom) { char32_t fc = *pFrom++; auto f = g_upper_trans.find(fc); if (f != g_upper_trans.end()) { uprintf("Lower char %u 0x%x is in the upper table\n", fc, fc); if (utolower((uint8_t)f->second) != to_char) uprintf("Conversion mismatch %u 0x%x\n", fc, fc); //exit(1); } f = g_lower_trans.find(fc); if (f != g_lower_trans.end()) { if (f->second != to_char) { uprintf("Lower char %u 0x%x is redundant\n", fc, fc); exit(1); } } g_lower_trans[fc] = to_char; } } } // Resulting characters are guaranteed to be <128 - useful for searching purposes. // Unrecognized Unicode characters are deleted. void normalize_diacritics(const char* pStr, std::string& res) { assert(g_stop_words_set.size()); res.resize(0); while (*pStr) { int l = get_next_utf8_code_point_len((const uint8_t*)pStr); const uint8_t c = *pStr; utf8_int32_t cp; char* pStr_next = utf8codepoint(pStr, &cp); assert((pStr_next - pStr) == l); if (cp < 128) { res.push_back((char)cp); pStr = pStr_next; continue; } int new_char = -1; auto u_it = g_upper_trans.find(cp); auto l_it = g_lower_trans.find(cp); if (u_it != g_upper_trans.end()) new_char = u_it->second; else if (l_it != g_lower_trans.end()) new_char = l_it->second; else { // FIXME: this is lame, it parses the utf8 directly. if ((l == 2) && (c == 0xc2)) { // NO-BREAK SPACE if ((uint8_t)pStr[1] == 0xa0) new_char = ' '; } if ((l == 2) && (c == 0xCA)) { // single left quote if ((uint8_t)pStr[1] == 0xBB) new_char = '\''; } if ((l == 3) && (c == 0xE2) && ((uint8_t)pStr[1] == 0x80)) { // dash if ((uint8_t)pStr[2] == 0x93) new_char = '-'; // dash else if ((uint8_t)pStr[2] == 0x94) new_char = '-'; // left quote else if ((uint8_t)pStr[2] == 0x9C) new_char = '"'; // right quote else if ((uint8_t)pStr[2] == 0x9D) new_char = '"'; // ellipsis (three dots) else if ((uint8_t)pStr[2] == 0xA) new_char = '.'; // ellipsis (three dots) else if ((uint8_t)pStr[2] == 0xA6) new_char = '.'; // long dash else if ((uint8_t)pStr[2] == 9) new_char = '-'; // left single quote else if ((uint8_t)pStr[2] == 0x98) new_char = '\''; // right single quote else if ((uint8_t)pStr[2] == 0x99) new_char = '\''; // right double quote else if ((uint8_t)pStr[2] == 0x9D) new_char = '"'; } } // TODO: Do something smarter? if (new_char != -1) res.push_back((char)new_char); pStr = pStr_next; } } std::string normalize_word(const std::string& str) { assert(g_stop_words_set.size()); const uint32_t MAX_STRING_SIZE = 4096; if (str.size() > MAX_STRING_SIZE) panic("String too long"); char buf[MAX_STRING_SIZE + 1]; strcpy_s(buf, sizeof(buf), str.c_str()); // Convert utf8 string to lower utf8lwr(buf); // Remove diacritics and some specials from utf8, this preserves all 1-127 chars std::string norm; norm.reserve(strlen(buf)); normalize_diacritics(buf, norm); // Remove any non-letter or non-digit characters (we assume this is a word, so whitespace gets removed too) std::string temp; temp.reserve(norm.size()); for (uint32_t i = 0; i < norm.size(); i++) { uint8_t c = norm[i]; c = utolower(c); if (uislower(c) || uisdigit(c)) temp.push_back(c); } // Stem word strcpy_s(buf, sizeof(buf), temp.c_str()); if (buf[0]) { int32_t new_len = stem(buf, 0, (int)strlen(buf) - 1); buf[new_len + 1] = '\0'; } return buf; } // Assumes word is plain ASCII lowercase bool is_stop_word(const std::string &word) { assert(g_stop_words_set.size()); return g_stop_words_set.count(word) != 0; } std::string ustrlwr(const std::string& s) { const size_t l = s.size(); std::vector temp; temp.resize(l + 1); memcpy(&temp[0], s.c_str(), l); temp[l] = '\0'; utf8lwr((char *)&temp[0]); return (char *)&temp[0]; } std::string string_replace(const std::string& str, const std::string& find, const std::string& repl) { assert(find.size()); if (!find.size() || !str.size()) return str; const uint8_t* pStr = (const uint8_t *)str.c_str(); const size_t str_size = str.size(); const uint8_t* pFind = (const uint8_t*)find.c_str(); const size_t find_size = find.size(); std::string res; res.reserve(str.size()); size_t str_ofs = 0; while (str_ofs < str.size()) { int str_char_size = get_next_utf8_code_point_len(pStr + str_ofs); if (str_char_size < 0) { assert(0); str_char_size = 1; } const size_t str_remaining = str_size - str_ofs; if ((str_remaining >= find_size) && (memcmp(pStr + str_ofs, pFind, find_size) == 0)) { res += repl; str_ofs += find_size; } else { for (int i = 0; i < str_char_size; i++) res.push_back((char)pStr[str_ofs + i]); str_ofs += str_char_size; } } return res; } bool does_file_exist(const char* pFilename) { FILE* pFile = ufopen(pFilename, "rb"); if (!pFile) return false; fclose(pFile); return true; }