updated files

2025-02-18 13:54:11 -05:00 · 2023-10-05 14:07:39 -04:00 · 2023-10-05 14:07:39 -04:00 · a70cef4dde
commit a70cef4dde
parent 43d0d882ef
12 changed files with 4072 additions and 834 deletions
--- a/converters.cpp
+++ b/converters.cpp
--- a/converters.h
+++ b/converters.h
@ -5,11 +5,14 @@

 void converters_init();

-bool convert_magnonia(const char* pSrc_filename, const char* pDst_filename, const char* pSource_override = nullptr, const char* pRef_override = nullptr);
+bool convert_magonia(const char* pSrc_filename, const char* pDst_filename, const char* pSource_override = nullptr, const char* pRef_override = nullptr, uint32_t TOTAL_COLS = 15, const char *pType_override = nullptr, bool parens_flag = true, uint32_t first_rec_index = 1);
+bool convert_dolan(const char* pSrc_filename, const char* pDst_filename, const char* pSource, const char* pType, const char* pRef);
 bool convert_bluebook_unknowns();
 bool convert_hall();
 bool convert_eberhart(unordered_string_set& unique_urls);
 bool convert_johnson();
 bool convert_nicap(unordered_string_set& unique_urls);
 bool convert_nuk();
-bool convert_anon();
+bool convert_anon();
+bool convert_rr0();
+bool convert_overmeire();
--- a/markdown_proc.cpp
+++ b/markdown_proc.cpp
@ -2,6 +2,435 @@
 // markdown_proc.cpp
 #include "markdown_proc.h"

+// Renderer callbacks for libsoldout's markdown() parser. Instead of producing HTML, each
+// callback serializes what it was handed into a compact tagged byte stream:
+//
+//   text/paragraph/HTML : cCodeSig, code, len32, payload
+//   emphasis            : cCodeSig, cCodeEmphasis, marker char, level (1..3), len32, payload
+//   link                : cCodeSig, cCodeLink, len32(url), len32(content), url, content
+//   linebreak           : cCodeSig, cCodeLinebreak
+//
+// All lengths are 32-bit little-endian (see writelen()); get_len32()/get_string() are the
+// matching readers.
+//
+// Callbacks for constructs that are only partially supported set *(bool*)opaque to true
+// (when opaque is non-null) so the caller can detect that the input used them.
+struct markdown
+{
+    enum
+    {
+        // Marker byte written in front of every record code.
+        cCodeSig = 0xFE,
+
+        // Record codes (start at 1).
+        cCodeLink = 1,
+        cCodeEmphasis,
+        cCodeText,
+        cCodeParagraph,
+        cCodeLinebreak,
+        cCodeHTML
+    };
+
+    // Appends the contents of in to out. A null or empty in is a no-op.
+    static void bufappend(struct buf* out, struct buf* in)
+    {
+        assert(in != out);
+
+        if (in && in->size)
+            bufput(out, in->data, in->size);
+    }
+
+    // Writes size to ob as 4 bytes, least significant byte first.
+    static void writelen(struct buf* ob, uint32_t size)
+    {
+        bufputc(ob, (uint8_t)(size & 0xFF));
+        bufputc(ob, (uint8_t)((size >> 8) & 0xFF));
+        bufputc(ob, (uint8_t)((size >> 16) & 0xFF));
+        bufputc(ob, (uint8_t)((size >> 24) & 0xFF));
+    }
+
+    // Reads text_size bytes from buf starting at cur_ofs and advances cur_ofs past them.
+    // Panics if the requested range does not lie inside buf.
+    static std::string get_string(const std::string& buf, uint32_t& cur_ofs, uint32_t text_size)
+    {
+        // Compare against the remaining size rather than computing cur_ofs + text_size,
+        // which can wrap around in 32 bits and slip past the check.
+        if ((cur_ofs > buf.size()) || (text_size > (buf.size() - cur_ofs)))
+            panic("Buffer too small");
+
+        std::string text;
+        text.append(buf.c_str() + cur_ofs, text_size);
+        cur_ofs += text_size;
+
+        return text;
+    }
+
+    // Reads a 32-bit little-endian length (as written by writelen()) from buf at ofs and
+    // advances ofs by 4. Panics if fewer than 4 bytes remain.
+    static uint32_t get_len32(const std::string& buf, uint32_t& ofs)
+    {
+        // Wrap-safe form of (ofs + 4) > buf.size().
+        if ((ofs > buf.size()) || ((buf.size() - ofs) < 4))
+            panic("Buffer too small");
+
+        // Widen each byte to uint32_t before shifting: shifting the promoted int left by 24
+        // overflows a signed int whenever the top byte is >= 0x80.
+        const uint32_t l = (uint32_t)(uint8_t)buf[ofs] |
+            ((uint32_t)(uint8_t)buf[ofs + 1] << 8) |
+            ((uint32_t)(uint8_t)buf[ofs + 2] << 16) |
+            ((uint32_t)(uint8_t)buf[ofs + 3] << 24);
+
+        ofs += 4;
+
+        return l;
+    }
+
+    // Records that a not-fully-supported construct was encountered. opaque is expected to
+    // point at a bool when it is non-null.
+    static void flag_unsupported(void* opaque)
+    {
+        if (opaque)
+            *(bool*)opaque = true;
+    }
+
+    // Emits one "cCodeSig, code, len32, payload" record. A null payload is written as a
+    // zero-length record instead of being dereferenced.
+    static void put_record(struct buf* ob, uint8_t code, struct buf* payload)
+    {
+        bufputc(ob, (uint8_t)cCodeSig);
+        bufputc(ob, code);
+        writelen(ob, payload ? (uint32_t)payload->size : 0);
+        bufappend(ob, payload);
+    }
+
+    // Emits one emphasis record: marker character c, nesting level (1, 2 or 3), then the text.
+    // Empty or null text emits nothing. Always returns 1.
+    static int put_emphasis(struct buf* ob, struct buf* text, char c, int level)
+    {
+        if (!text || !text->size)
+            return 1;
+
+        bufputc(ob, (uint8_t)cCodeSig);
+        bufputc(ob, (uint8_t)cCodeEmphasis);
+        bufputc(ob, c);
+        bufputc(ob, level);
+        writelen(ob, (uint32_t)text->size);
+        bufappend(ob, text);
+
+        return 1;
+    }
+
+    // Document prolog: nothing is emitted.
+    static void prolog(struct buf* ob, void* opaque)
+    {
+    }
+
+    // Document epilog: nothing is emitted.
+    static void epilog(struct buf* ob, void* opaque)
+    {
+    }
+
+    /* block level callbacks - NULL skips the block */
+
+    // Code blocks are not supported at all: panics.
+    static void blockcode(struct buf* ob, struct buf* text, void* opaque)
+    {
+#if 0
+        bufprintf(ob, "blockcode: \"%.*s\" ", (int)text->size, text->data);
+#endif
+        panic("unsupported markdown feature");
+    }
+
+    // Block quotes are flagged as unsupported and emitted as a plain paragraph record.
+    static void blockquote(struct buf* ob, struct buf* text, void* opaque)
+    {
+#if 0
+        bufprintf(ob, "blockquote: \"%.*s\" ", (int)text->size, text->data);
+#endif
+        // TODO: unsupported block quotes (here for when we're converting to plain text)
+        //panic("unsupported markdown feature");
+        flag_unsupported(opaque);
+
+        if (!text || !text->size)
+            return;
+
+        put_record(ob, (uint8_t)cCodeParagraph, text);
+    }
+
+    // Block-level HTML is flagged as unsupported and dropped.
+    static void blockhtml(struct buf* ob, struct buf* text, void* opaque)
+    {
+#if 0
+        bufprintf(ob, "blockhtml: \"%.*s\" ", (int)text->size, text->data);
+#endif
+        // TODO: Not fully supported - just dropping it
+        //panic("unsupported markdown feature");
+
+        flag_unsupported(opaque);
+    }
+
+    // Headers are flagged as unsupported and emitted as a paragraph record; level is ignored.
+    // Unlike the other block callbacks an empty header still emits a (zero-length) record.
+    static void header(struct buf* ob, struct buf* text, int level, void* opaque)
+    {
+#if 0
+        bufprintf(ob, "header: %i \"%.*s\" ", level, (int)text->size, text->data);
+#endif
+        // TODO: Not fully supported
+        //panic("unsupported markdown feature");
+        flag_unsupported(opaque);
+
+        put_record(ob, (uint8_t)cCodeParagraph, text);
+    }
+
+    // Horizontal rules are flagged as unsupported and dropped.
+    static void hrule(struct buf* ob, void* opaque)
+    {
+        // TODO
+        //panic("unsupported markdown feature");
+
+        flag_unsupported(opaque);
+    }
+
+    // Lists are flagged as unsupported and emitted as a paragraph record; flags is ignored.
+    static void list(struct buf* ob, struct buf* text, int flags, void* opaque)
+    {
+        // TODO: not fully supporting lists (here for when we're converting to plain text)
+        //panic("unsupported markdown feature");
+        flag_unsupported(opaque);
+
+        if (!text || !text->size)
+            return;
+
+        put_record(ob, (uint8_t)cCodeParagraph, text);
+    }
+
+    // List items are flagged as unsupported and emitted as a paragraph record; flags is ignored.
+    static void listitem(struct buf* ob, struct buf* text, int flags, void* opaque)
+    {
+        // TODO: not fully supporting lists (here for when we're converting to plain text)
+        //panic("unsupported markdown feature");
+        flag_unsupported(opaque);
+
+        if (!text || !text->size)
+            return;
+
+        put_record(ob, (uint8_t)cCodeParagraph, text);
+    }
+
+    // Emits a paragraph record for non-empty text.
+    static void paragraph(struct buf* ob, struct buf* text, void* opaque)
+    {
+#if 0
+        bufprintf(ob, "paragraph: \"%.*s\" ", (int)text->size, text->data);
+#endif
+        if (!text || !text->size)
+            return;
+
+        put_record(ob, (uint8_t)cCodeParagraph, text);
+    }
+
+    // Tables are flagged as unsupported; nothing is emitted here.
+    static void table(struct buf* ob, struct buf* head_row, struct buf* rows, void* opaque)
+    {
+#if 0
+        bufprintf(ob, "table: \"%.*s\" \"%.*s\" ", (int)head_row->size, head_row->data, (int)rows->size, rows->data);
+#endif
+        //panic("unsupported markdown feature");
+
+        // TODO: not fully supported, just for plaintext conversion
+        flag_unsupported(opaque);
+    }
+
+    // Table cells are flagged as unsupported and emitted as a paragraph record; flags is ignored.
+    static void table_cell(struct buf* ob, struct buf* text, int flags, void* opaque)
+    {
+#if 0
+        bufprintf(ob, "table_cell: \"%.*s\" %i ", (int)text->size, text->data, flags);
+#endif
+        //panic("unsupported markdown feature");
+        flag_unsupported(opaque);
+
+        // TODO: not fully supported, just for plaintext conversion
+        if (!text || !text->size)
+            return;
+
+        put_record(ob, (uint8_t)cCodeParagraph, text);
+    }
+
+    // Table rows are flagged as unsupported; nothing is emitted here.
+    static void table_row(struct buf* ob, struct buf* cells, int flags, void* opaque)
+    {
+#if 0
+        bufprintf(ob, "table_row: \"%.*s\" %i ", (int)cells->size, cells->data, flags);
+#endif
+        //panic("unsupported markdown feature");
+        // TODO: not fully supported, just for plaintext conversion
+
+        flag_unsupported(opaque);
+    }
+
+    /* span level callbacks - all return 1 */
+
+    // Autolinks are not supported at all: panics.
+    static int autolink(struct buf* ob, struct buf* link, enum mkd_autolink type, void* opaque)
+    {
+#if 0
+        bufprintf(ob, "autolink: %u \"%.*s\" ", type, (int)link->size, link->data);
+#endif
+        panic("unsupported markdown feature");
+        return 1;
+    }
+
+    // Code spans are flagged as unsupported and emitted as a text record.
+    // A null text (NOTE(review): libsoldout appears to pass null for an empty code span -
+    // confirm) is written as a zero-length record rather than dereferenced.
+    static int codespan(struct buf* ob, struct buf* text, void* opaque)
+    {
+#if 0
+        bufprintf(ob, "codespan: \"%.*s\" ", (int)text->size, text->data);
+#endif
+        //panic("unsupported markdown feature");
+        flag_unsupported(opaque);
+
+        put_record(ob, (uint8_t)cCodeText, text);
+
+        return 1;
+    }
+
+    // Emits a level-2 emphasis record for non-empty text.
+    static int double_emphasis(struct buf* ob, struct buf* text, char c, void* opaque)
+    {
+#if 0
+        bufprintf(ob, "double_emphasis: %u ('%c') [%.*s] ", c, c, (int)text->size, text->data);
+#endif
+        return put_emphasis(ob, text, c, 2);
+    }
+
+    // Emits a level-1 emphasis record for non-empty text.
+    static int emphasis(struct buf* ob, struct buf* text, char c, void* opaque)
+    {
+#if 0
+        bufprintf(ob, "emphasis: %u ('%c') [%.*s] ", c, c, (int)text->size, text->data);
+#endif
+        return put_emphasis(ob, text, c, 1);
+    }
+
+    // Images are flagged as unsupported; only the alt text (when present) is kept, as a
+    // text record. link and title are ignored.
+    static int image(struct buf* ob, struct buf* link, struct buf* title, struct buf* alt, void* opaque)
+    {
+#if 0
+        bufprintf(ob, "image: \"%.*s\" \"%.*s\" \"%.*s\" ",
+            (int)link->size, link->data,
+            (int)title->size, title->data,
+            (int)alt->size, alt->data);
+#endif
+        //panic("unsupported markdown feature");
+        flag_unsupported(opaque);
+
+        if (alt)
+            put_record(ob, (uint8_t)cCodeText, alt);
+
+        return 1;
+    }
+
+    // Emits a linebreak record (no length, no payload).
+    static int linebreak(struct buf* ob, void* opaque)
+    {
+#if 0
+        bufprintf(ob, "linebreak ");
+#endif
+
+        bufputc(ob, (uint8_t)cCodeSig);
+        bufputc(ob, (uint8_t)cCodeLinebreak);
+
+        return 1;
+    }
+
+    // Emits a link record: both lengths first, then the URL bytes, then the content bytes.
+    // title is ignored. A null link or content is written with length 0 rather than
+    // dereferenced (bufappend() already skips null buffers).
+    static int link(struct buf* ob, struct buf* link, struct buf* title, struct buf* content, void* opaque)
+    {
+#if 0
+        printf("link: {%.*s} {%.*s} {%.*s}\n",
+            link ? (int)link->size : 0,
+            link ? link->data : nullptr,
+            title ? (int)title->size : 0,
+            title ? title->data : nullptr,
+            content ? (int)content->size : 0,
+            content ? content->data : nullptr);
+#endif
+        const uint32_t link_len = link ? (uint32_t)link->size : 0;
+        const uint32_t content_len = content ? (uint32_t)content->size : 0;
+
+        bufputc(ob, (uint8_t)cCodeSig);
+        bufputc(ob, (uint8_t)cCodeLink);
+        writelen(ob, link_len);
+        writelen(ob, content_len);
+
+        bufappend(ob, link);
+        bufappend(ob, content);
+
+        return 1;
+    }
+
+    // Emits an HTML record holding the raw tag text, for non-empty tags.
+    static int raw_html_tag(struct buf* ob, struct buf* tag, void* opaque)
+    {
+        //bufprintf(ob, "raw_html_tag: \"%.*s\" ", (int)tag->size, tag->data);
+
+        if (!tag || !tag->size)
+            return 1;
+
+        put_record(ob, (uint8_t)cCodeHTML, tag);
+
+        return 1;
+    }
+
+    // Emits a level-3 emphasis record for non-empty text.
+    static int triple_emphasis(struct buf* ob, struct buf* text, char c, void* opaque)
+    {
+        //bufprintf(ob, "triple_emphasis: %u ('%c') [%.*s] ", c, c, (int)text->size, text->data);
+
+        return put_emphasis(ob, text, c, 3);
+    }
+
+    // Emits a text record for plain text. Newlines are written as spaces and bytes equal
+    // to 1 are dropped.
+    static void normal_text(struct buf* ob, struct buf* text, void* opaque)
+    {
+        if (!text || !text->size)
+            return;
+
+        // Bytes equal to 1 are not written, so count the surviving bytes first: the length
+        // prefix must describe exactly what follows or the reader loses sync.
+        uint32_t out_len = 0;
+        for (size_t i = 0; i < text->size; i++)
+        {
+            if ((uint8_t)text->data[i] != 1)
+                out_len++;
+        }
+
+        // Nothing survives the filter: emit no record, as for empty input.
+        if (!out_len)
+            return;
+
+        bufputc(ob, (uint8_t)cCodeSig);
+        bufputc(ob, (uint8_t)cCodeText);
+        writelen(ob, out_len);
+        for (size_t i = 0; i < text->size; i++)
+        {
+            const uint8_t c = text->data[i];
+            if (c == '\n')
+                bufputc(ob, ' ');
+            else if (c != 1)
+            {
+                // Other control characters are not expected in plain text.
+                assert(c >= 32 || c == '\t');
+                bufputc(ob, c);
+            }
+        }
+    }
+};
+
+// Renderer table handed to libsoldout's markdown(): wires the parser's callbacks to the
+// serializing handlers in struct markdown. The initializer is positional, so the order
+// below must match the member order of struct mkd_renderer.
+// Not const: the trailing opaque member is assigned before each parse.
+static struct mkd_renderer g_mkd_parse =
+{
+    // Document prolog/epilog (both are no-ops).
+    markdown::prolog,
+    markdown::epilog,
+
+    // Block-level callbacks.
+    markdown::blockcode,
+    markdown::blockquote,
+    markdown::blockhtml,
+    markdown::header,
+    markdown::hrule,
+    markdown::list,
+    markdown::listitem,
+    markdown::paragraph,
+    markdown::table,
+    markdown::table_cell,
+    markdown::table_row,
+
+    // Span-level callbacks (the int-returning handlers).
+    markdown::autolink,
+    markdown::codespan,
+    markdown::double_emphasis,
+    markdown::emphasis,
+    markdown::image,
+    markdown::linebreak,
+    markdown::link,
+    markdown::raw_html_tag,
+    markdown::triple_emphasis,
+
+    // Entity callback slot is left null (no markdown::entity handler is installed).
+    //markdown::entity,
+    nullptr,
+    markdown::normal_text,
+
+    // NOTE(review): presumably mkd_renderer's max_work_stack (nesting limit) and emph_chars
+    // (characters that trigger emphasis) - confirm against libsoldout/markdown.h.
+    64,
+    "*_",
+    // opaque: null here; set to the processor's flag before markdown() is called.
+    nullptr
+};
+
 static bool markdown_should_escape(int c)
 {
    switch (c)
@ -48,12 +477,14 @@ static std::string escape_markdown(const std::string& str)
    return out;
 }

-markdown_text_processor::markdown_text_processor()
+// Constructs an empty processor with the unsupported-feature flag cleared.
+markdown_text_processor::markdown_text_processor() : 
+    m_used_unsupported_feature(false)
 {
 }

 void markdown_text_processor::clear()
 {
+    m_used_unsupported_feature = false;
    m_text.clear();
    m_details.clear();
    m_links.clear();
@ -126,7 +557,10 @@ void markdown_text_processor::init_from_markdown(const char* pText)
    bufputs(pIn, pText);

    struct buf* pOut = bufnew(4096);
-    markdown(pOut, pIn, &mkd_parse);
+    
+    m_used_unsupported_feature = false;
+    g_mkd_parse.opaque = &m_used_unsupported_feature;
+    markdown(pOut, pIn, &g_mkd_parse);

    std::string buf;
    buf.append((char*)pOut->data, pOut->size);
@ -243,6 +677,9 @@ void markdown_text_processor::convert_to_plain(std::string& out, bool trim_end)

 void markdown_text_processor::convert_to_markdown(std::string& out, bool trim_end) const
 {
+    if (m_used_unsupported_feature)
+        printf("markdown_text_processor::convert_to_markdown: Warning, one or more Markdown features were used in this text and won't be losslessly converted.\n");
+
    int emphasis = 0, emphasis_amount = 0;
    int cur_link_index = -1;

@ -606,3 +1043,66 @@ void markdown_text_processor::handle_emphasis(std::string& out, uint32_t text_of
        //    out += m_details[text_ofs].m_html[i];
    }
 }
+
+#if 0
+const char* pText =
+u8R"(
+
+<ul>test</ul>
+
+_text1_  
+**text2**  
+**_text3_**  
+
+![alt text](https://github.com/n48.png "Logo Title")
+
+# Heading 1
+## Heading 2
+### Heading 3
+
+1. XXXXX  
+  1. Item 1
+  2. Item 2
+2. YYYYY
+3. ZZZZZ
+
+| Tables        | Are           | Cool  |
+| ------------- |:-------------:| -----:|
+| col 3 is      | right-aligned | $1600 |
+| col 2 is      | centered      |   $12 |
+| zebra stripes | are neat      |    $1 |
+
+* [blahblah](www.blah1.com)
+* [blahblah2](www.blah2.com)
+
+`
+this is code 1
+this is code 2
+`
+
+```
+this is code 3
+this is code 4
+```
+
+> blockquote 1  
+> blockquote 2
+
+---
+
+* AAA
+* BBB
+  * ZZZZ1
+  * ZZZZ2
+* CCC)";
+
+markdown_text_processor tp;
+tp.init_from_markdown(pText);
+
+std::string desc;
+tp.convert_to_plain(desc, true);
+
+uprintf("%s\n", desc.c_str());
+
+return 0;
+#endif
--- a/markdown_proc.h
+++ b/markdown_proc.h
@ -6,386 +6,6 @@

 #include "libsoldout/markdown.h"

-struct markdown
-{
-    enum
-    {
-        cCodeSig = 0xFE,
-
-        cCodeLink = 1,
-        cCodeEmphasis,
-        cCodeText,
-        cCodeParagraph,
-        cCodeLinebreak,
-        cCodeHTML
-    };
-
-    static void bufappend(struct buf* out, struct buf* in)
-    {
-        assert(in != out);
-
-        if (in && in->size)
-            bufput(out, in->data, in->size);
-    }
-
-    static void writelen(struct buf* ob, uint32_t size)
-    {
-        bufputc(ob, (uint8_t)(size & 0xFF));
-        bufputc(ob, (uint8_t)((size >> 8) & 0xFF));
-        bufputc(ob, (uint8_t)((size >> 16) & 0xFF));
-        bufputc(ob, (uint8_t)((size >> 24) & 0xFF));
-    }
-
-    static std::string get_string(const std::string& buf, uint32_t& cur_ofs, uint32_t text_size)
-    {
-        std::string text;
-        if (cur_ofs + text_size > buf.size())
-            panic("Buffer too small");
-
-        text.append(buf.c_str() + cur_ofs, text_size);
-        cur_ofs += text_size;
-
-        return text;
-    }
-
-    static uint32_t get_len32(const std::string& buf, uint32_t& ofs)
-    {
-        if ((ofs + 4) > buf.size())
-            panic("Buffer too small");
-
-        uint32_t l = (uint8_t)buf[ofs] |
-            (((uint8_t)buf[ofs + 1]) << 8) |
-            (((uint8_t)buf[ofs + 2]) << 16) |
-            (((uint8_t)buf[ofs + 3]) << 24);
-
-        ofs += 4;
-
-        return l;
-    }
-
-    static void prolog(struct buf* ob, void* opaque)
-    {
-    }
-
-    static void epilog(struct buf* ob, void* opaque)
-    {
-    }
-
-    /* block level callbacks - NULL skips the block */
-    static void blockcode(struct buf* ob, struct buf* text, void* opaque)
-    {
-#if 0
-        bufprintf(ob, "blockcode: \"%.*s\" ", (int)text->size, text->data);
-#endif
-        panic("unsupported markdown feature");
-    }
-
-    static void blockquote(struct buf* ob, struct buf* text, void* opaque)
-    {
-#if 0
-        bufprintf(ob, "blockquote: \"%.*s\" ", (int)text->size, text->data);
-#endif
-        // TODO: unsupported block quotes (here for when we're converting to plain text)
-        //panic("unsupported markdown feature");
-        if (!text || !text->size)
-            return;
-
-        bufputc(ob, (uint8_t)cCodeSig);
-        bufputc(ob, (uint8_t)cCodeParagraph);
-        writelen(ob, (uint32_t)text->size);
-        bufappend(ob, text);
-    }
-
-    static void blockhtml(struct buf* ob, struct buf* text, void* opaque)
-    {
-#if 0
-        bufprintf(ob, "blockhtml: \"%.*s\" ", (int)text->size, text->data);
-#endif
-        panic("unsupported markdown feature");
-    }
-
-    static void header(struct buf* ob, struct buf* text, int level, void* opaque)
-    {
-#if 0
-        bufprintf(ob, "header: %i \"%.*s\" ", level, (int)text->size, text->data);
-#endif
-        panic("unsupported markdown feature");
-    }
-
-    static void hrule(struct buf* ob, void* opaque)
-    {
-        panic("unsupported markdown feature");
-    }
-
-    static void list(struct buf* ob, struct buf* text, int flags, void* opaque)
-    {
-        // TODO: not fully supporting lists (here for when we're converting to plain text)
-        //panic("unsupported markdown feature");
-
-        if (!text || !text->size)
-            return;
-
-        bufputc(ob, (uint8_t)cCodeSig);
-        bufputc(ob, (uint8_t)cCodeParagraph);
-        writelen(ob, (uint32_t)text->size);
-        bufappend(ob, text);
-    }
-
-    static void listitem(struct buf* ob, struct buf* text, int flags, void* opaque)
-    {
-        // TODO: not fully supporting lists (here for when we're converting to plain text)
-        //panic("unsupported markdown feature");
-
-        if (!text || !text->size)
-            return;
-
-        bufputc(ob, (uint8_t)cCodeSig);
-        bufputc(ob, (uint8_t)cCodeParagraph);
-        writelen(ob, (uint32_t)text->size);
-        bufappend(ob, text);
-    }
-
-    static void paragraph(struct buf* ob, struct buf* text, void* opaque)
-    {
-#if 0
-        bufprintf(ob, "paragraph: \"%.*s\" ", (int)text->size, text->data);
-#endif
-        if (!text || !text->size)
-            return;
-
-        bufputc(ob, (uint8_t)cCodeSig);
-        bufputc(ob, (uint8_t)cCodeParagraph);
-        writelen(ob, (uint32_t)text->size);
-        bufappend(ob, text);
-    }
-
-    static void table(struct buf* ob, struct buf* head_row, struct buf* rows, void* opaque)
-    {
-#if 0
-        bufprintf(ob, "table: \"%.*s\" \"%.*s\" ", (int)head_row->size, head_row->data, (int)rows->size, rows->data);
-#endif
-        //panic("unsupported markdown feature");
-
-        // TODO: not fully supported, just for plaintext conversion
-    }
-
-    static void table_cell(struct buf* ob, struct buf* text, int flags, void* opaque)
-    {
-#if 0
-        bufprintf(ob, "table_cell: \"%.*s\" %i ", (int)text->size, text->data, flags);
-#endif
-        //panic("unsupported markdown feature");
-
-        // TODO: not fully supported, just for plaintext conversion
-        if (!text || !text->size)
-            return;
-
-        bufputc(ob, (uint8_t)cCodeSig);
-        bufputc(ob, (uint8_t)cCodeParagraph);
-        writelen(ob, (uint32_t)text->size);
-        bufappend(ob, text);
-    }
-
-    static void table_row(struct buf* ob, struct buf* cells, int flags, void* opaque)
-    {
-#if 0
-        bufprintf(ob, "table_row: \"%.*s\" %i ", (int)cells->size, cells->data, flags);
-#endif
-        //panic("unsupported markdown feature");
-        // TODO: not fully supported, just for plaintext conversion
-    }
-
-    static int autolink(struct buf* ob, struct buf* link, enum mkd_autolink type, void* opaque)
-    {
-#if 0
-        bufprintf(ob, "autolink: %u \"%.*s\" ", type, (int)link->size, link->data);
-#endif
-        panic("unsupported markdown feature");
-        return 1;
-    }
-
-    static int codespan(struct buf* ob, struct buf* text, void* opaque)
-    {
-#if 0
-        bufprintf(ob, "codespan: \"%.*s\" ", (int)text->size, text->data);
-#endif
-        panic("unsupported markdown feature");
-        return 1;
-    }
-
-    static int double_emphasis(struct buf* ob, struct buf* text, char c, void* opaque)
-    {
-#if 0
-        bufprintf(ob, "double_emphasis: %u ('%c') [%.*s] ", c, c, (int)text->size, text->data);
-#endif
-        if (!text || !text->size)
-            return 1;
-
-        bufputc(ob, (uint8_t)cCodeSig);
-        bufputc(ob, (uint8_t)cCodeEmphasis);
-        bufputc(ob, c);
-        bufputc(ob, 2);
-        writelen(ob, (uint32_t)text->size);
-        bufappend(ob, text);
-
-        return 1;
-    }
-
-    static int emphasis(struct buf* ob, struct buf* text, char c, void* opaque)
-    {
-#if 0
-        bufprintf(ob, "emphasis: %u ('%c') [%.*s] ", c, c, (int)text->size, text->data);
-#endif
-
-        if (!text || !text->size)
-            return 1;
-
-        bufputc(ob, (uint8_t)cCodeSig);
-        bufputc(ob, (uint8_t)cCodeEmphasis);
-        bufputc(ob, c);
-        bufputc(ob, 1);
-        writelen(ob, (uint32_t)text->size);
-        bufappend(ob, text);
-
-        return 1;
-    }
-
-    static int image(struct buf* ob, struct buf* link, struct buf* title, struct buf* alt, void* opaque)
-    {
-#if 0
-        bufprintf(ob, "image: \"%.*s\" \"%.*s\" \"%.*s\" ",
-            (int)link->size, link->data,
-            (int)title->size, title->data,
-            (int)alt->size, alt->data);
-#endif
-        panic("unsupported markdown feature");
-        return 1;
-    }
-
-    static int linebreak(struct buf* ob, void* opaque)
-    {
-#if 0
-        bufprintf(ob, "linebreak ");
-#endif
-
-        bufputc(ob, (uint8_t)cCodeSig);
-        bufputc(ob, (uint8_t)cCodeLinebreak);
-
-        return 1;
-    }
-
-    static int link(struct buf* ob, struct buf* link, struct buf* title, struct buf* content, void* opaque)
-    {
-#if 0
-        printf("link: {%.*s} {%.*s} {%.*s}\n",
-            link ? (int)link->size : 0,
-            link ? link->data : nullptr,
-            title ? (int)title->size : 0,
-            title ? title->data : nullptr,
-            content ? (int)content->size : 0,
-            content ? content->data : nullptr);
-#endif
-        bufputc(ob, (uint8_t)cCodeSig);
-        bufputc(ob, (uint8_t)cCodeLink);
-        writelen(ob, (uint32_t)link->size);
-        writelen(ob, (uint32_t)content->size);
-
-        bufappend(ob, link);
-        bufappend(ob, content);
-
-        return 1;
-    }
-
-    static int raw_html_tag(struct buf* ob, struct buf* tag, void* opaque)
-    {
-        //bufprintf(ob, "raw_html_tag: \"%.*s\" ", (int)tag->size, tag->data);
-
-        if (!tag || !tag->size)
-            return 1;
-
-        bufputc(ob, (uint8_t)cCodeSig);
-        bufputc(ob, (uint8_t)cCodeHTML);
-        writelen(ob, (uint32_t)tag->size);
-        bufappend(ob, tag);
-
-        return 1;
-    }
-
-    static int triple_emphasis(struct buf* ob, struct buf* text, char c, void* opaque)
-    {
-        //bufprintf(ob, "triple_emphasis: %u ('%c') [%.*s] ", c, c, (int)text->size, text->data);
-
-        if (!text || !text->size)
-            return 1;
-
-        bufputc(ob, (uint8_t)cCodeSig);
-        bufputc(ob, (uint8_t)cCodeEmphasis);
-        bufputc(ob, c);
-        bufputc(ob, 3);
-        writelen(ob, (uint32_t)text->size);
-        bufappend(ob, text);
-
-        return 1;
-    }
-
-    static void normal_text(struct buf* ob, struct buf* text, void* opaque)
-    {
-        if (!text || !text->size)
-            return;
-
-        bufputc(ob, (uint8_t)cCodeSig);
-        bufputc(ob, (uint8_t)cCodeText);
-        writelen(ob, (uint32_t)text->size);
-        for (uint32_t i = 0; i < text->size; i++)
-        {
-            uint8_t c = text->data[i];
-            if (c == '\n')
-                bufputc(ob, ' ');
-            else if (c != 1)
-            {
-                assert(c >= 32 || c == '\t');
-                bufputc(ob, c);
-            }
-        }
-    }
-};
-
-const struct mkd_renderer mkd_parse =
-{
-    markdown::prolog,
-    markdown::epilog,
-
-    markdown::blockcode,
-    markdown::blockquote,
-    markdown::blockhtml,
-    markdown::header,
-    markdown::hrule,
-    markdown::list,
-    markdown::listitem,
-    markdown::paragraph,
-    markdown::table,
-    markdown::table_cell,
-    markdown::table_row,
-
-    markdown::autolink,
-    markdown::codespan,
-    markdown::double_emphasis,
-    markdown::emphasis,
-    markdown::image,
-    markdown::linebreak,
-    markdown::link,
-    markdown::raw_html_tag,
-    markdown::triple_emphasis,
-
-    //markdown::entity,
-    nullptr,
-    markdown::normal_text,
-
-    64,
-    "*_",
-    nullptr
-};
-
 class markdown_text_processor
 {
 public:
@ -406,17 +26,23 @@ public:
    std::string m_text;
    std::vector<detail> m_details;
    string_vec m_links;
+    bool m_used_unsupported_feature;

    markdown_text_processor();

    void clear();

    void fix_redirect_urls();
+    
+    // Note: a "\n" escape sequence in pText escapes the letter "n"; it does not produce a newline.
    void init_from_markdown(const char* pText);
    bool split_in_half(uint32_t ofs, markdown_text_processor& a, markdown_text_processor& b) const;
    uint32_t count_char_in_text(uint8_t c) const;
    bool split_last_parens(markdown_text_processor& a, markdown_text_processor& b) const;
+    
    void convert_to_plain(std::string& out, bool trim_end) const;
+    
+    // Warning: Only a few core features are supported. If after parsing m_used_unsupported_feature is true, then this will not be lossless.
    void convert_to_markdown(std::string& out, bool trim_end) const;

 private:
--- a/ufojson.cpp
+++ b/ufojson.cpp
--- a/ufojson.vcxproj
+++ b/ufojson.vcxproj
@ -78,8 +78,10 @@
    <ClCompile Include="libsoldout\markdown.c" />
    <ClCompile Include="libsoldout\renderers.c" />
    <ClCompile Include="markdown_proc.cpp" />
+    <ClCompile Include="stem.c" />
    <ClCompile Include="udb.cpp" />
    <ClInclude Include="converters.h" />
+    <ClInclude Include="stem.h" />
    <ClInclude Include="udb_tables.h" />
    <ClCompile Include="ufojson.cpp">
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Level4</WarningLevel>
@ -97,6 +99,7 @@
    <ClInclude Include="resource.h" />
    <ClInclude Include="udb.h" />
    <ClInclude Include="ufojson_core.h" />
+    <ClInclude Include="utf8.h" />
    <ClInclude Include="utils.h" />
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
--- a/ufojson.vcxproj.filters
+++ b/ufojson.vcxproj.filters
@ -48,6 +48,9 @@
    <ClCompile Include="converters.cpp">
      <Filter>Source Files</Filter>
    </ClCompile>
+    <ClCompile Include="stem.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="libsoldout\array.h">
@ -83,5 +86,11 @@
    <ClInclude Include="converters.h">
      <Filter>Source Files</Filter>
    </ClInclude>
+    <ClInclude Include="utf8.h">
+      <Filter>Source Files</Filter>
+    </ClInclude>
+    <ClInclude Include="stem.h">
+      <Filter>Source Files</Filter>
+    </ClInclude>
  </ItemGroup>
 </Project>
--- a/ufojson.vcxproj.user
+++ b/ufojson.vcxproj.user
@ -3,13 +3,11 @@
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <LocalDebuggerWorkingDirectory>bin</LocalDebuggerWorkingDirectory>
    <DebuggerFlavor>WindowsLocalDebugger</DebuggerFlavor>
-    <LocalDebuggerCommandArguments>
-    </LocalDebuggerCommandArguments>
+    <LocalDebuggerCommandArguments>-convert</LocalDebuggerCommandArguments>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <LocalDebuggerWorkingDirectory>bin</LocalDebuggerWorkingDirectory>
    <DebuggerFlavor>WindowsLocalDebugger</DebuggerFlavor>
-    <LocalDebuggerCommandArguments>
-    </LocalDebuggerCommandArguments>
+    <LocalDebuggerCommandArguments>-convert</LocalDebuggerCommandArguments>
  </PropertyGroup>
 </Project>
--- a/ufojson_core.cpp
+++ b/ufojson_core.cpp
--- a/ufojson_core.h
+++ b/ufojson_core.h
@ -6,6 +6,7 @@
 // Note that May ends in a period.
 extern const char* g_months[12];
 extern const char* g_full_months[12];
+extern const char* g_day_of_week[7];

 const uint32_t NUM_DATE_PREFIX_STRINGS = 24;
 extern const char* g_date_prefix_strings[NUM_DATE_PREFIX_STRINGS];
@ -47,7 +48,9 @@ enum date_prefix_t
 };

 bool is_season(date_prefix_t prefix);
-int determine_month(const std::string& date);
+int determine_month(const std::string& date, bool begins_with = true);
+int determine_prefix(const std::string& date, bool begins_with = true);
+int determine_day_of_week(const std::string& date, bool begins_with = true);

 struct event_date
 {
@ -108,19 +111,19 @@ private:
 struct timeline_event
 {
    std::string m_date_str;
-    std::string m_time_str; // military 
+    std::string m_time_str; // military, but currently it's in any format (not parsed yet)
        
    std::string m_alt_date_str;
-
    std::string m_end_date_str;

    event_date m_begin_date;
    event_date m_end_date;
    event_date m_alt_date;

-    std::string m_desc;
+    std::string m_desc;         // Markdown
    string_vec m_type;
-    string_vec m_refs;
+    string_vec m_refs;          // Markdown
+        
    string_vec m_locations;
    string_vec m_attributes;
    string_vec m_see_also;
@ -138,11 +141,13 @@ struct timeline_event
    std::string m_source;

    std::vector<string_pair> m_key_value_data;
+
+    std::string m_plain_desc;   // Computed, ignored for comparison purposes, not deserialized from JSON
+    string_vec m_plain_refs;    // Computed, ignored for comparison purposes, not deserialized from JSON
+    std::string m_search_words;  // Computed, ignored for comparison purposes, not deserialized from JSON
            
    bool operator==(const timeline_event& rhs) const;
-
    bool operator!=(const timeline_event& rhs) const;
-
    bool operator< (const timeline_event& rhs) const;

    void print(FILE* pFile) const;
@ -156,6 +161,15 @@ struct timeline_event

 typedef std::vector<timeline_event> timeline_event_vec;

+bool date_filter_single(
+    int start_month, int start_day, int start_year,
+    const event_date& evt_b, const event_date& evt_e);
+
+bool date_filter_range(
+    int start_month, int start_day, int start_year,
+    int end_month, int end_day, int end_year,
+    const event_date& evt_b, const event_date& evt_e);
+
 const uint32_t NUM_KWIC_FILE_STRINGS = 28;

 static inline std::string get_kwic_index_name(uint32_t i)
@ -215,6 +229,8 @@ public:
        }
    }

+    void create_plaintext();
+
    bool write_file(const char* pFilename, bool utf8_bom = true)
    {
        json j;
--- a/utils.cpp
+++ b/utils.cpp
@ -1,6 +1,8 @@
-// utils.cpp
+// utils.cpp
 // Copyright (C) 2023 Richard Geldreich, Jr.
 #include "utils.h"
+#include "utf8.h"
+#include "stem.h"

 std::string combine_strings(std::string a, const std::string& b)
 {
@ -265,7 +267,7 @@ int string_ifind_first(const std::string& str, const char* pPhrase)
    for (size_t ofs = 0; ofs <= end_ofs; ofs++)
    {
        assert(ofs + phrase_size <= str_size);
-        if (_stricmp(str.c_str() + ofs, pPhrase) == 0)
+        if (_strnicmp(str.c_str() + ofs, pPhrase, phrase_size) == 0)
            return (int)ofs;
    }
    
@ -552,7 +554,7 @@ bool read_text_file(const char* pFilename, std::vector<uint8_t>& buf, bool *pUTF
    return true;
 }

-bool write_text_file(const char* pFilename, string_vec& lines, bool utf8_bom)
+bool write_text_file(const char* pFilename, const string_vec& lines, bool utf8_bom)
 {
    FILE* pFile = ufopen(pFilename, "wb");
    if (!pFile)
@ -984,6 +986,43 @@ bool invoke_openai(const std::string& prompt, std::string& reply)
    return true;
 }

+// Line-oriented overload: writes the prompt lines to "i.txt", runs the external
+// "openai.exe i.txt o.txt" helper (up to three attempts, pausing two seconds after
+// each failed attempt), then loads the lines of "o.txt" into reply.
+// Returns false if the prompt can't be written, every attempt fails, or the output
+// file can't be read. reply is always cleared first.
+bool invoke_openai(const string_vec &prompt, string_vec &reply)
+{
+    reply.clear();
+
+    const bool wrote_prompt = write_text_file("i.txt", prompt, true);
+    if (!wrote_prompt)
+        return false;
+
+    // Invoke openai.exe, retrying a limited number of times.
+    const uint32_t MAX_TRIES = 3;
+    bool tool_succeeded = false;
+
+    uint32_t attempt = 0;
+    while (attempt < MAX_TRIES)
+    {
+        if (attempt > 0)
+            uprintf("openai.exe failed - retrying\n");
+
+        if (system("openai.exe i.txt o.txt") == EXIT_SUCCESS)
+        {
+            tool_succeeded = true;
+            break;
+        }
+
+        Sleep(2000);
+        ++attempt;
+    }
+
+    if (!tool_succeeded)
+        return false;
+
+    // Read output file.
+    if (read_text_file("o.txt", reply, true, nullptr))
+        return true;
+
+    // Wait a bit and try again, rarely needed under Windows.
+    Sleep(50);
+    return read_text_file("o.txt", reply, true, nullptr);
+}
+
 std::string get_deg_to_dms(double deg)
 {
    deg = std::round(fabs(deg) * 3600.0f);
@ -1186,7 +1225,8 @@ int get_next_utf8_code_point_len(const uint8_t* pStr)
 void get_string_words(
    const std::string& str,
    string_vec& words,
-    uint_vec* pOffsets_vec)
+    uint_vec* pOffsets_vec,
+    const char* pAdditional_whitespace)
 {
    const uint8_t* pStr = (const uint8_t *)str.c_str();

@ -1196,7 +1236,9 @@ void get_string_words(

    std::string cur_token;

-    const std::string whitespace(" \t\n\r,;:.!?()[]*/\"");
+    std::string whitespace(" \t\n\r,;:.!?()[]*/\"");
+    if (pAdditional_whitespace)
+        whitespace += std::string(pAdditional_whitespace);
    
    int word_start_ofs = -1;
    
@ -1234,7 +1276,7 @@ void get_string_words(
            if (pStr[cur_ofs + 2] == 0x93)
                is_whitespace = true;
            // dash
-            if (pStr[cur_ofs + 2] == 0x94)
+            else if (pStr[cur_ofs + 2] == 0x94)
                is_whitespace = true;
            // left quote
            else if (pStr[cur_ofs + 2] == 0x9C)
@ -1315,3 +1357,369 @@ void get_utf8_code_point_offsets(const char* pStr, int_vec& offsets)
        cur_ofs += std::max<int>(1, get_next_utf8_code_point_len((const uint8_t*)pStr + cur_ofs));
    }
 }
+
+struct char_map    // One normalization rule: every code point listed in m_pFrom is replaced by m_to.
+{
+    const char32_t* m_pFrom;    // zero-terminated UTF-32 list of source code points (init_norm() walks it until the 0)
+    const char m_to;            // replacement character stored as the mapped value by init_norm()
+};
+
+static const char_map g_char_norm_up[] =
+{
+    { U"ÁĂẮẶẰẲẴǍÂẤẬẦẨẪÄǞȦǠẠȀÀẢȂĀĄÅǺḀÃǼǢȺΆ", 'A' },
+    { U"ḂḄḆƁƂƄ", 'B' },
+    { U"ĆČÇḈĈĊƇȻƆ", 'C' },
+    { U"ĎḐḒḊḌḎĐƉƊƋǱǲǄ", 'D' },
+    { U"ÉĔĚȨḜÊẾỆỀỂỄḘËĖẸȄÈẺȆĒḖḔĘẼḚÈÊËĒĔĖĘĚƐƎƏȄȆȨΈΉΕƐƐ", 'E' },
+    { U"ḞƑ", 'F' },
+    { U"ǴĞǦĢĜĠḠĜĞĠĢƓǤǦǴƔ", 'G' },
+    { U"ḪȞḨĤḦḢḤĤĦǶȞΗǶ", 'H' },
+    { U"ÍĬǏÎÏḮİỊȈÌỈȊĪĮĨḬÌÍÎÏĨĪĬĮİƗǏȈȊ", 'I' },
+    { U"ĴĴ", 'J' },
+    { U"ḰǨĶḲḴĶƘǨΚ", 'K' },
+    { U"ĹĽĻḼḶḸḺĹĻĽĿŁΛ", 'L' },
+    { U"ḾṀṂƜ", 'M' },
+    { U"ŃŇŅṊṄṆǸṈÑÑŃŅŇŊƝǸΝ", 'N' },
+    { U"ÓŎǑÔỐỘỒỔỖÖȪȮȰỌŐȌÒỎƠỚỢỜỞỠȎŌṒṐǪǬÕṌṎȬǾØÒÓÔÕÖØŌŎŐƟƠǑǪǬǾȌȎȪȬȮȰΌΟΩ", 'O' },
+    { U"ṔṖΠΡΦ", 'P' },
+    { U"ŔŘŖṘṚṜȐȒṞŔŖŘƦȐȒ", 'R' },
+    { U"ŚṤŠṦŞŜȘṠṢṨßŚŜŞŠƩȘΣ", 'S' },
+    { U"ŤŢṰȚṪṬṮŢŤŦƬƮȚΤ", 'T' },
+    { U"ÚŬǓÛṶÜǗǙǛǕṲỤŰȔÙỦƯỨỰỪỬỮȖŪṺŲŮŨṸṴÙÚÛÜŨŪŬŮŰŲƯǓǕǗǙǛȔȖ", 'U' },
+    { U"ṾṼƲ", 'V' },
+    { U"ẂŴẄẆẈẀŴ", 'W' },
+    { U"ẌẊΧΞ", 'X' },
+    { U"ÝŶŸẎỴỲỶȲỸÝŶŸƳȲΥΎΫ", 'Y' },
+    { U"ŹŽẐŻẒẔŹŻŽƵƷǮȤΖ", 'Z' },
+};
+
+static const char_map g_char_norm_lower[] =
+{
+    { U"áăắặằẳẵǎâấậầẩẫäǟȧǡạȁàảȃāąåǻḁãǽǣⱥάàáâãäåāăąǎǟǡǻȁȃȧάα", 'a' },
+    { U"ḃḅḇɓƃƅƀƃβƀƃƅ", 'b' },
+    { U"ćčçḉĉċƈȼɔƈçćĉċčƈȼ", 'c' },
+    { U"ďḑḓḋḍḏđɖɗƌǳǳǆƌďđƌǳǆȡďđƌǳǆȡ", 'd' },
+    { U"éĕěȩḝêếệềểễḙëėẹȅèẻȇēḗḕęẽḛèêëēĕėęěɛǝəȅȇȩέήεɛɛèéêëēĕėęěȅȇȩε", 'e' },
+    { U"ḟƒ", 'f' },
+    { U"ǵğǧģĝġḡĝğġģɠǥǧǵɣĝğġģǧǵ", 'g' },
+    { U"ḫȟḩĥḧḣḥẖĥħƕƕȟƕĥħȟ", 'h' },
+    { U"íĭǐîïḯiịȉìỉȋīįĩḭìíîïĩīĭįiɨǐȉȋìíîïĩīĭįǐȉȋι", 'i' },
+    { U"ǰĵĵǰĵǰ", 'j' },
+    { U"ḱǩķḳḵķƙǩκƙķƙǩκ", 'k' },
+    { U"ĺľļḽḷḹḻĺļľŀłƚƛλƚĺļľŀłƚλƚ", 'l' },
+    { U"ḿṁṃɯ", 'm' },
+    { U"ńňņṋṅṇǹṉññńņňŋɲǹνƞñńņňŉŋƞǹη", 'n' },
+    { U"óŏǒôốộồổỗöȫȯȱọőȍòỏơớợờởỡȏōṓṑǫǭõṍṏȭǿøòóôõöøōŏőɵơǒǫǭǿȍȏȫȭȯȱόοòóôõöøōŏőơǒǫǭǿȍȏȫȭȯȱοσ", 'o' },
+    { U"ṕṗπφƥ", 'p' },
+    { U"ŕřŗṙṛṝȑȓṟŕŗřʀȑȓρŕŗřȑȓρ", 'r' },
+    { U"śṥšṧşŝșṡẛṣṩśŝşšʃșƨśŝşšșƨȿ", 's' },
+    { U"ťţṱțẗṫṭṯţťŧƭʈțτƫţťŧƭțτ", 't' },
+    { U"úŭǔûṷüǘǚǜǖṳụűȕùủưứựừửữȗūṻųůũṹṵùúûüũūŭůűųưǔǖǘǚǜȕȗưùúûüũūŭůűųưǔǖǘǚǜȕȗμ", 'u' },
+    { U"ṿṽʋ", 'v' },
+    { U"ẃŵẅẇẉẁẘŵŵω", 'w' },
+    { U"ẍẋχξχξ", 'x' },
+    { U"ýŷÿẏỵỳỷȳẙỹýŷÿƴȳυύϋƴýÿŷƴȳγψ", 'y' },
+    { U"źžẑżẓẕźżžƶʒǯȥζƶźżžƶƹȥζ", 'z' },
+};
+
+std::map<int, int> g_upper_trans;    // code point -> replacement char, filled by init_norm() from g_char_norm_up
+std::map<int, int> g_lower_trans;    // code point -> replacement char, filled by init_norm() from g_char_norm_lower
+
+static const char* g_stop_words[] =    // Source list copied into g_stop_words_set by init_norm(); all entries are lowercase ASCII.
+{
+    "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as",
+    "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "can",
+    "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from",
+    "further", "had", "has", "have", "having", "he", "her", "here", "hers", "herself", "him", "himself",
+    "his", "how", "i", "if", "in", "into", "is", "it", "its", "itself", "just", "me", "more", "most",
+    "my", "myself", "no", "nor", "not", "now", "of", "off", "on", "once", "only", "or", "other", "our",
+    "ours", "ourselves", "out", "over", "own", "re", "same", "she", "should", "so", "some", "such",
+    "than", "that", "the", "their", "theirs", "them", "themselves", "then", "there", "these", "they",
+    "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "were", "what",
+    "when", "where", "which", "while", "who", "whom", "why", "will", "with", "you", "your", "yours",
+    "yourself", "yourselves", "although", "also", "already", "another", "seemed", "seem", "seems"
+};
+static const uint32_t NUM_STOP_WORDS = (uint32_t)std::size(g_stop_words);    // number of entries in g_stop_words
+
+std::set<std::string> g_stop_words_set;    // lookup set queried by is_stop_word(); empty until init_norm() runs
+
+// Builds the lookup structures used by the normalization helpers:
+//  - g_stop_words_set is rebuilt from g_stop_words,
+//  - g_upper_trans is filled from g_char_norm_up,
+//  - g_lower_trans is filled from g_char_norm_lower.
+// A code point that appears twice in the same table with different replacement characters
+// is reported and terminates the process via exit(1). A lower-table code point that is also
+// present in the upper table is only reported.
+void init_norm()
+{
+    g_stop_words_set.clear();
+    g_stop_words_set.insert(std::begin(g_stop_words), std::end(g_stop_words));
+
+    for (const char_map& entry : g_char_norm_up)
+    {
+        const char ascii = entry.m_to;
+
+        for (const char32_t* pCur = entry.m_pFrom; *pCur; ++pCur)
+        {
+            const char32_t code = *pCur;
+
+            const auto existing = g_upper_trans.find(code);
+            if ((existing != g_upper_trans.end()) && (existing->second != ascii))
+            {
+                uprintf("Upper char %u 0x%x is redundant\n", code, code);
+                exit(1);
+            }
+
+            g_upper_trans[code] = ascii;
+        }
+    }
+
+    for (const char_map& entry : g_char_norm_lower)
+    {
+        const char ascii = entry.m_to;
+
+        for (const char32_t* pCur = entry.m_pFrom; *pCur; ++pCur)
+        {
+            const char32_t code = *pCur;
+
+            // Overlap with the upper table is tolerated, but it is always logged.
+            const auto in_upper = g_upper_trans.find(code);
+            if (in_upper != g_upper_trans.end())
+            {
+                uprintf("Lower char %u 0x%x is in the upper table\n", code, code);
+
+                if (utolower((uint8_t)in_upper->second) != ascii)
+                    uprintf("Conversion mismatch %u 0x%x\n", code, code);
+            }
+
+            const auto in_lower = g_lower_trans.find(code);
+            if ((in_lower != g_lower_trans.end()) && (in_lower->second != ascii))
+            {
+                uprintf("Lower char %u 0x%x is redundant\n", code, code);
+                exit(1);
+            }
+
+            g_lower_trans[code] = ascii;
+        }
+    }
+}
+
+// Converts a zero-terminated UTF-8 string to plain ASCII - useful for searching purposes.
+// Code points below 128 are copied as-is. Code points present in g_upper_trans/g_lower_trans
+// (filled by init_norm()) become their mapped character, and a few common punctuation marks
+// become their ASCII look-alikes. Every other code point is deleted, so each character
+// appended to res is guaranteed to be <128.
+// pStr: zero-terminated UTF-8 input. res: receives the result (previous contents are discarded).
+void normalize_diacritics(const char* pStr, std::string& res)
+{
+    assert(g_stop_words_set.size());
+
+    res.resize(0);
+
+    while (*pStr)
+    {
+        utf8_int32_t cp;
+        char* pStr_next = utf8codepoint(pStr, &cp);
+
+        assert((pStr_next - pStr) == get_next_utf8_code_point_len((const uint8_t*)pStr));
+
+        // If the terminator lies inside the bytes the decoder consumed, the last sequence
+        // is incomplete. Stop here instead of moving pStr past the end of the string.
+        const char* pScan = pStr + 1;
+        while ((pScan < pStr_next) && *pScan)
+            ++pScan;
+        if (pScan != pStr_next)
+            break;
+
+        int new_char = -1;
+
+        // The lower bound matters: a negative value must never be emitted as a raw byte,
+        // otherwise the "<128" guarantee would not hold.
+        if ((cp >= 0) && (cp < 128))
+        {
+            new_char = (int)cp;
+        }
+        else
+        {
+            const auto u_it = g_upper_trans.find(cp);
+            const auto l_it = g_lower_trans.find(cp);
+
+            if (u_it != g_upper_trans.end())
+                new_char = u_it->second;
+            else if (l_it != g_lower_trans.end())
+                new_char = l_it->second;
+            else
+            {
+                // Punctuation that has an obvious ASCII equivalent.
+                switch (cp)
+                {
+                case 0x00A0:    // NO-BREAK SPACE
+                    new_char = ' ';
+                    break;
+                case 0x02BB:    // single left quote (MODIFIER LETTER TURNED COMMA)
+                case 0x2018:    // left single quote
+                case 0x2019:    // right single quote
+                    new_char = '\'';
+                    break;
+                case 0x2013:    // en dash
+                case 0x2014:    // em dash
+                    new_char = '-';
+                    break;
+                case 0x201C:    // left double quote
+                case 0x201D:    // right double quote
+                    new_char = '"';
+                    break;
+                case 0x2026:    // ellipsis (three dots)
+                    new_char = '.';
+                    break;
+                default:
+                    break;
+                }
+            }
+        }
+
+        // TODO: Do something smarter than dropping unrecognized code points?
+        if (new_char != -1)
+            res.push_back((char)new_char);
+
+        pStr = pStr_next;
+    }
+}
+
+// Reduces a single word to its normalized search form:
+//  1. lowercase the UTF-8 text (utf8lwr),
+//  2. replace/delete non-ASCII code points (normalize_diacritics),
+//  3. keep only lowercase ASCII letters and digits,
+//  4. stem the result (stem).
+// Inputs longer than 4096 bytes are rejected via panic(). An input with no letters or
+// digits yields an empty string.
+std::string normalize_word(const std::string& str)
+{
+    assert(g_stop_words_set.size());
+
+    const uint32_t MAX_STRING_SIZE = 4096;
+
+    if (str.size() > MAX_STRING_SIZE)
+        panic("String too long");
+
+    // utf8lwr() and stem() both work in place on a zero-terminated buffer.
+    char work[MAX_STRING_SIZE + 1];
+    strcpy_s(work, sizeof(work), str.c_str());
+
+    utf8lwr(work);
+
+    // Strip diacritics and some specials; all 1-127 chars survive this step.
+    std::string ascii;
+    ascii.reserve(strlen(work));
+    normalize_diacritics(work, ascii);
+
+    // Drop everything that isn't a letter or digit (this is a word, so whitespace goes too).
+    std::string alnum;
+    alnum.reserve(ascii.size());
+
+    for (const char ch : ascii)
+    {
+        const uint8_t lowered = utolower((uint8_t)ch);
+
+        if (uislower(lowered) || uisdigit(lowered))
+            alnum.push_back(lowered);
+    }
+
+    strcpy_s(work, sizeof(work), alnum.c_str());
+
+    if (work[0] != '\0')
+    {
+        // stem() returns the index of the last character of the stemmed word.
+        const int32_t last_index = stem(work, 0, (int)strlen(work) - 1);
+        work[last_index + 1] = '\0';
+    }
+
+    return std::string(work);
+}
+
+// Reports whether word is present in g_stop_words_set (built by init_norm()).
+// The lookup is an exact match, so word is assumed to be plain ASCII lowercase.
+bool is_stop_word(const std::string &word)
+{
+    assert(!g_stop_words_set.empty());
+
+    const auto found = g_stop_words_set.find(word);
+    return found != g_stop_words_set.end();
+}
+
+// Returns a lowercased copy of the UTF-8 string s, using utf8lwr() on a private
+// zero-terminated copy. The result ends at the first zero byte of that copy.
+std::string ustrlwr(const std::string& s)
+{
+    // utf8lwr() modifies its argument in place, so work on a terminated scratch copy.
+    std::vector<char> scratch(s.begin(), s.end());
+    scratch.push_back('\0');
+
+    utf8lwr(scratch.data());
+
+    return std::string(scratch.data());
+}
+
+// Returns a copy of str in which every occurrence of find is replaced by repl.
+// str is walked one UTF-8 code point at a time, so a match is only tested at offsets where
+// the walk lands. The comparison itself is an exact byte comparison, and replaced text is
+// not rescanned.
+// If find is empty (asserts in debug builds) or str is empty, str is returned unchanged.
+std::string string_replace(const std::string& str, const std::string& find, const std::string& repl)
+{
+    assert(find.size());
+    if (!find.size() || !str.size())
+        return str;
+
+    const uint8_t* pStr = (const uint8_t*)str.c_str();
+    const size_t str_size = str.size();
+
+    const uint8_t* pFind = (const uint8_t*)find.c_str();
+    const size_t find_size = find.size();
+
+    std::string res;
+    res.reserve(str_size);
+
+    size_t str_ofs = 0;
+    while (str_ofs < str_size)
+    {
+        const size_t str_remaining = str_size - str_ofs;
+
+        if ((str_remaining >= find_size) && (memcmp(pStr + str_ofs, pFind, find_size) == 0))
+        {
+            res += repl;
+            str_ofs += find_size;
+            continue;
+        }
+
+        const int str_char_size = get_next_utf8_code_point_len(pStr + str_ofs);
+        assert(str_char_size >= 0);
+
+        // Always step at least one byte so the loop makes progress even if the length helper
+        // reports a non-positive size, and never step past the end of the string when the
+        // final sequence is incomplete.
+        const size_t wanted = (str_char_size > 0) ? (size_t)str_char_size : (size_t)1;
+        const size_t advance = std::min<size_t>(wanted, str_remaining);
+
+        res.append((const char*)(pStr + str_ofs), advance);
+        str_ofs += advance;
+    }
+
+    return res;
+}
+
+// Returns true if pFilename can be opened for binary reading via ufopen(), false otherwise.
+// The probe handle is closed again before returning.
+bool does_file_exist(const char* pFilename)
+{
+    FILE* pProbe = ufopen(pFilename, "rb");
+
+    const bool opened = (pProbe != nullptr);
+    if (opened)
+        fclose(pProbe);
+
+    return opened;
+}
+
--- a/utils.h
+++ b/utils.h
@ -240,7 +240,7 @@ bool read_text_file(const char* pFilename, string_vec& lines, bool trim_lines, b

 bool read_text_file(const char* pFilename, std::vector<uint8_t>& buf, bool *pUTF8_flag);

-bool write_text_file(const char* pFilename, string_vec& lines, bool utf8_bom = true);
+bool write_text_file(const char* pFilename, const string_vec& lines, bool utf8_bom = true);

 bool serialize_to_json_file(const char* pFilename, const json& j, bool utf8_bom);

@ -251,6 +251,7 @@ bool invoke_curl(const std::string& args, string_vec& reply);
 void convert_args_to_utf8(string_vec& args, int argc, wchar_t* argv[]);

 bool invoke_openai(const std::string& prompt, std::string& reply);
+bool invoke_openai(const string_vec& prompt, string_vec& reply);

 std::string get_deg_to_dms(double deg);

@ -269,5 +270,16 @@ double geo_distance(double lat1, double lon1, double lat2, double lon2, int unit
 std::string remove_bom(std::string str);

 int get_next_utf8_code_point_len(const uint8_t* pStr);
-void get_string_words(const std::string& str, string_vec& words, uint_vec* pOffsets_vec);
-void get_utf8_code_point_offsets(const char* pStr, int_vec& offsets);
+void get_string_words(const std::string& str, string_vec& words, uint_vec* pOffsets_vec, const char *pAdditional_whitespace = nullptr);
+void get_utf8_code_point_offsets(const char* pStr, int_vec& offsets);
+
+void init_norm();
+void normalize_diacritics(const char* pStr, std::string& res);
+std::string normalize_word(const std::string& str);
+bool is_stop_word(const std::string& word);
+
+std::string ustrlwr(const std::string& s);
+
+std::string string_replace(const std::string& str, const std::string& find, const std::string& repl);
+
+bool does_file_exist(const char* pFilename);