updated files

This commit is contained in:
Rich Geldreich 2023-10-05 14:07:39 -04:00
parent 43d0d882ef
commit a70cef4dde
12 changed files with 4072 additions and 834 deletions

File diff suppressed because it is too large Load Diff

View File

@ -5,11 +5,14 @@
void converters_init(); void converters_init();
bool convert_magnonia(const char* pSrc_filename, const char* pDst_filename, const char* pSource_override = nullptr, const char* pRef_override = nullptr); bool convert_magonia(const char* pSrc_filename, const char* pDst_filename, const char* pSource_override = nullptr, const char* pRef_override = nullptr, uint32_t TOTAL_COLS = 15, const char *pType_override = nullptr, bool parens_flag = true, uint32_t first_rec_index = 1);
bool convert_dolan(const char* pSrc_filename, const char* pDst_filename, const char* pSource, const char* pType, const char* pRef);
bool convert_bluebook_unknowns(); bool convert_bluebook_unknowns();
bool convert_hall(); bool convert_hall();
bool convert_eberhart(unordered_string_set& unique_urls); bool convert_eberhart(unordered_string_set& unique_urls);
bool convert_johnson(); bool convert_johnson();
bool convert_nicap(unordered_string_set& unique_urls); bool convert_nicap(unordered_string_set& unique_urls);
bool convert_nuk(); bool convert_nuk();
bool convert_anon(); bool convert_anon();
bool convert_rr0();
bool convert_overmeire();

View File

@ -2,6 +2,435 @@
// markdown_proc.cpp // markdown_proc.cpp
#include "markdown_proc.h" #include "markdown_proc.h"
// Callback set handed to libsoldout's markdown() parser. Instead of rendering HTML, the callbacks
// serialize the parsed document into a compact binary record stream in the output buf.
//
// Every record starts with the byte cCodeSig followed by a record tag. Lengths are 32-bit
// little-endian values (see writelen()):
//   cCodeLink      <link len> <content len> <link bytes> <content bytes>
//   cCodeEmphasis  <marker char> <level 1..3> <len> <bytes>
//   cCodeText      <len> <bytes>
//   cCodeParagraph <len> <bytes>
//   cCodeLinebreak (no payload)
//   cCodeHTML      <len> <bytes>
//
// When the `opaque` argument is non-null it is treated as a bool* and set to true by the callbacks
// that drop or only approximate a Markdown construct.
struct markdown
{
    enum
    {
        cCodeSig = 0xFE,
        cCodeLink = 1,
        cCodeEmphasis,
        cCodeText,
        cCodeParagraph,
        cCodeLinebreak,
        cCodeHTML
    };

    // Appends the contents of `in` to `out`. A null or empty `in` is a no-op.
    static void bufappend(struct buf* out, struct buf* in)
    {
        assert(in != out);
        if (in && in->size)
            bufput(out, in->data, in->size);
    }

    // Writes `size` as four bytes, least significant byte first.
    static void writelen(struct buf* ob, uint32_t size)
    {
        bufputc(ob, (uint8_t)(size & 0xFF));
        bufputc(ob, (uint8_t)((size >> 8) & 0xFF));
        bufputc(ob, (uint8_t)((size >> 16) & 0xFF));
        bufputc(ob, (uint8_t)((size >> 24) & 0xFF));
    }

    // Returns the `text_size` bytes of `buf` starting at `cur_ofs` and advances `cur_ofs` past them.
    // Calls panic() if the requested range does not fit inside `buf`.
    static std::string get_string(const std::string& buf, uint32_t& cur_ofs, uint32_t text_size)
    {
        // Compare against the remaining byte count: cur_ofs + text_size is a 32-bit sum and could
        // wrap around, letting an out-of-range request slip past the check.
        if ((cur_ofs > buf.size()) || (text_size > (buf.size() - cur_ofs)))
            panic("Buffer too small");

        std::string text(buf, cur_ofs, text_size);
        cur_ofs += text_size;
        return text;
    }

    // Reads the 32-bit little-endian value at `ofs` (the inverse of writelen()) and advances `ofs` by 4.
    // Calls panic() if fewer than four bytes remain.
    static uint32_t get_len32(const std::string& buf, uint32_t& ofs)
    {
        // Same wrap-safe form as get_string(): ofs + 4 could overflow 32 bits.
        if ((ofs > buf.size()) || ((buf.size() - ofs) < 4))
            panic("Buffer too small");

        // Widen to uint32_t before shifting so the top byte is never shifted into the sign bit of an int.
        const uint32_t l = (uint32_t)(uint8_t)buf[ofs] |
            ((uint32_t)(uint8_t)buf[ofs + 1] << 8) |
            ((uint32_t)(uint8_t)buf[ofs + 2] << 16) |
            ((uint32_t)(uint8_t)buf[ofs + 3] << 24);
        ofs += 4;
        return l;
    }

    // Sets the caller supplied "unsupported feature" flag, if one was provided.
    static void flag_unsupported(void* opaque)
    {
        if (opaque)
            *(bool*)opaque = true;
    }

    // Writes one length-prefixed record: cCodeSig, `code`, 32-bit length, payload.
    // A null `text` produces a record with a zero length and no payload.
    static void put_record(struct buf* ob, uint8_t code, struct buf* text)
    {
        bufputc(ob, (uint8_t)cCodeSig);
        bufputc(ob, code);
        writelen(ob, text ? (uint32_t)text->size : 0);
        bufappend(ob, text);
    }

    // Shared body of the three emphasis callbacks. `c` is the marker character and `level` is 1, 2 or 3.
    // Empty text produces no record. Always returns 1.
    static int put_emphasis(struct buf* ob, struct buf* text, char c, int level)
    {
        if (!text || !text->size)
            return 1;

        bufputc(ob, (uint8_t)cCodeSig);
        bufputc(ob, (uint8_t)cCodeEmphasis);
        bufputc(ob, c);
        bufputc(ob, (uint8_t)level);
        writelen(ob, (uint32_t)text->size);
        bufappend(ob, text);
        return 1;
    }

    // Document start/end hooks: nothing is emitted.
    static void prolog(struct buf* ob, void* opaque)
    {
    }

    static void epilog(struct buf* ob, void* opaque)
    {
    }

    /* block level callbacks - NULL skips the block */

    // Code blocks are not supported at all.
    static void blockcode(struct buf* ob, struct buf* text, void* opaque)
    {
        panic("unsupported markdown feature");
    }

    // TODO: unsupported block quotes (here for when we're converting to plain text)
    // Flags the feature as unsupported and emits the quoted text as a paragraph record.
    static void blockquote(struct buf* ob, struct buf* text, void* opaque)
    {
        flag_unsupported(opaque);
        if (!text || !text->size)
            return;
        put_record(ob, (uint8_t)cCodeParagraph, text);
    }

    // TODO: Not fully supported - just dropping it
    static void blockhtml(struct buf* ob, struct buf* text, void* opaque)
    {
        flag_unsupported(opaque);
    }

    // TODO: Not fully supported
    // Flags the feature as unsupported and emits the header text as a paragraph record; `level` is discarded.
    // Null/empty text emits nothing, matching paragraph() and the other block callbacks.
    static void header(struct buf* ob, struct buf* text, int level, void* opaque)
    {
        flag_unsupported(opaque);
        if (!text || !text->size)
            return;
        put_record(ob, (uint8_t)cCodeParagraph, text);
    }

    // TODO
    static void hrule(struct buf* ob, void* opaque)
    {
        flag_unsupported(opaque);
    }

    // TODO: not fully supporting lists (here for when we're converting to plain text)
    static void list(struct buf* ob, struct buf* text, int flags, void* opaque)
    {
        flag_unsupported(opaque);
        if (!text || !text->size)
            return;
        put_record(ob, (uint8_t)cCodeParagraph, text);
    }

    // TODO: not fully supporting lists (here for when we're converting to plain text)
    static void listitem(struct buf* ob, struct buf* text, int flags, void* opaque)
    {
        flag_unsupported(opaque);
        if (!text || !text->size)
            return;
        put_record(ob, (uint8_t)cCodeParagraph, text);
    }

    // Emits a paragraph record; empty paragraphs are skipped.
    static void paragraph(struct buf* ob, struct buf* text, void* opaque)
    {
        if (!text || !text->size)
            return;
        put_record(ob, (uint8_t)cCodeParagraph, text);
    }

    // TODO: not fully supported, just for plaintext conversion
    static void table(struct buf* ob, struct buf* head_row, struct buf* rows, void* opaque)
    {
        flag_unsupported(opaque);
    }

    // TODO: not fully supported, just for plaintext conversion
    // Each non-empty cell is emitted as its own paragraph record.
    static void table_cell(struct buf* ob, struct buf* text, int flags, void* opaque)
    {
        flag_unsupported(opaque);
        if (!text || !text->size)
            return;
        put_record(ob, (uint8_t)cCodeParagraph, text);
    }

    // TODO: not fully supported, just for plaintext conversion
    static void table_row(struct buf* ob, struct buf* cells, int flags, void* opaque)
    {
        flag_unsupported(opaque);
    }

    /* span level callbacks */

    // Autolinks are not supported at all.
    static int autolink(struct buf* ob, struct buf* link, enum mkd_autolink type, void* opaque)
    {
        panic("unsupported markdown feature");
        return 1;
    }

    // Flags the feature as unsupported and emits the span's contents as a plain text record.
    // A null or empty span emits nothing (previously a null `text` was dereferenced).
    static int codespan(struct buf* ob, struct buf* text, void* opaque)
    {
        flag_unsupported(opaque);
        if (text && text->size)
            put_record(ob, (uint8_t)cCodeText, text);
        return 1;
    }

    static int double_emphasis(struct buf* ob, struct buf* text, char c, void* opaque)
    {
        return put_emphasis(ob, text, c, 2);
    }

    static int emphasis(struct buf* ob, struct buf* text, char c, void* opaque)
    {
        return put_emphasis(ob, text, c, 1);
    }

    // Flags the feature as unsupported; only the alt text survives, as a text record.
    static int image(struct buf* ob, struct buf* link, struct buf* title, struct buf* alt, void* opaque)
    {
        flag_unsupported(opaque);
        if (alt)
            put_record(ob, (uint8_t)cCodeText, alt);
        return 1;
    }

    static int linebreak(struct buf* ob, void* opaque)
    {
        bufputc(ob, (uint8_t)cCodeSig);
        bufputc(ob, (uint8_t)cCodeLinebreak);
        return 1;
    }

    // Emits a link record: both lengths first, then the link bytes, then the content bytes.
    // `title` is discarded. A null `link` or `content` is written as a zero length part
    // (previously either one being null was dereferenced).
    static int link(struct buf* ob, struct buf* link, struct buf* title, struct buf* content, void* opaque)
    {
        bufputc(ob, (uint8_t)cCodeSig);
        bufputc(ob, (uint8_t)cCodeLink);
        writelen(ob, link ? (uint32_t)link->size : 0);
        writelen(ob, content ? (uint32_t)content->size : 0);
        bufappend(ob, link);
        bufappend(ob, content);
        return 1;
    }

    // Emits the raw tag bytes as an HTML record; empty tags are skipped.
    static int raw_html_tag(struct buf* ob, struct buf* tag, void* opaque)
    {
        if (!tag || !tag->size)
            return 1;
        put_record(ob, (uint8_t)cCodeHTML, tag);
        return 1;
    }

    static int triple_emphasis(struct buf* ob, struct buf* text, char c, void* opaque)
    {
        return put_emphasis(ob, text, c, 3);
    }

    /* low level callbacks */

    // Emits a text record. Newlines become spaces and bytes equal to 1 are dropped.
    static void normal_text(struct buf* ob, struct buf* text, void* opaque)
    {
        if (!text || !text->size)
            return;

        // The record length must describe the bytes actually written, so count what survives the
        // filter below first. Writing text->size here would desynchronize the stream whenever a
        // byte is dropped.
        uint32_t num_kept = 0;
        for (size_t i = 0; i < text->size; i++)
        {
            if ((uint8_t)text->data[i] != 1)
                num_kept++;
        }
        if (!num_kept)
            return;

        bufputc(ob, (uint8_t)cCodeSig);
        bufputc(ob, (uint8_t)cCodeText);
        writelen(ob, num_kept);

        for (size_t i = 0; i < text->size; i++)
        {
            const uint8_t c = (uint8_t)text->data[i];
            if (c == '\n')
                bufputc(ob, ' ');
            else if (c != 1)
            {
                assert(c >= 32 || c == '\t');
                bufputc(ob, c);
            }
        }
    }
};
// Renderer table passed to libsoldout's markdown(): every slot is wired to the matching
// callback in struct markdown. The table is deliberately non-const because
// init_from_markdown() stores a pointer to the current processor's
// m_used_unsupported_feature flag in g_mkd_parse.opaque before each parse.
// NOTE(review): that makes parsing through this shared table non-reentrant — confirm no
// caller parses from multiple threads.
static struct mkd_renderer g_mkd_parse =
{
markdown::prolog,
markdown::epilog,
markdown::blockcode,
markdown::blockquote,
markdown::blockhtml,
markdown::header,
markdown::hrule,
markdown::list,
markdown::listitem,
markdown::paragraph,
markdown::table,
markdown::table_cell,
markdown::table_row,
markdown::autolink,
markdown::codespan,
markdown::double_emphasis,
markdown::emphasis,
markdown::image,
markdown::linebreak,
markdown::link,
markdown::raw_html_tag,
markdown::triple_emphasis,
// No entity callback is installed.
//markdown::entity,
nullptr,
markdown::normal_text,
// Presumably the parser's maximum nesting/work-stack depth — TODO confirm against libsoldout/markdown.h.
64,
// Presumably the characters that trigger the emphasis callbacks — TODO confirm against libsoldout/markdown.h.
"*_",
// Presumably the `opaque` member; it is overwritten per parse by init_from_markdown().
nullptr
};
static bool markdown_should_escape(int c) static bool markdown_should_escape(int c)
{ {
switch (c) switch (c)
@ -48,12 +477,14 @@ static std::string escape_markdown(const std::string& str)
return out; return out;
} }
markdown_text_processor::markdown_text_processor() markdown_text_processor::markdown_text_processor() :
m_used_unsupported_feature(false)
{ {
} }
void markdown_text_processor::clear() void markdown_text_processor::clear()
{ {
m_used_unsupported_feature = false;
m_text.clear(); m_text.clear();
m_details.clear(); m_details.clear();
m_links.clear(); m_links.clear();
@ -126,7 +557,10 @@ void markdown_text_processor::init_from_markdown(const char* pText)
bufputs(pIn, pText); bufputs(pIn, pText);
struct buf* pOut = bufnew(4096); struct buf* pOut = bufnew(4096);
markdown(pOut, pIn, &mkd_parse);
m_used_unsupported_feature = false;
g_mkd_parse.opaque = &m_used_unsupported_feature;
markdown(pOut, pIn, &g_mkd_parse);
std::string buf; std::string buf;
buf.append((char*)pOut->data, pOut->size); buf.append((char*)pOut->data, pOut->size);
@ -243,6 +677,9 @@ void markdown_text_processor::convert_to_plain(std::string& out, bool trim_end)
void markdown_text_processor::convert_to_markdown(std::string& out, bool trim_end) const void markdown_text_processor::convert_to_markdown(std::string& out, bool trim_end) const
{ {
if (m_used_unsupported_feature)
printf("markdown_text_processor::convert_to_markdown: Warning, one or more Markdown features were used in this text and won't be losslessly converted.\n");
int emphasis = 0, emphasis_amount = 0; int emphasis = 0, emphasis_amount = 0;
int cur_link_index = -1; int cur_link_index = -1;
@ -606,3 +1043,66 @@ void markdown_text_processor::handle_emphasis(std::string& out, uint32_t text_of
// out += m_details[text_ofs].m_html[i]; // out += m_details[text_ofs].m_html[i];
} }
} }
#if 0
// Disabled ad-hoc smoke test: feeds a sample containing inline HTML, emphasis, an image, headers,
// lists, a table, links, code and block quotes through init_from_markdown() and prints the
// convert_to_plain() result.
// NOTE(review): the trailing `return 0;` suggests this was lifted from a function body; it would
// need a wrapper function to compile if re-enabled here — confirm before turning it back on.
const char* pText =
u8R"(
<ul>test</ul>
_text1_
**text2**
**_text3_**
![alt text](https://github.com/n48.png "Logo Title")
# Heading 1
## Heading 2
### Heading 3
1. XXXXX
1. Item 1
2. Item 2
2. YYYYY
3. ZZZZZ
| Tables | Are | Cool |
| ------------- |:-------------:| -----:|
| col 3 is | right-aligned | $1600 |
| col 2 is | centered | $12 |
| zebra stripes | are neat | $1 |
* [blahblah](www.blah1.com)
* [blahblah2](www.blah2.com)
`
this is code 1
this is code 2
`
```
this is code 3
this is code 4
```
> blockquote 1
> blockquote 2
---
* AAA
* BBB
* ZZZZ1
* ZZZZ2
* CCC)";
markdown_text_processor tp;
tp.init_from_markdown(pText);
std::string desc;
tp.convert_to_plain(desc, true);
uprintf("%s\n", desc.c_str());
return 0;
#endif

View File

@ -6,386 +6,6 @@
#include "libsoldout/markdown.h" #include "libsoldout/markdown.h"
// Callback set handed to libsoldout's markdown() parser. Instead of rendering HTML, the callbacks
// serialize the parsed document into a compact binary record stream in the output buf.
//
// Every record starts with the byte cCodeSig followed by a record tag. Lengths are 32-bit
// little-endian values (see writelen()):
//   cCodeLink      <link len> <content len> <link bytes> <content bytes>
//   cCodeEmphasis  <marker char> <level 1..3> <len> <bytes>
//   cCodeText      <len> <bytes>
//   cCodeParagraph <len> <bytes>
//   cCodeLinebreak (no payload)
//   cCodeHTML      <len> <bytes>
//
// Constructs with no representation in this stream call panic(). The `opaque` argument is unused.
struct markdown
{
    enum
    {
        cCodeSig = 0xFE,
        cCodeLink = 1,
        cCodeEmphasis,
        cCodeText,
        cCodeParagraph,
        cCodeLinebreak,
        cCodeHTML
    };

    // Appends the contents of `in` to `out`. A null or empty `in` is a no-op.
    static void bufappend(struct buf* out, struct buf* in)
    {
        assert(in != out);
        if (in && in->size)
            bufput(out, in->data, in->size);
    }

    // Writes `size` as four bytes, least significant byte first.
    static void writelen(struct buf* ob, uint32_t size)
    {
        bufputc(ob, (uint8_t)(size & 0xFF));
        bufputc(ob, (uint8_t)((size >> 8) & 0xFF));
        bufputc(ob, (uint8_t)((size >> 16) & 0xFF));
        bufputc(ob, (uint8_t)((size >> 24) & 0xFF));
    }

    // Returns the `text_size` bytes of `buf` starting at `cur_ofs` and advances `cur_ofs` past them.
    // Calls panic() if the requested range does not fit inside `buf`.
    static std::string get_string(const std::string& buf, uint32_t& cur_ofs, uint32_t text_size)
    {
        // Compare against the remaining byte count: cur_ofs + text_size is a 32-bit sum and could
        // wrap around, letting an out-of-range request slip past the check.
        if ((cur_ofs > buf.size()) || (text_size > (buf.size() - cur_ofs)))
            panic("Buffer too small");

        std::string text(buf, cur_ofs, text_size);
        cur_ofs += text_size;
        return text;
    }

    // Reads the 32-bit little-endian value at `ofs` (the inverse of writelen()) and advances `ofs` by 4.
    // Calls panic() if fewer than four bytes remain.
    static uint32_t get_len32(const std::string& buf, uint32_t& ofs)
    {
        // Same wrap-safe form as get_string(): ofs + 4 could overflow 32 bits.
        if ((ofs > buf.size()) || ((buf.size() - ofs) < 4))
            panic("Buffer too small");

        // Widen to uint32_t before shifting so the top byte is never shifted into the sign bit of an int.
        const uint32_t l = (uint32_t)(uint8_t)buf[ofs] |
            ((uint32_t)(uint8_t)buf[ofs + 1] << 8) |
            ((uint32_t)(uint8_t)buf[ofs + 2] << 16) |
            ((uint32_t)(uint8_t)buf[ofs + 3] << 24);
        ofs += 4;
        return l;
    }

    // Writes one length-prefixed record: cCodeSig, `code`, 32-bit length, payload.
    // A null `text` produces a record with a zero length and no payload.
    static void put_record(struct buf* ob, uint8_t code, struct buf* text)
    {
        bufputc(ob, (uint8_t)cCodeSig);
        bufputc(ob, code);
        writelen(ob, text ? (uint32_t)text->size : 0);
        bufappend(ob, text);
    }

    // Shared body of the three emphasis callbacks. `c` is the marker character and `level` is 1, 2 or 3.
    // Empty text produces no record. Always returns 1.
    static int put_emphasis(struct buf* ob, struct buf* text, char c, int level)
    {
        if (!text || !text->size)
            return 1;

        bufputc(ob, (uint8_t)cCodeSig);
        bufputc(ob, (uint8_t)cCodeEmphasis);
        bufputc(ob, c);
        bufputc(ob, (uint8_t)level);
        writelen(ob, (uint32_t)text->size);
        bufappend(ob, text);
        return 1;
    }

    // Document start/end hooks: nothing is emitted.
    static void prolog(struct buf* ob, void* opaque)
    {
    }

    static void epilog(struct buf* ob, void* opaque)
    {
    }

    /* block level callbacks - NULL skips the block */

    static void blockcode(struct buf* ob, struct buf* text, void* opaque)
    {
        panic("unsupported markdown feature");
    }

    // TODO: unsupported block quotes (here for when we're converting to plain text)
    // The quoted text is emitted as a paragraph record.
    static void blockquote(struct buf* ob, struct buf* text, void* opaque)
    {
        if (!text || !text->size)
            return;
        put_record(ob, (uint8_t)cCodeParagraph, text);
    }

    static void blockhtml(struct buf* ob, struct buf* text, void* opaque)
    {
        panic("unsupported markdown feature");
    }

    static void header(struct buf* ob, struct buf* text, int level, void* opaque)
    {
        panic("unsupported markdown feature");
    }

    static void hrule(struct buf* ob, void* opaque)
    {
        panic("unsupported markdown feature");
    }

    // TODO: not fully supporting lists (here for when we're converting to plain text)
    static void list(struct buf* ob, struct buf* text, int flags, void* opaque)
    {
        if (!text || !text->size)
            return;
        put_record(ob, (uint8_t)cCodeParagraph, text);
    }

    // TODO: not fully supporting lists (here for when we're converting to plain text)
    static void listitem(struct buf* ob, struct buf* text, int flags, void* opaque)
    {
        if (!text || !text->size)
            return;
        put_record(ob, (uint8_t)cCodeParagraph, text);
    }

    // Emits a paragraph record; empty paragraphs are skipped.
    static void paragraph(struct buf* ob, struct buf* text, void* opaque)
    {
        if (!text || !text->size)
            return;
        put_record(ob, (uint8_t)cCodeParagraph, text);
    }

    // TODO: not fully supported, just for plaintext conversion
    static void table(struct buf* ob, struct buf* head_row, struct buf* rows, void* opaque)
    {
    }

    // TODO: not fully supported, just for plaintext conversion
    // Each non-empty cell is emitted as its own paragraph record.
    static void table_cell(struct buf* ob, struct buf* text, int flags, void* opaque)
    {
        if (!text || !text->size)
            return;
        put_record(ob, (uint8_t)cCodeParagraph, text);
    }

    // TODO: not fully supported, just for plaintext conversion
    static void table_row(struct buf* ob, struct buf* cells, int flags, void* opaque)
    {
    }

    /* span level callbacks */

    static int autolink(struct buf* ob, struct buf* link, enum mkd_autolink type, void* opaque)
    {
        panic("unsupported markdown feature");
        return 1;
    }

    static int codespan(struct buf* ob, struct buf* text, void* opaque)
    {
        panic("unsupported markdown feature");
        return 1;
    }

    static int double_emphasis(struct buf* ob, struct buf* text, char c, void* opaque)
    {
        return put_emphasis(ob, text, c, 2);
    }

    static int emphasis(struct buf* ob, struct buf* text, char c, void* opaque)
    {
        return put_emphasis(ob, text, c, 1);
    }

    static int image(struct buf* ob, struct buf* link, struct buf* title, struct buf* alt, void* opaque)
    {
        panic("unsupported markdown feature");
        return 1;
    }

    static int linebreak(struct buf* ob, void* opaque)
    {
        bufputc(ob, (uint8_t)cCodeSig);
        bufputc(ob, (uint8_t)cCodeLinebreak);
        return 1;
    }

    // Emits a link record: both lengths first, then the link bytes, then the content bytes.
    // `title` is discarded. A null `link` or `content` is written as a zero length part
    // (previously either one being null was dereferenced).
    static int link(struct buf* ob, struct buf* link, struct buf* title, struct buf* content, void* opaque)
    {
        bufputc(ob, (uint8_t)cCodeSig);
        bufputc(ob, (uint8_t)cCodeLink);
        writelen(ob, link ? (uint32_t)link->size : 0);
        writelen(ob, content ? (uint32_t)content->size : 0);
        bufappend(ob, link);
        bufappend(ob, content);
        return 1;
    }

    // Emits the raw tag bytes as an HTML record; empty tags are skipped.
    static int raw_html_tag(struct buf* ob, struct buf* tag, void* opaque)
    {
        if (!tag || !tag->size)
            return 1;
        put_record(ob, (uint8_t)cCodeHTML, tag);
        return 1;
    }

    static int triple_emphasis(struct buf* ob, struct buf* text, char c, void* opaque)
    {
        return put_emphasis(ob, text, c, 3);
    }

    /* low level callbacks */

    // Emits a text record. Newlines become spaces and bytes equal to 1 are dropped.
    static void normal_text(struct buf* ob, struct buf* text, void* opaque)
    {
        if (!text || !text->size)
            return;

        // The record length must describe the bytes actually written, so count what survives the
        // filter below first. Writing text->size here would desynchronize the stream whenever a
        // byte is dropped.
        uint32_t num_kept = 0;
        for (size_t i = 0; i < text->size; i++)
        {
            if ((uint8_t)text->data[i] != 1)
                num_kept++;
        }
        if (!num_kept)
            return;

        bufputc(ob, (uint8_t)cCodeSig);
        bufputc(ob, (uint8_t)cCodeText);
        writelen(ob, num_kept);

        for (size_t i = 0; i < text->size; i++)
        {
            const uint8_t c = (uint8_t)text->data[i];
            if (c == '\n')
                bufputc(ob, ' ');
            else if (c != 1)
            {
                assert(c >= 32 || c == '\t');
                bufputc(ob, c);
            }
        }
    }
};
// Renderer table passed to libsoldout's markdown(): every slot is wired to the matching
// callback in struct markdown.
const struct mkd_renderer mkd_parse =
{
markdown::prolog,
markdown::epilog,
markdown::blockcode,
markdown::blockquote,
markdown::blockhtml,
markdown::header,
markdown::hrule,
markdown::list,
markdown::listitem,
markdown::paragraph,
markdown::table,
markdown::table_cell,
markdown::table_row,
markdown::autolink,
markdown::codespan,
markdown::double_emphasis,
markdown::emphasis,
markdown::image,
markdown::linebreak,
markdown::link,
markdown::raw_html_tag,
markdown::triple_emphasis,
// No entity callback is installed.
//markdown::entity,
nullptr,
markdown::normal_text,
// Presumably the parser's maximum nesting/work-stack depth — TODO confirm against libsoldout/markdown.h.
64,
// Presumably the characters that trigger the emphasis callbacks — TODO confirm against libsoldout/markdown.h.
"*_",
// Presumably the `opaque` pointer handed to every callback; none of the callbacks above read it.
nullptr
};
class markdown_text_processor class markdown_text_processor
{ {
public: public:
@ -406,17 +26,23 @@ public:
std::string m_text; std::string m_text;
std::vector<detail> m_details; std::vector<detail> m_details;
string_vec m_links; string_vec m_links;
bool m_used_unsupported_feature;
markdown_text_processor(); markdown_text_processor();
void clear(); void clear();
void fix_redirect_urls(); void fix_redirect_urls();
// Note: a "\n" escape sequence in the input escapes the letter 'n'; it does not produce a line break.
void init_from_markdown(const char* pText); void init_from_markdown(const char* pText);
bool split_in_half(uint32_t ofs, markdown_text_processor& a, markdown_text_processor& b) const; bool split_in_half(uint32_t ofs, markdown_text_processor& a, markdown_text_processor& b) const;
uint32_t count_char_in_text(uint8_t c) const; uint32_t count_char_in_text(uint8_t c) const;
bool split_last_parens(markdown_text_processor& a, markdown_text_processor& b) const; bool split_last_parens(markdown_text_processor& a, markdown_text_processor& b) const;
void convert_to_plain(std::string& out, bool trim_end) const; void convert_to_plain(std::string& out, bool trim_end) const;
// Warning: Only a few core features are supported. If after parsing m_used_unsupported_feature is true, then this will not be lossless.
void convert_to_markdown(std::string& out, bool trim_end) const; void convert_to_markdown(std::string& out, bool trim_end) const;
private: private:

File diff suppressed because it is too large Load Diff

View File

@ -78,8 +78,10 @@
<ClCompile Include="libsoldout\markdown.c" /> <ClCompile Include="libsoldout\markdown.c" />
<ClCompile Include="libsoldout\renderers.c" /> <ClCompile Include="libsoldout\renderers.c" />
<ClCompile Include="markdown_proc.cpp" /> <ClCompile Include="markdown_proc.cpp" />
<ClCompile Include="stem.c" />
<ClCompile Include="udb.cpp" /> <ClCompile Include="udb.cpp" />
<ClInclude Include="converters.h" /> <ClInclude Include="converters.h" />
<ClInclude Include="stem.h" />
<ClInclude Include="udb_tables.h" /> <ClInclude Include="udb_tables.h" />
<ClCompile Include="ufojson.cpp"> <ClCompile Include="ufojson.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Level4</WarningLevel> <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Level4</WarningLevel>
@ -97,6 +99,7 @@
<ClInclude Include="resource.h" /> <ClInclude Include="resource.h" />
<ClInclude Include="udb.h" /> <ClInclude Include="udb.h" />
<ClInclude Include="ufojson_core.h" /> <ClInclude Include="ufojson_core.h" />
<ClInclude Include="utf8.h" />
<ClInclude Include="utils.h" /> <ClInclude Include="utils.h" />
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />

View File

@ -48,6 +48,9 @@
<ClCompile Include="converters.cpp"> <ClCompile Include="converters.cpp">
<Filter>Source Files</Filter> <Filter>Source Files</Filter>
</ClCompile> </ClCompile>
<ClCompile Include="stem.c">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup> </ItemGroup>
<ItemGroup> <ItemGroup>
<ClInclude Include="libsoldout\array.h"> <ClInclude Include="libsoldout\array.h">
@ -83,5 +86,11 @@
<ClInclude Include="converters.h"> <ClInclude Include="converters.h">
<Filter>Source Files</Filter> <Filter>Source Files</Filter>
</ClInclude> </ClInclude>
<ClInclude Include="utf8.h">
<Filter>Source Files</Filter>
</ClInclude>
<ClInclude Include="stem.h">
<Filter>Source Files</Filter>
</ClInclude>
</ItemGroup> </ItemGroup>
</Project> </Project>

View File

@ -3,13 +3,11 @@
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LocalDebuggerWorkingDirectory>bin</LocalDebuggerWorkingDirectory> <LocalDebuggerWorkingDirectory>bin</LocalDebuggerWorkingDirectory>
<DebuggerFlavor>WindowsLocalDebugger</DebuggerFlavor> <DebuggerFlavor>WindowsLocalDebugger</DebuggerFlavor>
<LocalDebuggerCommandArguments> <LocalDebuggerCommandArguments>-convert</LocalDebuggerCommandArguments>
</LocalDebuggerCommandArguments>
</PropertyGroup> </PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LocalDebuggerWorkingDirectory>bin</LocalDebuggerWorkingDirectory> <LocalDebuggerWorkingDirectory>bin</LocalDebuggerWorkingDirectory>
<DebuggerFlavor>WindowsLocalDebugger</DebuggerFlavor> <DebuggerFlavor>WindowsLocalDebugger</DebuggerFlavor>
<LocalDebuggerCommandArguments> <LocalDebuggerCommandArguments>-convert</LocalDebuggerCommandArguments>
</LocalDebuggerCommandArguments>
</PropertyGroup> </PropertyGroup>
</Project> </Project>

File diff suppressed because it is too large Load Diff

View File

@ -6,6 +6,7 @@
// Note that May ends in a period. // Note that May ends in a period.
extern const char* g_months[12]; extern const char* g_months[12];
extern const char* g_full_months[12]; extern const char* g_full_months[12];
extern const char* g_day_of_week[7];
const uint32_t NUM_DATE_PREFIX_STRINGS = 24; const uint32_t NUM_DATE_PREFIX_STRINGS = 24;
extern const char* g_date_prefix_strings[NUM_DATE_PREFIX_STRINGS]; extern const char* g_date_prefix_strings[NUM_DATE_PREFIX_STRINGS];
@ -47,7 +48,9 @@ enum date_prefix_t
}; };
bool is_season(date_prefix_t prefix); bool is_season(date_prefix_t prefix);
int determine_month(const std::string& date); int determine_month(const std::string& date, bool begins_with = true);
int determine_prefix(const std::string& date, bool begins_with = true);
int determine_day_of_week(const std::string& date, bool begins_with = true);
struct event_date struct event_date
{ {
@ -108,19 +111,19 @@ private:
struct timeline_event struct timeline_event
{ {
std::string m_date_str; std::string m_date_str;
std::string m_time_str; // military std::string m_time_str; // military, but currently it's in any format (not parsed yet)
std::string m_alt_date_str; std::string m_alt_date_str;
std::string m_end_date_str; std::string m_end_date_str;
event_date m_begin_date; event_date m_begin_date;
event_date m_end_date; event_date m_end_date;
event_date m_alt_date; event_date m_alt_date;
std::string m_desc; std::string m_desc; // Markdown
string_vec m_type; string_vec m_type;
string_vec m_refs; string_vec m_refs; // Markdown
string_vec m_locations; string_vec m_locations;
string_vec m_attributes; string_vec m_attributes;
string_vec m_see_also; string_vec m_see_also;
@ -138,11 +141,13 @@ struct timeline_event
std::string m_source; std::string m_source;
std::vector<string_pair> m_key_value_data; std::vector<string_pair> m_key_value_data;
std::string m_plain_desc; // Computed, ignored for comparison purposes, not deserialized from JSON
string_vec m_plain_refs; // Computed, ignored for comparison purposes, not deserialized from JSON
std::string m_search_words; // Computed, ignored for comparison purposes, not deserialized from JSON
bool operator==(const timeline_event& rhs) const; bool operator==(const timeline_event& rhs) const;
bool operator!=(const timeline_event& rhs) const; bool operator!=(const timeline_event& rhs) const;
bool operator< (const timeline_event& rhs) const; bool operator< (const timeline_event& rhs) const;
void print(FILE* pFile) const; void print(FILE* pFile) const;
@ -156,6 +161,15 @@ struct timeline_event
typedef std::vector<timeline_event> timeline_event_vec; typedef std::vector<timeline_event> timeline_event_vec;
bool date_filter_single(
int start_month, int start_day, int start_year,
const event_date& evt_b, const event_date& evt_e);
bool date_filter_range(
int start_month, int start_day, int start_year,
int end_month, int end_day, int end_year,
const event_date& evt_b, const event_date& evt_e);
const uint32_t NUM_KWIC_FILE_STRINGS = 28; const uint32_t NUM_KWIC_FILE_STRINGS = 28;
static inline std::string get_kwic_index_name(uint32_t i) static inline std::string get_kwic_index_name(uint32_t i)
@ -215,6 +229,8 @@ public:
} }
} }
void create_plaintext();
bool write_file(const char* pFilename, bool utf8_bom = true) bool write_file(const char* pFilename, bool utf8_bom = true)
{ {
json j; json j;

420
utils.cpp
View File

@ -1,6 +1,8 @@
// utils.cpp // utils.cpp
// Copyright (C) 2023 Richard Geldreich, Jr. // Copyright (C) 2023 Richard Geldreich, Jr.
#include "utils.h" #include "utils.h"
#include "utf8.h"
#include "stem.h"
std::string combine_strings(std::string a, const std::string& b) std::string combine_strings(std::string a, const std::string& b)
{ {
@ -265,7 +267,7 @@ int string_ifind_first(const std::string& str, const char* pPhrase)
for (size_t ofs = 0; ofs <= end_ofs; ofs++) for (size_t ofs = 0; ofs <= end_ofs; ofs++)
{ {
assert(ofs + phrase_size <= str_size); assert(ofs + phrase_size <= str_size);
if (_stricmp(str.c_str() + ofs, pPhrase) == 0) if (_strnicmp(str.c_str() + ofs, pPhrase, phrase_size) == 0)
return (int)ofs; return (int)ofs;
} }
@ -552,7 +554,7 @@ bool read_text_file(const char* pFilename, std::vector<uint8_t>& buf, bool *pUTF
return true; return true;
} }
bool write_text_file(const char* pFilename, string_vec& lines, bool utf8_bom) bool write_text_file(const char* pFilename, const string_vec& lines, bool utf8_bom)
{ {
FILE* pFile = ufopen(pFilename, "wb"); FILE* pFile = ufopen(pFilename, "wb");
if (!pFile) if (!pFile)
@ -984,6 +986,43 @@ bool invoke_openai(const std::string& prompt, std::string& reply)
return true; return true;
} }
// Sends a multi-line prompt to the external openai.exe helper and reads back its reply as lines.
//
// The prompt is written to "i.txt", then "openai.exe i.txt o.txt" is run (up to three attempts,
// two seconds apart), and finally "o.txt" is read into `reply`.
//
// prompt: lines to send.
// reply:  cleared on entry; receives the lines of the output file on success.
// Returns false if the prompt file can't be written, every attempt to run the helper fails, or
// the output file can't be read.
//
// Note: the input/output file names are fixed and relative to the current directory, so
// overlapping calls would share (and overwrite) the same two files.
bool invoke_openai(const string_vec &prompt, string_vec &reply)
{
    reply.clear();

    if (!write_text_file("i.txt", prompt, true))
        return false;

    // Invoke openai.exe
    const uint32_t MAX_TRIES = 3;
    bool succeeded = false;

    for (uint32_t attempt = 0; attempt < MAX_TRIES; ++attempt)
    {
        if (attempt)
        {
            // Back off only between attempts; there is no point in waiting after the final failure.
            Sleep(2000);
            uprintf("openai.exe failed - retrying\n");
        }

        if (system("openai.exe i.txt o.txt") == EXIT_SUCCESS)
        {
            succeeded = true;
            break;
        }
    }

    if (!succeeded)
        return false;

    // Read output file.
    if (!read_text_file("o.txt", reply, true, nullptr))
    {
        // Wait a bit and try again, rarely needed under Windows.
        Sleep(50);
        if (!read_text_file("o.txt", reply, true, nullptr))
            return false;
    }

    return true;
}
std::string get_deg_to_dms(double deg) std::string get_deg_to_dms(double deg)
{ {
deg = std::round(fabs(deg) * 3600.0f); deg = std::round(fabs(deg) * 3600.0f);
@ -1186,7 +1225,8 @@ int get_next_utf8_code_point_len(const uint8_t* pStr)
void get_string_words( void get_string_words(
const std::string& str, const std::string& str,
string_vec& words, string_vec& words,
uint_vec* pOffsets_vec) uint_vec* pOffsets_vec,
const char* pAdditional_whitespace)
{ {
const uint8_t* pStr = (const uint8_t *)str.c_str(); const uint8_t* pStr = (const uint8_t *)str.c_str();
@ -1196,7 +1236,9 @@ void get_string_words(
std::string cur_token; std::string cur_token;
const std::string whitespace(" \t\n\r,;:.!?()[]*/\""); std::string whitespace(" \t\n\r,;:.!?()[]*/\"");
if (pAdditional_whitespace)
whitespace += std::string(pAdditional_whitespace);
int word_start_ofs = -1; int word_start_ofs = -1;
@ -1234,7 +1276,7 @@ void get_string_words(
if (pStr[cur_ofs + 2] == 0x93) if (pStr[cur_ofs + 2] == 0x93)
is_whitespace = true; is_whitespace = true;
// dash // dash
if (pStr[cur_ofs + 2] == 0x94) else if (pStr[cur_ofs + 2] == 0x94)
is_whitespace = true; is_whitespace = true;
// left quote // left quote
else if (pStr[cur_ofs + 2] == 0x9C) else if (pStr[cur_ofs + 2] == 0x9C)
@ -1315,3 +1357,369 @@ void get_utf8_code_point_offsets(const char* pStr, int_vec& offsets)
cur_ofs += std::max<int>(1, get_next_utf8_code_point_len((const uint8_t*)pStr + cur_ofs)); cur_ofs += std::max<int>(1, get_next_utf8_code_point_len((const uint8_t*)pStr + cur_ofs));
} }
} }
// One row of a character normalization table: every code point listed in m_pFrom
// is translated to the single character m_to (see init_norm()).
struct char_map
{
// Null-terminated UTF-32 string of the source code points.
const char32_t* m_pFrom;
// Replacement character shared by all code points in m_pFrom.
const char m_to;
};
// Upper-case normalization table: each row lists accented/variant capital letters and the plain
// capital letter they are folded to. init_norm() loads these rows into g_upper_trans.
// A code point may repeat within a row; init_norm() only rejects a code point that is mapped
// to two different target characters.
static const char_map g_char_norm_up[] =
{
{ U"ÁĂẮẶẰẲẴǍÂẤẬẦẨẪÄǞȦǠẠȀÀẢȂĀĄÅǺḀÃǼǢȺΆ", 'A' },
{ U"ḂḄḆƁƂƄ", 'B' },
{ U"ĆČÇḈĈĊƇȻƆ", 'C' },
{ U"ĎḐḒḊḌḎĐƉƊƋDZDzDŽ", 'D' },
{ U"ÉĔĚȨḜÊẾỆỀỂỄḘËĖẸȄÈẺȆĒḖḔĘẼḚÈÊËĒĔĖĘĚƐƎƏȄȆȨΈΉΕƐƐ", 'E' },
{ U"ḞƑ", 'F' },
{ U"ǴĞǦĢĜĠḠĜĞĠĢƓǤǦǴƔ", 'G' },
{ U"ḪȞḨĤḦḢḤĤĦǶȞΗǶ", 'H' },
{ U"ÍĬǏÎÏḮİỊȈÌỈȊĪĮĨḬÌÍÎÏĨĪĬĮİƗǏȈȊ", 'I' },
{ U"ĴĴ", 'J' },
{ U"ḰǨĶḲḴĶƘǨΚ", 'K' },
{ U"ĹĽĻḼḶḸḺĹĻĽĿŁΛ", 'L' },
{ U"ḾṀṂƜ", 'M' },
{ U"ŃŇŅṊṄṆǸṈÑÑŃŅŇŊƝǸΝ", 'N' },
{ U"ÓŎǑÔỐỘỒỔỖÖȪȮȰỌŐȌÒỎƠỚỢỜỞỠȎŌṒṐǪǬÕṌṎȬǾØÒÓÔÕÖØŌŎŐƟƠǑǪǬǾȌȎȪȬȮȰΌΟΩ", 'O' },
{ U"ṔṖΠΡΦ", 'P' },
{ U"ŔŘŖṘṚṜȐȒṞŔŖŘƦȐȒ", 'R' },
{ U"ŚṤŠṦŞŜȘṠṢṨߌŜŞŠƩȘΣ", 'S' },
{ U"ŤŢṰȚṪṬṮŢŤŦƬƮȚΤ", 'T' },
{ U"ÚŬǓÛṶÜǗǙǛǕṲỤŰȔÙỦƯỨỰỪỬỮȖŪṺŲŮŨṸṴÙÚÛÜŨŪŬŮŰŲƯǓǕǗǙǛȔȖ", 'U' },
{ U"ṾṼƲ", 'V' },
{ U"ẂŴẄẆẈẀŴ", 'W' },
{ U"ẌẊΧΞ", 'X' },
{ U"ÝŶŸẎỴỲỶȲỸÝŶŸƳȲΥΎΫ", 'Y' },
{ U"ŹŽẐŻẒẔŹŻŽƵƷǮȤΖ", 'Z' },
};
// Lower-case normalization table: each row lists accented/variant small letters and the plain
// small letter they are folded to. init_norm() loads these rows into g_lower_trans.
// init_norm() prints a warning (but keeps going) for any code point here that is also present
// in the upper-case table, and exits if a code point is mapped to two different lower-case targets.
static const char_map g_char_norm_lower[] =
{
{ U"áăắặằẳẵǎâấậầẩẫäǟȧǡạȁàảȃāąåǻḁãǽǣⱥάàáâãäåāăąǎǟǡǻȁȃȧάα", 'a' },
{ U"ḃḅḇɓƃƅƀƃβƀƃƅ", 'b' },
{ U"ćčçḉĉċƈȼɔƈçćĉċčƈȼ", 'c' },
{ U"ďḑḓḋḍḏđɖɗƌdzdzdžƌďđƌdzdžȡďđƌdzdžȡ", 'd' },
{ U"éĕěȩḝêếệềểễḙëėẹȅèẻȇēḗḕęẽḛèêëēĕėęěɛǝəȅȇȩέήεɛɛèéêëēĕėęěȅȇȩε", 'e' },
{ U"ḟƒ", 'f' },
{ U"ǵğǧģĝġḡĝğġģɠǥǧǵɣĝğġģǧǵ", 'g' },
{ U"ḫȟḩĥḧḣḥẖĥħƕƕȟƕĥħȟ", 'h' },
{ U"íĭǐîïḯiịȉìỉȋīįĩḭìíîïĩīĭįiɨǐȉȋìíîïĩīĭįǐȉȋι", 'i' },
{ U"ǰĵĵǰĵǰ", 'j' },
{ U"ḱǩķḳḵķƙǩκƙķƙǩκ", 'k' },
{ U"ĺľļḽḷḹḻĺļľŀłƚƛλƚĺļľŀłƚλƚ", 'l' },
{ U"ḿṁṃɯ", 'm' },
{ U"ńňņṋṅṇǹṉññńņňŋɲǹνƞñńņňʼnŋƞǹη", 'n' },
{ U"óŏǒôốộồổỗöȫȯȱọőȍòỏơớợờởỡȏōṓṑǫǭõṍṏȭǿøòóôõöøōŏőɵơǒǫǭǿȍȏȫȭȯȱόοòóôõöøōŏőơǒǫǭǿȍȏȫȭȯȱοσ", 'o' },
{ U"ṕṗπφƥ", 'p' },
{ U"ŕřŗṙṛṝȑȓṟŕŗřʀȑȓρŕŗřȑȓρ", 'r' },
{ U"śṥšṧşŝșṡẛṣṩśŝşšʃșƨśŝşšșƨȿ", 's' },
{ U"ťţṱțẗṫṭṯţťŧƭʈțτƫţťŧƭțτ", 't' },
{ U"úŭǔûṷüǘǚǜǖṳụűȕùủưứựừửữȗūṻųůũṹṵùúûüũūŭůűųưǔǖǘǚǜȕȗưùúûüũūŭůűųưǔǖǘǚǜȕȗμ", 'u' },
{ U"ṿṽʋ", 'v' },
{ U"ẃŵẅẇẉẁẘŵŵω", 'w' },
{ U"ẍẋχξχξ", 'x' },
{ U"ýŷÿẏỵỳỷȳẙỹýŷÿƴȳυύϋƴýÿŷƴȳγψ", 'y' },
{ U"źžẑżẓẕźżžƶʒǯȥζƶźżžƶƹȥζ", 'z' },
};
// Code point -> replacement character lookups, filled by init_norm() from
// g_char_norm_up and g_char_norm_lower respectively and queried by normalize_diacritics().
std::map<int, int> g_upper_trans;
std::map<int, int> g_lower_trans;
// Words reported as stop words by is_stop_word(). All entries are plain lower-case ASCII.
// init_norm() copies this list into g_stop_words_set.
static const char* g_stop_words[] =
{
"a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as",
"at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "can",
"could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from",
"further", "had", "has", "have", "having", "he", "her", "here", "hers", "herself", "him", "himself",
"his", "how", "i", "if", "in", "into", "is", "it", "its", "itself", "just", "me", "more", "most",
"my", "myself", "no", "nor", "not", "now", "of", "off", "on", "once", "only", "or", "other", "our",
"ours", "ourselves", "out", "over", "own", "re", "same", "she", "should", "so", "some", "such",
"than", "that", "the", "their", "theirs", "them", "themselves", "then", "there", "these", "they",
"this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "were", "what",
"when", "where", "which", "while", "who", "whom", "why", "will", "with", "you", "your", "yours",
"yourself", "yourselves", "although", "also", "already", "another", "seemed", "seem", "seems"
};
// Number of entries in g_stop_words.
static const uint32_t NUM_STOP_WORDS = (uint32_t)std::size(g_stop_words);
// Set form of g_stop_words, populated by init_norm(). The normalization helpers assert that it is
// non-empty as a check that init_norm() has been called.
std::set<std::string> g_stop_words_set;
void init_norm()
{
g_stop_words_set.clear();
for (const auto& str : g_stop_words)
g_stop_words_set.insert(str);
for (uint32_t i = 0; i < std::size(g_char_norm_up); i++)
{
const char32_t* pFrom = g_char_norm_up[i].m_pFrom;
char to_char = g_char_norm_up[i].m_to;
while (*pFrom)
{
char32_t fc = *pFrom++;
auto f = g_upper_trans.find(fc);
if (f != g_upper_trans.end())
{
if (f->second != to_char)
{
uprintf("Upper char %u 0x%x is redundant\n", fc, fc);
exit(1);
}
}
g_upper_trans[fc] = to_char;
}
}
for (uint32_t i = 0; i < std::size(g_char_norm_lower); i++)
{
const char32_t* pFrom = g_char_norm_lower[i].m_pFrom;
char to_char = g_char_norm_lower[i].m_to;
while (*pFrom)
{
char32_t fc = *pFrom++;
auto f = g_upper_trans.find(fc);
if (f != g_upper_trans.end())
{
uprintf("Lower char %u 0x%x is in the upper table\n", fc, fc);
if (utolower((uint8_t)f->second) != to_char)
uprintf("Conversion mismatch %u 0x%x\n", fc, fc);
//exit(1);
}
f = g_lower_trans.find(fc);
if (f != g_lower_trans.end())
{
if (f->second != to_char)
{
uprintf("Lower char %u 0x%x is redundant\n", fc, fc);
exit(1);
}
}
g_lower_trans[fc] = to_char;
}
}
}
// Resulting characters are guaranteed to be <128 - useful for searching purposes.
// Unrecognized Unicode characters are deleted.
void normalize_diacritics(const char* pStr, std::string& res)
{
assert(g_stop_words_set.size());
res.resize(0);
while (*pStr)
{
int l = get_next_utf8_code_point_len((const uint8_t*)pStr);
const uint8_t c = *pStr;
utf8_int32_t cp;
char* pStr_next = utf8codepoint(pStr, &cp);
assert((pStr_next - pStr) == l);
if (cp < 128)
{
res.push_back((char)cp);
pStr = pStr_next;
continue;
}
int new_char = -1;
auto u_it = g_upper_trans.find(cp);
auto l_it = g_lower_trans.find(cp);
if (u_it != g_upper_trans.end())
new_char = u_it->second;
else if (l_it != g_lower_trans.end())
new_char = l_it->second;
else
{
// FIXME: this is lame, it parses the utf8 directly.
if ((l == 2) && (c == 0xc2))
{
// NO-BREAK SPACE
if ((uint8_t)pStr[1] == 0xa0)
new_char = ' ';
}
if ((l == 2) && (c == 0xCA))
{
// single left quote
if ((uint8_t)pStr[1] == 0xBB)
new_char = '\'';
}
if ((l == 3) && (c == 0xE2) && ((uint8_t)pStr[1] == 0x80))
{
// dash
if ((uint8_t)pStr[2] == 0x93)
new_char = '-';
// dash
else if ((uint8_t)pStr[2] == 0x94)
new_char = '-';
// left quote
else if ((uint8_t)pStr[2] == 0x9C)
new_char = '"';
// right quote
else if ((uint8_t)pStr[2] == 0x9D)
new_char = '"';
// ellipsis (three dots)
else if ((uint8_t)pStr[2] == 0xA)
new_char = '.';
// ellipsis (three dots)
else if ((uint8_t)pStr[2] == 0xA6)
new_char = '.';
// long dash
else if ((uint8_t)pStr[2] == 9)
new_char = '-';
// left single quote
else if ((uint8_t)pStr[2] == 0x98)
new_char = '\'';
// right single quote
else if ((uint8_t)pStr[2] == 0x99)
new_char = '\'';
// right double quote
else if ((uint8_t)pStr[2] == 0x9D)
new_char = '"';
}
}
// TODO: Do something smarter?
if (new_char != -1)
res.push_back((char)new_char);
pStr = pStr_next;
}
}
// Reduces a single word to the form used for matching: UTF-8 lower-cased, diacritics folded to
// ASCII, every character that isn't a letter or digit removed, and finally stemmed.
// Panics if str is longer than 4096 bytes. init_norm() must have been called first.
std::string normalize_word(const std::string& str)
{
    assert(g_stop_words_set.size());

    const uint32_t MAX_STRING_SIZE = 4096;
    if (str.size() > MAX_STRING_SIZE)
        panic("String too long");

    // Work on a mutable copy so the UTF-8 string can be lower-cased in place.
    char buf[MAX_STRING_SIZE + 1];
    strcpy_s(buf, sizeof(buf), str.c_str());
    utf8lwr(buf);

    // Fold diacritics and a few special characters; characters 1-127 are kept as-is.
    std::string folded;
    folded.reserve(strlen(buf));
    normalize_diacritics(buf, folded);

    // Keep only letters and digits - the input is assumed to be one word, so whitespace goes too.
    std::string filtered;
    filtered.reserve(folded.size());
    for (const char raw : folded)
    {
        uint8_t c = (uint8_t)raw;
        c = utolower(c);

        if (uislower(c) || uisdigit(c))
            filtered.push_back(c);
    }

    // Stem in place; the index returned by stem() is treated as the last character to keep.
    strcpy_s(buf, sizeof(buf), filtered.c_str());
    if (buf[0] != '\0')
    {
        const int32_t last_index = stem(buf, 0, (int)strlen(buf) - 1);
        buf[last_index + 1] = '\0';
    }

    return std::string(buf);
}
// Returns true if word is in the stop word set built by init_norm().
// The caller is expected to pass plain lower-case ASCII.
bool is_stop_word(const std::string &word)
{
    assert(g_stop_words_set.size());

    return g_stop_words_set.find(word) != g_stop_words_set.end();
}
// Returns a lower-cased copy of the UTF-8 string s, using utf8lwr() on a scratch buffer.
// The result ends at the first null byte in the buffer.
std::string ustrlwr(const std::string& s)
{
    // Scratch copy of the bytes plus an explicit terminator for the C-style API.
    std::vector<uint8_t> scratch(s.begin(), s.end());
    scratch.push_back('\0');

    char* pText = (char*)scratch.data();
    utf8lwr(pText);

    return std::string(pText);
}
// Returns a copy of str in which every occurrence of find has been replaced with repl.
// The scan steps through str one UTF-8 code point at a time, so a match can only start on a
// code point boundary; the comparison itself is bytewise.
// find must be non-empty (asserted). If find or str is empty, str is returned unchanged.
std::string string_replace(const std::string& str, const std::string& find, const std::string& repl)
{
    assert(find.size());
    if (find.empty() || str.empty())
        return str;

    const uint8_t* pStr = (const uint8_t*)str.c_str();
    const size_t str_size = str.size();
    const size_t find_size = find.size();

    std::string res;
    res.reserve(str_size);

    size_t str_ofs = 0;
    while (str_ofs < str_size)
    {
        const size_t str_remaining = str_size - str_ofs;

        // A negative length is unexpected. Any length below 1 is forced to 1 so the scan always
        // advances - the same guard get_utf8_code_point_offsets() applies via std::max<int>(1, ...).
        int str_char_size = get_next_utf8_code_point_len(pStr + str_ofs);
        assert(str_char_size >= 0);
        if (str_char_size < 1)
            str_char_size = 1;

        if ((str_remaining >= find_size) && (memcmp(pStr + str_ofs, find.data(), find_size) == 0))
        {
            res += repl;
            str_ofs += find_size;
        }
        else
        {
            // Never copy beyond the end of str, even if a truncated multi-byte sequence at the
            // tail claims more bytes than are actually left.
            const size_t copy_size = std::min<size_t>((size_t)str_char_size, str_remaining);

            res.append((const char*)(pStr + str_ofs), copy_size);
            str_ofs += copy_size;
        }
    }

    return res;
}
// Reports whether pFilename can be opened for binary reading. The file is closed again right away.
bool does_file_exist(const char* pFilename)
{
    if (FILE* pHandle = ufopen(pFilename, "rb"))
    {
        fclose(pHandle);
        return true;
    }

    return false;
}

18
utils.h
View File

@ -240,7 +240,7 @@ bool read_text_file(const char* pFilename, string_vec& lines, bool trim_lines, b
bool read_text_file(const char* pFilename, std::vector<uint8_t>& buf, bool *pUTF8_flag); bool read_text_file(const char* pFilename, std::vector<uint8_t>& buf, bool *pUTF8_flag);
bool write_text_file(const char* pFilename, string_vec& lines, bool utf8_bom = true); bool write_text_file(const char* pFilename, const string_vec& lines, bool utf8_bom = true);
bool serialize_to_json_file(const char* pFilename, const json& j, bool utf8_bom); bool serialize_to_json_file(const char* pFilename, const json& j, bool utf8_bom);
@ -251,6 +251,7 @@ bool invoke_curl(const std::string& args, string_vec& reply);
void convert_args_to_utf8(string_vec& args, int argc, wchar_t* argv[]); void convert_args_to_utf8(string_vec& args, int argc, wchar_t* argv[]);
bool invoke_openai(const std::string& prompt, std::string& reply); bool invoke_openai(const std::string& prompt, std::string& reply);
// Line-vector overload: writes prompt to i.txt, runs openai.exe (retrying on failure) and reads o.txt into reply.
bool invoke_openai(const string_vec& prompt, string_vec& reply);
std::string get_deg_to_dms(double deg); std::string get_deg_to_dms(double deg);
@ -269,5 +270,16 @@ double geo_distance(double lat1, double lon1, double lat2, double lon2, int unit
std::string remove_bom(std::string str); std::string remove_bom(std::string str);
int get_next_utf8_code_point_len(const uint8_t* pStr); int get_next_utf8_code_point_len(const uint8_t* pStr);
void get_string_words(const std::string& str, string_vec& words, uint_vec* pOffsets_vec); void get_string_words(const std::string& str, string_vec& words, uint_vec* pOffsets_vec, const char *pAdditional_whitespace = nullptr);
void get_utf8_code_point_offsets(const char* pStr, int_vec& offsets); void get_utf8_code_point_offsets(const char* pStr, int_vec& offsets);
// Builds the stop word set and the diacritic translation maps. normalize_diacritics(), normalize_word()
// and is_stop_word() assert that this has been called.
void init_norm();
// Converts the UTF-8 string pStr to characters <128 in res; unrecognized non-ASCII code points are dropped.
void normalize_diacritics(const char* pStr, std::string& res);
// Lower-cases a single word, folds diacritics, removes non-letter/non-digit characters and stems the result.
std::string normalize_word(const std::string& str);
// Returns true if word is in the stop word set. Assumes word is plain ASCII lowercase.
bool is_stop_word(const std::string& word);
// Returns a lower-cased copy of the UTF-8 string s (via utf8lwr).
std::string ustrlwr(const std::string& s);
// Returns str with every occurrence of find replaced by repl. find must not be empty.
std::string string_replace(const std::string& str, const std::string& find, const std::string& repl);
// Returns true if pFilename can be opened for reading.
bool does_file_exist(const char* pFilename);