Mirror of https://github.com/nomic-ai/gpt4all.git (synced 2024-10-01 01:06:10 -04:00)
localdocs: implement .docx support (#2986)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Parent: ea1ade8668
Commit: e190fd0204

.gitmodules (vendored, 3 lines changed)
@@ -11,3 +11,6 @@
 [submodule "gpt4all-chat/deps/fmt"]
     path = gpt4all-chat/deps/fmt
     url = https://github.com/fmtlib/fmt.git
+[submodule "gpt4all-chat/deps/DuckX"]
+    path = gpt4all-chat/deps/DuckX
+    url = https://github.com/nomic-ai/DuckX.git
gpt4all-chat/CHANGELOG.md

@@ -8,6 +8,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
 
 ### Added
 - Add bm25 hybrid search to localdocs ([#2969](https://github.com/nomic-ai/gpt4all/pull/2969))
+- LocalDocs support for .docx files ([#2986](https://github.com/nomic-ai/gpt4all/pull/2986))
 
 ### Changed
 - Rebase llama.cpp on latest upstream as of September 26th ([#2998](https://github.com/nomic-ai/gpt4all/pull/2998))
gpt4all-chat/CMakeLists.txt

@@ -86,14 +86,9 @@ get_filename_component(Qt6_ROOT_DIR "${Qt6_ROOT_DIR}/.." ABSOLUTE)
 message(STATUS "qmake binary: ${QMAKE_EXECUTABLE}")
 message(STATUS "Qt 6 root directory: ${Qt6_ROOT_DIR}")
 
-set (CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
 
-set(FMT_INSTALL OFF)
-set(BUILD_SHARED_LIBS_SAVED "${BUILD_SHARED_LIBS}")
-set(BUILD_SHARED_LIBS OFF)
-add_subdirectory(deps/fmt)
-set(BUILD_SHARED_LIBS "${BUILD_SHARED_LIBS_SAVED}")
-
+add_subdirectory(deps)
 add_subdirectory(../gpt4all-backend llmodel)
 
 set(CHAT_EXE_RESOURCES)

@@ -133,9 +128,6 @@ if (APPLE)
     list(APPEND CHAT_EXE_RESOURCES "${LOCAL_EMBEDDING_MODEL_PATH}")
 endif()
 
-set(QAPPLICATION_CLASS QGuiApplication)
-add_subdirectory(deps/SingleApplication)
-
 if (DEFINED GGML_METALLIB)
     set_source_files_properties("${GGML_METALLIB}" PROPERTIES GENERATED ON)
 endif()

@@ -335,7 +327,7 @@ target_include_directories(chat PRIVATE deps/usearch/include
 target_link_libraries(chat
     PRIVATE Qt6::Core Qt6::HttpServer Qt6::Pdf Qt6::Quick Qt6::Sql Qt6::Svg)
 target_link_libraries(chat
-    PRIVATE llmodel SingleApplication fmt::fmt)
+    PRIVATE llmodel SingleApplication fmt::fmt duckx::duckx)
 
 
 # -- install --
gpt4all-chat/deps/CMakeLists.txt (new file, 10 lines)

@@ -0,0 +1,10 @@
+set(BUILD_SHARED_LIBS OFF)
+
+set(FMT_INSTALL OFF)
+add_subdirectory(fmt)
+
+set(QAPPLICATION_CLASS QGuiApplication)
+add_subdirectory(SingleApplication)
+
+set(DUCKX_INSTALL OFF)
+add_subdirectory(DuckX)
gpt4all-chat/deps/DuckX (new submodule)

@@ -0,0 +1 @@
+Subproject commit 6e31dfb280e2107fbf4f6a15098c38b014f1bbcc
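For context on the new dependency: DuckX walks the document.xml inside a .docx as a sequence of paragraphs, each made of runs of text, and the WordDocumentReader added to database.cpp below iterates it exactly that way. A minimal standalone sketch of plain-text extraction, assuming DuckX's cursor-style API (has_next/next, as used in this commit) and a hypothetical input path:

    #include <duckx/duckx.hpp>

    #include <iostream>

    int main()
    {
        duckx::Document doc("sample.docx"); // hypothetical path, not part of this commit
        doc.open();
        if (!doc.is_open())
            return 1;

        // A .docx body is a list of paragraphs; each paragraph is a list of
        // runs, and each run carries a span of text.
        for (duckx::Paragraph &para = doc.paragraphs(); para.has_next(); para.next()) {
            for (duckx::Run &run = para.runs(); run.has_next(); run.next())
                std::cout << run.get_text();
            std::cout << '\n';
        }
        return 0;
    }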
Submodule pointer update:

@@ -1 +1 @@
-Subproject commit 22cfa3bd00ea542132ee826cdb220f9d6434bd43
+Subproject commit 1f0618a86f9dbb7386237241cee96cc425dd7b55
gpt4all-chat/qml/LocalDocsSettings.qml

@@ -70,7 +70,7 @@ MySettingsTab {
         /* Blacklist common unsupported file extensions. We only support plain text and PDFs, and although we
          * reject binary data, we don't want to waste time trying to index files that we don't support. */
         exts = exts.filter(e => ![
-            /* Microsoft documents */ "rtf", "docx", "ppt", "pptx", "xls", "xlsx",
+            /* Microsoft documents */ "rtf", "ppt", "pptx", "xls", "xlsx",
             /* OpenOffice */ "odt", "ods", "odp", "odg",
             /* photos */ "jpg", "jpeg", "png", "gif", "bmp", "tif", "tiff", "webp",
             /* audio */ "mp3", "wma", "m4a", "wav", "flac",
gpt4all-chat/src/database.cpp

@@ -1,13 +1,15 @@
 #include "database.h"
 
 #include "mysettings.h"
+#include "utils.h"
 
+#include <duckx/duckx.hpp>
+#include <fmt/format.h>
 #include <usearch/index_plugins.hpp>
 
 #include <QDebug>
 #include <QDir>
 #include <QDirIterator>
-#include <QElapsedTimer>
 #include <QFile>
 #include <QFileSystemWatcher>
 #include <QIODevice>
@@ -18,16 +20,16 @@
 #include <QSqlQuery>
 #include <QTextStream>
 #include <QTimer>
+#include <QMap>
+#include <QUtf8StringView>
 #include <QVariant>
 #include <Qt>
-#include <QtGlobal>
 #include <QtLogging>
 
 #include <algorithm>
 #include <cmath>
 #include <optional>
-#include <utility>
+#include <stdexcept>
-#include <vector>
 
 using namespace Qt::Literals::StringLiterals;
 namespace us = unum::usearch;
@@ -991,10 +993,11 @@ Database::Database(int chunkSize, QStringList extensions)
     : QObject(nullptr)
     , m_chunkSize(chunkSize)
     , m_scannedFileExtensions(std::move(extensions))
-    , m_scanTimer(new QTimer(this))
+    , m_scanIntervalTimer(new QTimer(this))
     , m_watcher(new QFileSystemWatcher(this))
     , m_embLLM(new EmbeddingLLM)
     , m_databaseValid(true)
+    , m_chunkStreamer(this)
 {
     m_db = QSqlDatabase::database(QSqlDatabase::defaultConnection, false);
     if (!m_db.isValid())
@@ -1080,87 +1083,345 @@ void Database::updateFolderToIndex(int folder_id, size_t countForFolder, bool se
     updateGuiForCollectionItem(item);
 }
 
-void Database::handleDocumentError(const QString &errorMessage,
-    int document_id, const QString &document_path, const QSqlError &error)
+static void handleDocumentError(const QString &errorMessage, int document_id, const QString &document_path,
+                                const QSqlError &error)
 {
     qWarning() << errorMessage << document_id << document_path << error;
 }
 
-size_t Database::chunkStream(QTextStream &stream, int folder_id, int document_id, const QString &embedding_model,
-    const QString &file, const QString &title, const QString &author, const QString &subject, const QString &keywords,
-    int page, int maxChunks)
+class DocumentReader {
+public:
+    static std::unique_ptr<DocumentReader> fromDocument(const DocumentInfo &info);
+
+    const DocumentInfo           &doc     () const { return *m_info; }
+    const std::optional<QString> &word    () const { return m_word; }
+    const std::optional<QString> &nextWord() { m_word = advance(); return m_word; }
+    virtual std::optional<ChunkStreamer::Status> getError() const { return std::nullopt; }
+    virtual int page() const { return -1; }
+
+    virtual ~DocumentReader() = default;
+
+protected:
+    explicit DocumentReader(const DocumentInfo &info)
+        : m_info(&info) {}
+
+    void postInit() { m_word = advance(); }
+
+    virtual std::optional<QString> advance() = 0;
+
+    const DocumentInfo *m_info;
+    std::optional<QString> m_word;
+};
+
+namespace {
+
+class PdfDocumentReader final : public DocumentReader {
+public:
+    explicit PdfDocumentReader(const DocumentInfo &info)
+        : DocumentReader(info)
+    {
+        QString path = info.file.canonicalFilePath();
+        if (m_doc.load(path) != QPdfDocument::Error::None)
+            throw std::runtime_error(fmt::format("Failed to load PDF: {}", path));
+        postInit();
+    }
+
+    int page() const override { return m_currentPage; }
+
+private:
+    std::optional<QString> advance() override
+    {
+        QString word;
+        do {
+            while (!m_stream || m_stream->atEnd()) {
+                if (m_currentPage >= m_doc.pageCount())
+                    return std::nullopt;
+                m_pageText = m_doc.getAllText(m_currentPage++).text();
+                m_stream.emplace(&m_pageText);
+            }
+            *m_stream >> word;
+        } while (word.isEmpty());
+        return word;
+    }
+
+    QPdfDocument m_doc;
+    int m_currentPage = 0;
+    QString m_pageText;
+    std::optional<QTextStream> m_stream;
+};
+
+class WordDocumentReader final : public DocumentReader {
+public:
+    explicit WordDocumentReader(const DocumentInfo &info)
+        : DocumentReader(info)
+        , m_doc(info.file.canonicalFilePath().toStdString())
+    {
+        m_doc.open();
+        if (!m_doc.is_open())
+            throw std::runtime_error(fmt::format("Failed to open DOCX: {}", info.file.canonicalFilePath()));
+
+        m_paragraph = &m_doc.paragraphs();
+        m_run = &m_paragraph->runs();
+        postInit();
+    }
+
+protected:
+    std::optional<QString> advance() override
+    {
+        // find non-space char
+        qsizetype wordStart = 0;
+        while (m_buffer.isEmpty() || m_buffer[wordStart].isSpace()) {
+            if (m_buffer.isEmpty() && !fillBuffer())
+                return std::nullopt;
+            if (m_buffer[wordStart].isSpace() && ++wordStart >= m_buffer.size()) {
+                m_buffer.clear();
+                wordStart = 0;
+            }
+        }
+
+        // find space char
+        qsizetype wordEnd = wordStart + 1;
+        while (wordEnd >= m_buffer.size() || !m_buffer[wordEnd].isSpace()) {
+            if (wordEnd >= m_buffer.size() && !fillBuffer())
+                return std::nullopt;
+            if (!m_buffer[wordEnd].isSpace())
+                ++wordEnd;
+        }
+
+        auto size = wordEnd - wordStart;
+        QString word = std::move(m_buffer);
+        m_buffer = word.sliced(wordStart + size);
+        if (wordStart == 0)
+            word.resize(size);
+        else
+            word = word.sliced(wordStart, size);
+
+        return word;
+    }
+
+    bool fillBuffer()
+    {
+        for (;;) {
+            // get a run
+            while (!m_run->has_next()) {
+                // try next paragraph
+                if (!m_paragraph->has_next())
+                    return false;
+                m_paragraph->next();
+                m_buffer += u'\n';
+            }
+            auto &run = m_run->get_node();
+            const char *text = run.child("w:t").text().get();
+            if (!*text && run.child("w:tab"))
+                text = "\t";
+            m_run->next();
+            if (*text) {
+                m_buffer += QUtf8StringView(text);
+                return true;
+            }
+        }
+    }
+
+    duckx::Document m_doc;
+    duckx::Paragraph *m_paragraph;
+    duckx::Run *m_run;
+    QString m_buffer;
+};
+
+class TxtDocumentReader final : public DocumentReader {
+public:
+    explicit TxtDocumentReader(const DocumentInfo &info)
+        : DocumentReader(info)
+        , m_file(info.file.canonicalFilePath())
+    {
+        if (!m_file.open(QIODevice::ReadOnly))
+            throw std::runtime_error(fmt::format("Failed to open text file: {}", m_file.fileName()));
+
+        m_stream.setDevice(&m_file);
+        postInit();
+    }
+
+protected:
+    std::optional<QString> advance() override
+    {
+        while (!m_stream.atEnd()) {
+            QString word;
+            m_stream >> word;
+            if (!word.isEmpty())
+                return word;
+        }
+        return std::nullopt;
+    }
+
+    std::optional<ChunkStreamer::Status> getError() const override
+    {
+        if (!m_file.error())
+            return std::nullopt;
+        return m_file.binarySeen() ? ChunkStreamer::Status::BINARY_SEEN : ChunkStreamer::Status::ERROR;
+    }
+
+    BinaryDetectingFile m_file;
+    QTextStream m_stream;
+};
+
+} // namespace
+
+std::unique_ptr<DocumentReader> DocumentReader::fromDocument(const DocumentInfo &doc)
+{
+    if (doc.isPdf())
+        return std::make_unique<PdfDocumentReader>(doc);
+    if (doc.isDocx())
+        return std::make_unique<WordDocumentReader>(doc);
+    return std::make_unique<TxtDocumentReader>(doc);
+}
+
+ChunkStreamer::ChunkStreamer(Database *database)
+    : m_database(database) {}
+
+ChunkStreamer::~ChunkStreamer() = default;
+
+void ChunkStreamer::setDocument(const DocumentInfo &doc, int documentId, const QString &embeddingModel,
+                                const QString &title, const QString &author, const QString &subject,
+                                const QString &keywords)
+{
+    auto docKey = doc.key();
+    if (!m_docKey || *m_docKey != docKey) {
+        m_docKey = docKey;
+        m_reader = DocumentReader::fromDocument(doc);
+        m_documentId = documentId;
+        m_embeddingModel = embeddingModel;
+        m_title = title;
+        m_author = author;
+        m_subject = subject;
+        m_keywords = keywords;
+        m_chunk.clear();
+        m_page = 0;
+
+        // make sure the document doesn't already have any chunks
+        QSqlQuery q(m_database->m_db);
+        if (!removeChunksByDocumentId(q, documentId))
+            handleDocumentError("ERROR: Cannot remove chunks of document", documentId, doc.file.canonicalPath(), q.lastError());
+    }
+}
+
+ChunkStreamer::Status ChunkStreamer::step()
 {
-    int charCount = 0;
     // TODO: implement line_from/line_to
     constexpr int line_from = -1;
     constexpr int line_to = -1;
-    QList<QString> words;
-    int chunks = 0;
-    int addedWords = 0;
+    const int folderId = m_reader->doc().folder;
+    const int maxChunkSize = m_database->m_chunkSize;
+    int nChunks = 0;
+    int nAddedWords = 0;
+    Status retval;
 
     for (;;) {
-        QString word;
-        stream >> word;
-        if (stream.status() && !stream.atEnd())
-            return -1;
-        charCount += word.length();
-        if (!word.isEmpty())
-            words.append(word);
-        if (stream.status() || charCount + words.size() - 1 >= m_chunkSize) {
-            if (!words.isEmpty()) {
-                const QString chunk = words.join(" ");
-                QSqlQuery q(m_db);
-                int chunk_id = 0;
+        if (auto error = m_reader->getError())
+            return *error;
+        if (m_database->scanQueueInterrupted()) {
+            retval = Status::INTERRUPTED;
+            break;
+        }
+
+        // get a word, if needed
+        std::optional<QString> word = QString(); // empty string to disable EOF logic
+        if (m_chunk.length() < maxChunkSize + 1) {
+            word = m_reader->word();
+            if (m_chunk.isEmpty())
+                m_page = m_reader->page(); // page number of first word
+
+            if (word) {
+                m_chunk += *word;
+                m_chunk += u' ';
+                m_reader->nextWord();
+                m_nChunkWords++;
+            }
+        }
+
+        if (!word || m_chunk.length() >= maxChunkSize + 1) { // +1 for leading space
+            if (!m_chunk.isEmpty()) {
+                int nThisChunkWords = 0;
+                auto chunk = m_chunk; // copy
+
+                // handle overlength chunks
+                if (m_chunk.length() > maxChunkSize + 1) {
+                    // find the final space
+                    qsizetype lastSpace = chunk.lastIndexOf(u' ', -2);
+
+                    if (lastSpace < 0) {
+                        // slice off the last word
+                        Q_ASSERT(m_nChunkWords >= 1);
+                        lastSpace = maxChunkSize;
+                        nThisChunkWords = m_nChunkWords - 1;
+                        m_nChunkWords = 1;
+                    } else {
+                        // slice the overlong word
+                        nThisChunkWords = m_nChunkWords;
+                        m_nChunkWords = 0;
+                    }
+                    // save the extra part
+                    m_chunk = chunk.sliced(lastSpace + 1);
+                    // slice
+                    chunk.truncate(lastSpace + 1);
+                    Q_ASSERT(chunk.length() <= maxChunkSize + 1);
+                } else {
+                    nThisChunkWords = m_nChunkWords;
+                    m_nChunkWords = 0;
+                }
+
+                QSqlQuery q(m_database->m_db);
+                int chunkId = 0;
                 if (!addChunk(q,
-                    document_id,
-                    chunk,
-                    file,
-                    title,
-                    author,
-                    subject,
-                    keywords,
-                    page,
+                    m_documentId,
+                    chunk.chopped(1), // strip trailing space
+                    m_reader->doc().file.canonicalFilePath(),
+                    m_title,
+                    m_author,
+                    m_subject,
+                    m_keywords,
+                    m_page,
                     line_from,
                     line_to,
-                    words.size(),
-                    &chunk_id
+                    nThisChunkWords,
+                    &chunkId
                 )) {
                     qWarning() << "ERROR: Could not insert chunk into db" << q.lastError();
                 }
 
-                addedWords += words.size();
+                nAddedWords += nThisChunkWords;
 
                 EmbeddingChunk toEmbed;
-                toEmbed.model = embedding_model;
-                toEmbed.folder_id = folder_id;
-                toEmbed.chunk_id = chunk_id;
+                toEmbed.model = m_embeddingModel;
+                toEmbed.folder_id = folderId;
+                toEmbed.chunk_id = chunkId;
                 toEmbed.chunk = chunk;
-                appendChunk(toEmbed);
-                ++chunks;
+                m_database->appendChunk(toEmbed);
+                ++nChunks;
 
-                words.clear();
-                charCount = 0;
+                m_chunk.clear();
             }
 
-            if (stream.status() || (maxChunks > 0 && chunks == maxChunks))
+            if (!word) {
+                retval = Status::DOC_COMPLETE;
                 break;
+            }
         }
     }
 
-    if (chunks) {
-        CollectionItem item = guiCollectionItem(folder_id);
+    if (nChunks) {
+        CollectionItem item = m_database->guiCollectionItem(folderId);
 
         // Set the start update if we haven't done so already
         if (item.startUpdate <= item.lastUpdate && item.currentEmbeddingsToIndex == 0)
-            setStartUpdateTime(item);
+            m_database->setStartUpdateTime(item);
 
-        item.currentEmbeddingsToIndex += chunks;
-        item.totalEmbeddingsToIndex += chunks;
-        item.totalWords += addedWords;
-        updateGuiForCollectionItem(item);
+        item.currentEmbeddingsToIndex += nChunks;
+        item.totalEmbeddingsToIndex += nChunks;
+        item.totalWords += nAddedWords;
+        m_database->updateGuiForCollectionItem(item);
     }
 
-    return stream.pos();
+    return retval;
 }
 
 void Database::appendChunk(const EmbeddingChunk &chunk)
@@ -1238,83 +1499,82 @@ void Database::handleErrorGenerated(const QVector<EmbeddingChunk> &chunks, const
 
 size_t Database::countOfDocuments(int folder_id) const
 {
-    if (!m_docsToScan.contains(folder_id))
-        return 0;
-    return m_docsToScan.value(folder_id).size();
+    if (auto it = m_docsToScan.find(folder_id); it != m_docsToScan.end())
+        return it->second.size();
+    return 0;
 }
 
 size_t Database::countOfBytes(int folder_id) const
 {
-    if (!m_docsToScan.contains(folder_id))
-        return 0;
-    size_t totalBytes = 0;
-    const QQueue<DocumentInfo> &docs = m_docsToScan.value(folder_id);
-    for (const DocumentInfo &f : docs)
-        totalBytes += f.doc.size();
-    return totalBytes;
+    if (auto it = m_docsToScan.find(folder_id); it != m_docsToScan.end()) {
+        size_t totalBytes = 0;
+        for (const DocumentInfo &f : it->second)
+            totalBytes += f.file.size();
+        return totalBytes;
+    }
+    return 0;
 }
 
 DocumentInfo Database::dequeueDocument()
 {
-    Q_ASSERT(!m_docsToScan.isEmpty());
-    const int firstKey = m_docsToScan.firstKey();
-    QQueue<DocumentInfo> &queue = m_docsToScan[firstKey];
-    Q_ASSERT(!queue.isEmpty());
-    DocumentInfo result = queue.dequeue();
-    if (queue.isEmpty())
-        m_docsToScan.remove(firstKey);
+    Q_ASSERT(!m_docsToScan.empty());
+    auto firstEntry = m_docsToScan.begin();
+    auto &[firstKey, queue] = *firstEntry;
+    Q_ASSERT(!queue.empty());
+    DocumentInfo result = std::move(queue.front());
+    queue.pop_front();
+    if (queue.empty())
+        m_docsToScan.erase(firstEntry);
     return result;
 }
 
 void Database::removeFolderFromDocumentQueue(int folder_id)
 {
-    if (!m_docsToScan.contains(folder_id))
-        return;
-    m_docsToScan.remove(folder_id);
+    if (auto it = m_docsToScan.find(folder_id); it != m_docsToScan.end())
+        m_docsToScan.erase(it);
 }
 
-void Database::enqueueDocumentInternal(const DocumentInfo &info, bool prepend)
+void Database::enqueueDocumentInternal(DocumentInfo &&info, bool prepend)
 {
-    const int key = info.folder;
-    if (!m_docsToScan.contains(key))
-        m_docsToScan[key] = QQueue<DocumentInfo>();
-    if (prepend)
-        m_docsToScan[key].prepend(info);
-    else
-        m_docsToScan[key].enqueue(info);
+    auto &queue = m_docsToScan[info.folder];
+    queue.insert(prepend ? queue.begin() : queue.end(), std::move(info));
 }
 
-void Database::enqueueDocuments(int folder_id, const QVector<DocumentInfo> &infos)
+void Database::enqueueDocuments(int folder_id, std::list<DocumentInfo> &&infos)
 {
-    for (int i = 0; i < infos.size(); ++i)
-        enqueueDocumentInternal(infos[i]);
-    const size_t count = countOfDocuments(folder_id);
+    // enqueue all documents
+    auto &queue = m_docsToScan[folder_id];
+    queue.splice(queue.end(), std::move(infos));
 
     CollectionItem item = guiCollectionItem(folder_id);
-    item.currentDocsToIndex = count;
-    item.totalDocsToIndex = count;
+    item.currentDocsToIndex = queue.size();
+    item.totalDocsToIndex = queue.size();
     const size_t bytes = countOfBytes(folder_id);
     item.currentBytesToIndex = bytes;
     item.totalBytesToIndex = bytes;
     updateGuiForCollectionItem(item);
-    m_scanTimer->start();
+    m_scanIntervalTimer->start();
+}
+
+bool Database::scanQueueInterrupted() const
+{
+    return m_scanDurationTimer.elapsed() >= 100;
 }
 
 void Database::scanQueueBatch()
 {
-    QElapsedTimer timer;
-    timer.start();
+    m_scanDurationTimer.start();
 
     transaction();
 
     // scan for up to 100ms or until we run out of documents
-    while (!m_docsToScan.isEmpty() && timer.elapsed() < 100)
+    while (!m_docsToScan.empty() && !scanQueueInterrupted())
         scanQueue();
 
     commit();
 
-    if (m_docsToScan.isEmpty())
-        m_scanTimer->stop();
+    if (m_docsToScan.empty())
+        m_scanIntervalTimer->stop();
 }
 
 void Database::scanQueue()
@@ -1324,15 +1584,15 @@ void Database::scanQueue()
     const int folder_id = info.folder;
 
     // Update info
-    info.doc.stat();
+    info.file.stat();
 
     // If the doc has since been deleted or no longer readable, then we schedule more work and return
     // leaving the cleanup for the cleanup handler
-    if (!info.doc.exists() || !info.doc.isReadable())
+    if (!info.file.exists() || !info.file.isReadable())
         return updateFolderToIndex(folder_id, countForFolder);
 
-    const qint64 document_time = info.doc.fileTime(QFile::FileModificationTime).toMSecsSinceEpoch();
-    const QString document_path = info.doc.canonicalFilePath();
+    const qint64 document_time = info.file.fileTime(QFile::FileModificationTime).toMSecsSinceEpoch();
+    const QString document_path = info.file.canonicalFilePath();
     const bool currentlyProcessing = info.currentlyProcessing;
 
     // Check and see if we already have this document
@@ -1393,104 +1653,57 @@ void Database::scanQueue()
     }
 
     Q_ASSERT(document_id != -1);
-    if (info.isPdf()) {
-        QPdfDocument doc;
-        if (QPdfDocument::Error::None != doc.load(info.doc.canonicalFilePath())) {
-            handleDocumentError("ERROR: Could not load pdf",
-                document_id, document_path, q.lastError());
-            return updateFolderToIndex(folder_id, countForFolder);
-        }
-        const size_t bytes = info.doc.size();
-        const size_t bytesPerPage = std::floor(bytes / doc.pageCount());
-        const int pageIndex = info.currentPage;
-#if defined(DEBUG)
-        qDebug() << "scanning page" << pageIndex << "of" << doc.pageCount() << document_path;
-#endif
-        const QPdfSelection selection = doc.getAllText(pageIndex);
-        QString text = selection.text();
-        QTextStream stream(&text);
-        chunkStream(stream, info.folder, document_id, embedding_model, info.doc.fileName(),
-            doc.metaData(QPdfDocument::MetaDataField::Title).toString(),
-            doc.metaData(QPdfDocument::MetaDataField::Author).toString(),
-            doc.metaData(QPdfDocument::MetaDataField::Subject).toString(),
-            doc.metaData(QPdfDocument::MetaDataField::Keywords).toString(),
-            pageIndex + 1
-        );
-        CollectionItem item = guiCollectionItem(info.folder);
-        item.currentBytesToIndex -= bytesPerPage;
-        updateGuiForCollectionItem(item);
-        if (info.currentPage < doc.pageCount()) {
-            info.currentPage += 1;
-            info.currentlyProcessing = true;
-            enqueueDocumentInternal(info, true /*prepend*/);
-            return updateFolderToIndex(folder_id, countForFolder + 1);
-        }
 
-        item.currentBytesToIndex -= bytes - (bytesPerPage * doc.pageCount());
-        updateGuiForCollectionItem(item);
-    } else {
-        BinaryDetectingFile file(document_path);
-        if (!file.open(QIODevice::ReadOnly)) {
-            handleDocumentError("ERROR: Cannot open file for scanning",
-                existing_id, document_path, q.lastError());
-            return updateFolderToIndex(folder_id, countForFolder);
-        }
-        Q_ASSERT(!file.isSequential()); // we need to seek
+    {
+        QString title, author, subject, keywords;
+        if (info.isPdf()) {
+            QPdfDocument doc;
+            if (doc.load(document_path) != QPdfDocument::Error::None) {
+                qWarning() << "ERROR: Could not load pdf" << document_id << document_path;
+                return updateFolderToIndex(folder_id, countForFolder);
+            }
+            title = doc.metaData(QPdfDocument::MetaDataField::Title).toString();
+            author = doc.metaData(QPdfDocument::MetaDataField::Author).toString();
+            subject = doc.metaData(QPdfDocument::MetaDataField::Subject).toString();
+            keywords = doc.metaData(QPdfDocument::MetaDataField::Keywords).toString();
+            // TODO(jared): metadata for Word documents?
+        }
 
-        const size_t bytes = info.doc.size();
-        QTextStream stream(&file);
-        const size_t byteIndex = info.currentPosition;
-        if (byteIndex) {
-            /* Read the Unicode BOM to detect the encoding. Without this, QTextStream will
-             * always interpret the text as UTF-8 when byteIndex is nonzero. */
-            stream.read(1);
-
-            if (!stream.seek(byteIndex)) {
-                handleDocumentError("ERROR: Cannot seek to pos for scanning",
-                    existing_id, document_path, q.lastError());
-                return updateFolderToIndex(folder_id, countForFolder);
-            }
-        }
-#if defined(DEBUG)
-        qDebug() << "scanning byteIndex" << byteIndex << "of" << bytes << document_path;
-#endif
-        int pos = chunkStream(stream, info.folder, document_id, embedding_model, info.doc.fileName(),
-            QString() /*title*/, QString() /*author*/, QString() /*subject*/, QString() /*keywords*/, -1 /*page*/,
-            100 /*maxChunks*/);
-        if (pos < 0) {
-            if (!file.binarySeen()) {
-                handleDocumentError(u"ERROR: Failed to read file (status %1)"_s.arg(stream.status()),
-                    existing_id, document_path, q.lastError());
-                return updateFolderToIndex(folder_id, countForFolder);
-            }
+        try {
+            m_chunkStreamer.setDocument(info, document_id, embedding_model, title, author, subject, keywords);
+        } catch (const std::runtime_error &e) {
+            qWarning() << "LocalDocs ERROR:" << e.what();
+            goto dequeue;
+        }
+    }
 
-            /* When we see a binary file, we treat it like an empty file so we know not to
-             * scan it again. All existing chunks are removed, and in-progress embeddings
-             * are ignored when they complete. */
-            qInfo() << "LocalDocs: Ignoring file with binary data:" << document_path;
-
-            // this will also ensure in-flight embeddings are ignored
-            if (!removeChunksByDocumentId(q, existing_id)) {
-                handleDocumentError("ERROR: Cannot remove chunks of document",
-                    existing_id, document_path, q.lastError());
-            }
-            updateCollectionStatistics();
-            return updateFolderToIndex(folder_id, countForFolder);
-        }
-        file.close();
-        const size_t bytesChunked = pos - byteIndex;
-        CollectionItem item = guiCollectionItem(info.folder);
-        item.currentBytesToIndex -= bytesChunked;
-        updateGuiForCollectionItem(item);
-        if (info.currentPosition < bytes) {
-            info.currentPosition = pos;
-            info.currentlyProcessing = true;
-            enqueueDocumentInternal(info, true /*prepend*/);
-            return updateFolderToIndex(folder_id, countForFolder + 1);
-        }
-    }
+    switch (m_chunkStreamer.step()) {
+    case ChunkStreamer::Status::INTERRUPTED:
+        info.currentlyProcessing = true;
+        enqueueDocumentInternal(std::move(info), /*prepend*/ true);
+        return updateFolderToIndex(folder_id, countForFolder + 1);
+    case ChunkStreamer::Status::BINARY_SEEN:
+        /* When we see a binary file, we treat it like an empty file so we know not to
+         * scan it again. All existing chunks are removed, and in-progress embeddings
+         * are ignored when they complete. */
+        qInfo() << "LocalDocs: Ignoring file with binary data:" << document_path;
+
+        // this will also ensure in-flight embeddings are ignored
+        if (!removeChunksByDocumentId(q, existing_id))
+            handleDocumentError("ERROR: Cannot remove chunks of document", existing_id, document_path, q.lastError());
+        updateCollectionStatistics();
+        break;
+    case ChunkStreamer::Status::ERROR:
+        qWarning() << "error reading" << document_path;
+        break;
+    case ChunkStreamer::Status::DOC_COMPLETE:
+        ;
+    }
+
+dequeue:
+    auto item = guiCollectionItem(folder_id);
+    item.currentBytesToIndex -= info.file.size();
+    updateGuiForCollectionItem(item);
     return updateFolderToIndex(folder_id, countForFolder);
 }
 
@@ -1502,7 +1715,7 @@ void Database::scanDocuments(int folder_id, const QString &folder_path)
 
     QDirIterator it(folder_path, QDir::Readable | QDir::Files | QDir::Dirs | QDir::NoDotAndDotDot,
                     QDirIterator::Subdirectories);
-    QVector<DocumentInfo> infos;
+    std::list<DocumentInfo> infos;
     while (it.hasNext()) {
         it.next();
         QFileInfo fileInfo = it.fileInfo();
@@ -1514,17 +1727,14 @@ void Database::scanDocuments(int folder_id, const QString &folder_path)
         if (!m_scannedFileExtensions.contains(fileInfo.suffix(), Qt::CaseInsensitive))
             continue;
 
-        DocumentInfo info;
-        info.folder = folder_id;
-        info.doc = fileInfo;
-        infos.append(info);
+        infos.push_back({ folder_id, fileInfo });
     }
 
-    if (!infos.isEmpty()) {
+    if (!infos.empty()) {
         CollectionItem item = guiCollectionItem(folder_id);
         item.indexing = true;
         updateGuiForCollectionItem(item);
-        enqueueDocuments(folder_id, infos);
+        enqueueDocuments(folder_id, std::move(infos));
     } else {
         updateFolderToIndex(folder_id, 0, false);
     }
@@ -1535,7 +1745,7 @@ void Database::start()
     connect(m_watcher, &QFileSystemWatcher::directoryChanged, this, &Database::directoryChanged);
     connect(m_embLLM, &EmbeddingLLM::embeddingsGenerated, this, &Database::handleEmbeddingsGenerated);
    connect(m_embLLM, &EmbeddingLLM::errorGenerated, this, &Database::handleErrorGenerated);
-    m_scanTimer->callOnTimeout(this, &Database::scanQueueBatch);
+    m_scanIntervalTimer->callOnTimeout(this, &Database::scanQueueBatch);
 
     const QString modelPath = MySettings::globalInstance()->modelPath();
     QList<CollectionItem> oldCollections;
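The chunk-splitting rule in the new ChunkStreamer::step() above is subtle: words are appended with a trailing space, a chunk is flushed once it exceeds the configured size, and the tail after the last interior space is carried into the next chunk (with a hard split only for a single overlong word). A standalone sketch of that rule, using plain std::string and a hypothetical driver rather than the QString-based code in the commit:

    #include <iostream>
    #include <string>
    #include <vector>

    // Split whitespace-separated words into chunks of at most maxChunkSize
    // characters, carrying the remainder after the last space into the next
    // chunk. This mirrors the rule in ChunkStreamer::step(), not the shipped
    // code itself.
    std::vector<std::string> chunkWords(const std::vector<std::string> &words, size_t maxChunkSize)
    {
        std::vector<std::string> chunks;
        std::string chunk; // keeps a trailing space for convenience
        for (const std::string &w : words) {
            chunk += w;
            chunk += ' ';
            if (chunk.size() >= maxChunkSize + 1) { // +1 for the trailing space
                std::string carry;
                if (chunk.size() > maxChunkSize + 1) {
                    // find the last interior space and carry the tail over
                    size_t lastSpace = chunk.rfind(' ', chunk.size() - 2);
                    if (lastSpace == std::string::npos)
                        lastSpace = maxChunkSize; // single overlong word: hard split
                    carry = chunk.substr(lastSpace + 1);
                    chunk.resize(lastSpace + 1);
                }
                chunk.pop_back(); // strip trailing space
                chunks.push_back(chunk);
                chunk = carry;
            }
        }
        if (!chunk.empty()) {
            chunk.pop_back();
            chunks.push_back(chunk);
        }
        return chunks;
    }

    int main()
    {
        for (const auto &c : chunkWords({"the", "quick", "brown", "fox"}, 10))
            std::cout << '[' << c << "]\n"; // prints [the quick] then [brown fox]
    }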
gpt4all-chat/src/database.h

@@ -3,14 +3,15 @@
 
 #include "embllm.h" // IWYU pragma: keep
 
+#include <QByteArray>
+#include <QChar>
 #include <QDateTime>
+#include <QElapsedTimer>
 #include <QFileInfo>
 #include <QHash>
 #include <QLatin1String>
 #include <QList>
-#include <QMap>
 #include <QObject>
-#include <QQueue>
 #include <QSet>
 #include <QSqlDatabase>
 #include <QString>
@@ -18,13 +19,23 @@
 #include <QThread>
 #include <QUrl>
 #include <QVector>
+#include <QtGlobal>
 
+#include <atomic>
 #include <cstddef>
+#include <list>
+#include <map>
+#include <memory>
+#include <optional>
+#include <utility>
+#include <vector>
 
 using namespace Qt::Literals::StringLiterals;
 
+class Database;
+class DocumentReader;
 class QFileSystemWatcher;
-class QSqlError;
+class QSqlQuery;
 class QTextStream;
 class QTimer;
@@ -39,14 +50,16 @@ static const int LOCALDOCS_VERSION = 3;
 
 struct DocumentInfo
 {
-    int folder;
-    QFileInfo doc;
-    int currentPage = 0;
-    size_t currentPosition = 0;
+    using key_type = std::pair<int, QString>;
+
+    int folder;
+    QFileInfo file;
     bool currentlyProcessing = false;
-    bool isPdf() const {
-        return doc.suffix().compare(u"pdf"_s, Qt::CaseInsensitive) == 0;
-    }
+
+    key_type key() const { return {folder, file.canonicalFilePath()}; } // for comparison
+
+    bool isPdf () const { return !file.suffix().compare("pdf"_L1, Qt::CaseInsensitive); }
+    bool isDocx() const { return !file.suffix().compare("docx"_L1, Qt::CaseInsensitive); }
 };
 
 struct ResultInfo {
@@ -141,6 +154,36 @@ struct CollectionItem {
 };
 Q_DECLARE_METATYPE(CollectionItem)
 
+class ChunkStreamer {
+public:
+    enum class Status { DOC_COMPLETE, INTERRUPTED, ERROR, BINARY_SEEN };
+
+    explicit ChunkStreamer(Database *database);
+    ~ChunkStreamer();
+
+    void setDocument(const DocumentInfo &doc, int documentId, const QString &embeddingModel, const QString &title,
+                     const QString &author, const QString &subject, const QString &keywords);
+
+    Status step();
+
+private:
+    Database *m_database;
+    std::optional<DocumentInfo::key_type> m_docKey;
+    std::unique_ptr<DocumentReader> m_reader; // may be invalid, always compare key first
+    int m_documentId;
+    QString m_embeddingModel;
+    QString m_title;
+    QString m_author;
+    QString m_subject;
+    QString m_keywords;
+    bool m_atStart;
+
+    // working state
+    QString m_chunk; // has a trailing space for convenience
+    int m_nChunkWords = 0;
+    int m_page = 0;
+};
+
 class Database : public QObject
 {
     Q_OBJECT
@@ -152,6 +195,7 @@ public:
 
 public Q_SLOTS:
     void start();
+    bool scanQueueInterrupted() const;
     void scanQueueBatch();
     void scanDocuments(int folder_id, const QString &folder_path);
     void forceIndexing(const QString &collection, const QString &embedding_model);
@@ -194,14 +238,12 @@ private:
     void appendChunk(const EmbeddingChunk &chunk);
     void sendChunkList();
     void updateFolderToIndex(int folder_id, size_t countForFolder, bool sendChunks = true);
-    void handleDocumentError(const QString &errorMessage,
-        int document_id, const QString &document_path, const QSqlError &error);
     size_t countOfDocuments(int folder_id) const;
     size_t countOfBytes(int folder_id) const;
     DocumentInfo dequeueDocument();
     void removeFolderFromDocumentQueue(int folder_id);
-    void enqueueDocumentInternal(const DocumentInfo &info, bool prepend = false);
-    void enqueueDocuments(int folder_id, const QVector<DocumentInfo> &infos);
+    void enqueueDocumentInternal(DocumentInfo &&info, bool prepend = false);
+    void enqueueDocuments(int folder_id, std::list<DocumentInfo> &&infos);
     void scanQueue();
     bool cleanDB();
     void addFolderToWatch(const QString &path);
@@ -240,8 +282,9 @@ private:
     QSqlDatabase m_db;
     int m_chunkSize;
     QStringList m_scannedFileExtensions;
-    QTimer *m_scanTimer;
-    QMap<int, QQueue<DocumentInfo>> m_docsToScan;
+    QTimer *m_scanIntervalTimer;
+    QElapsedTimer m_scanDurationTimer;
+    std::map<int, std::list<DocumentInfo>> m_docsToScan;
     QList<ResultInfo> m_retrieve;
     QThread m_dbThread;
     QFileSystemWatcher *m_watcher;
@@ -250,6 +293,9 @@ private:
     QVector<EmbeddingChunk> m_chunkList;
     QHash<int, CollectionItem> m_collectionMap; // used only for tracking indexing/embedding progress
     std::atomic<bool> m_databaseValid;
+    ChunkStreamer m_chunkStreamer;
+
+    friend class ChunkStreamer;
 };
 
 #endif // DATABASE_H
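A note on the m_docKey member above: setDocument() compares the (folder, canonical path) key and keeps the existing reader when it matches, which is what lets a scan that hits the 100 ms budget resume mid-document later. A sketch of the resulting call pattern, as a hypothetical helper that is not part of the commit (in the shipped code Database::scanQueue drives this across timer ticks, so the scan-duration timer restarts between re-submissions rather than looping tightly as below):

    #include "database.h"

    // Hypothetical driver showing the resume contract: re-setting the same
    // document is a no-op, so step() picks up where the interrupt left off.
    ChunkStreamer::Status drainDocument(ChunkStreamer &streamer, const DocumentInfo &info,
                                        int documentId, const QString &model)
    {
        for (;;) {
            streamer.setDocument(info, documentId, model, {}, {}, {}, {});
            auto status = streamer.step();
            if (status != ChunkStreamer::Status::INTERRUPTED)
                return status; // DOC_COMPLETE, BINARY_SEEN, or ERROR
            // INTERRUPTED: the scan budget was hit; re-submit and resume.
        }
    }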
gpt4all-chat/src/mysettings.cpp

@@ -55,7 +55,7 @@ static const QVariantMap basicDefaults {
     { "localdocs/chunkSize", 512 },
     { "localdocs/retrievalSize", 3 },
     { "localdocs/showReferences", true },
-    { "localdocs/fileExtensions", QStringList { "txt", "pdf", "md", "rst" } },
+    { "localdocs/fileExtensions", QStringList { "docx", "pdf", "txt", "md", "rst" } },
     { "localdocs/useRemoteEmbed", false },
     { "localdocs/nomicAPIKey", "" },
     { "localdocs/embedDevice", "Auto" },
gpt4all-chat/src/server.cpp

@@ -3,8 +3,8 @@
 #include "chat.h"
 #include "modellist.h"
 #include "mysettings.h"
+#include "utils.h"
 
-#include <fmt/base.h>
 #include <fmt/format.h>
 
 #include <QByteArray>

@@ -25,9 +25,9 @@
 #include <QVariant>
 #include <Qt>
 #include <QtCborCommon>
+#include <QtGlobal>
 #include <QtLogging>
 
-#include <algorithm>
 #include <cstdint>
 #include <iostream>
 #include <optional>

@@ -37,26 +37,12 @@
 #include <unordered_map>
 #include <utility>
 
-namespace ranges = std::ranges;
 using namespace std::string_literals;
 using namespace Qt::Literals::StringLiterals;
 
 //#define DEBUG
 
-
-#define MAKE_FORMATTER(type, conversion) \
-    template <> \
-    struct fmt::formatter<type, char>: fmt::formatter<std::string, char> { \
-        template <typename FmtContext> \
-        FmtContext::iterator format(const type &value, FmtContext &ctx) const \
-        { \
-            return formatter<std::string, char>::format(conversion, ctx); \
-        } \
-    }
-
-MAKE_FORMATTER(QString,  value.toStdString()          );
-MAKE_FORMATTER(QVariant, value.toString().toStdString());
-
 namespace {
 
 class InvalidRequestError: public std::invalid_argument {
gpt4all-chat/src/utils.h (new file, 25 lines)

@@ -0,0 +1,25 @@
+#pragma once
+
+#include <fmt/base.h>
+#include <fmt/format.h>
+
+#include <QString>
+#include <QVariant>
+
+#include <string>
+
+
+// fmtlib formatters for QString and QVariant
+
+#define MAKE_FORMATTER(type, conversion) \
+    template <> \
+    struct fmt::formatter<type, char>: fmt::formatter<std::string, char> { \
+        template <typename FmtContext> \
+        FmtContext::iterator format(const type &value, FmtContext &ctx) const \
+        { \
+            return formatter<std::string, char>::format(conversion, ctx); \
+        } \
+    }
+
+MAKE_FORMATTER(QString,  value.toStdString()          );
+MAKE_FORMATTER(QVariant, value.toString().toStdString());
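With utils.h in scope, QString and QVariant values can be passed straight to fmt, which is how the readers in database.cpp build their exception messages (for example, "Failed to open DOCX: {}"). A small usage sketch with hypothetical values:

    #include "utils.h"

    #include <fmt/format.h>

    #include <QString>
    #include <QVariant>

    #include <iostream>
    #include <string>

    int main()
    {
        QString path = QStringLiteral("/home/user/report.docx"); // hypothetical value
        std::string msg = fmt::format("Failed to open DOCX: {}", path);

        QVariant count = 42; // formatted via its string representation
        std::cout << msg << '\n'
                  << fmt::format("count = {}", count) << '\n';
        return 0;
    }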