localdocs: implement .docx support (#2986)

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
This commit is contained in:
Jared Van Bortel 2024-09-30 18:48:13 -04:00 committed by GitHub
parent ea1ade8668
commit e190fd0204
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
12 changed files with 516 additions and 242 deletions

3
.gitmodules vendored
View File

@ -11,3 +11,6 @@
[submodule "gpt4all-chat/deps/fmt"] [submodule "gpt4all-chat/deps/fmt"]
path = gpt4all-chat/deps/fmt path = gpt4all-chat/deps/fmt
url = https://github.com/fmtlib/fmt.git url = https://github.com/fmtlib/fmt.git
[submodule "gpt4all-chat/deps/DuckX"]
path = gpt4all-chat/deps/DuckX
url = https://github.com/nomic-ai/DuckX.git

View File

@ -8,6 +8,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
### Added ### Added
- Add bm25 hybrid search to localdocs ([#2969](https://github.com/nomic-ai/gpt4all/pull/2969)) - Add bm25 hybrid search to localdocs ([#2969](https://github.com/nomic-ai/gpt4all/pull/2969))
- LocalDocs support for .docx files ([#2986](https://github.com/nomic-ai/gpt4all/pull/2986))
### Changed ### Changed
- Rebase llama.cpp on latest upstream as of September 26th ([#2998](https://github.com/nomic-ai/gpt4all/pull/2998)) - Rebase llama.cpp on latest upstream as of September 26th ([#2998](https://github.com/nomic-ai/gpt4all/pull/2998))

View File

@ -86,14 +86,9 @@ get_filename_component(Qt6_ROOT_DIR "${Qt6_ROOT_DIR}/.." ABSOLUTE)
message(STATUS "qmake binary: ${QMAKE_EXECUTABLE}") message(STATUS "qmake binary: ${QMAKE_EXECUTABLE}")
message(STATUS "Qt 6 root directory: ${Qt6_ROOT_DIR}") message(STATUS "Qt 6 root directory: ${Qt6_ROOT_DIR}")
set (CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
set(FMT_INSTALL OFF)
set(BUILD_SHARED_LIBS_SAVED "${BUILD_SHARED_LIBS}")
set(BUILD_SHARED_LIBS OFF)
add_subdirectory(deps/fmt)
set(BUILD_SHARED_LIBS "${BUILD_SHARED_LIBS_SAVED}")
add_subdirectory(deps)
add_subdirectory(../gpt4all-backend llmodel) add_subdirectory(../gpt4all-backend llmodel)
set(CHAT_EXE_RESOURCES) set(CHAT_EXE_RESOURCES)
@ -133,9 +128,6 @@ if (APPLE)
list(APPEND CHAT_EXE_RESOURCES "${LOCAL_EMBEDDING_MODEL_PATH}") list(APPEND CHAT_EXE_RESOURCES "${LOCAL_EMBEDDING_MODEL_PATH}")
endif() endif()
set(QAPPLICATION_CLASS QGuiApplication)
add_subdirectory(deps/SingleApplication)
if (DEFINED GGML_METALLIB) if (DEFINED GGML_METALLIB)
set_source_files_properties("${GGML_METALLIB}" PROPERTIES GENERATED ON) set_source_files_properties("${GGML_METALLIB}" PROPERTIES GENERATED ON)
endif() endif()
@ -335,7 +327,7 @@ target_include_directories(chat PRIVATE deps/usearch/include
target_link_libraries(chat target_link_libraries(chat
PRIVATE Qt6::Core Qt6::HttpServer Qt6::Pdf Qt6::Quick Qt6::Sql Qt6::Svg) PRIVATE Qt6::Core Qt6::HttpServer Qt6::Pdf Qt6::Quick Qt6::Sql Qt6::Svg)
target_link_libraries(chat target_link_libraries(chat
PRIVATE llmodel SingleApplication fmt::fmt) PRIVATE llmodel SingleApplication fmt::fmt duckx::duckx)
# -- install -- # -- install --

View File

@ -0,0 +1,10 @@
# Third-party dependencies vendored as git submodules.
# Each option is set immediately before the add_subdirectory() call that consumes it.

# Applies to every add_subdirectory() below. This is a normal variable set in this
# directory's scope, so it is not visible to the parent directory that includes us.
set(BUILD_SHARED_LIBS OFF)
# presumably turns off fmt's install rules -- confirm against deps/fmt/CMakeLists.txt
set(FMT_INSTALL OFF)
add_subdirectory(fmt)
# presumably selects the Qt application base class SingleApplication builds on -- confirm
set(QAPPLICATION_CLASS QGuiApplication)
add_subdirectory(SingleApplication)
# presumably turns off DuckX's install rules -- confirm against deps/DuckX/CMakeLists.txt
set(DUCKX_INSTALL OFF)
add_subdirectory(DuckX)

@ -0,0 +1 @@
Subproject commit 6e31dfb280e2107fbf4f6a15098c38b014f1bbcc

@ -1 +1 @@
Subproject commit 22cfa3bd00ea542132ee826cdb220f9d6434bd43 Subproject commit 1f0618a86f9dbb7386237241cee96cc425dd7b55

View File

@ -70,7 +70,7 @@ MySettingsTab {
/* Blacklist common unsupported file extensions. We only support plain text and PDFs, and although we /* Blacklist common unsupported file extensions. We only support plain text and PDFs, and although we
* reject binary data, we don't want to waste time trying to index files that we don't support. */ * reject binary data, we don't want to waste time trying to index files that we don't support. */
exts = exts.filter(e => ![ exts = exts.filter(e => ![
/* Microsoft documents */ "rtf", "docx", "ppt", "pptx", "xls", "xlsx", /* Microsoft documents */ "rtf", "ppt", "pptx", "xls", "xlsx",
/* OpenOffice */ "odt", "ods", "odp", "odg", /* OpenOffice */ "odt", "ods", "odp", "odg",
/* photos */ "jpg", "jpeg", "png", "gif", "bmp", "tif", "tiff", "webp", /* photos */ "jpg", "jpeg", "png", "gif", "bmp", "tif", "tiff", "webp",
/* audio */ "mp3", "wma", "m4a", "wav", "flac", /* audio */ "mp3", "wma", "m4a", "wav", "flac",

View File

@ -1,13 +1,15 @@
#include "database.h" #include "database.h"
#include "mysettings.h" #include "mysettings.h"
#include "utils.h"
#include <duckx/duckx.hpp>
#include <fmt/format.h>
#include <usearch/index_plugins.hpp> #include <usearch/index_plugins.hpp>
#include <QDebug> #include <QDebug>
#include <QDir> #include <QDir>
#include <QDirIterator> #include <QDirIterator>
#include <QElapsedTimer>
#include <QFile> #include <QFile>
#include <QFileSystemWatcher> #include <QFileSystemWatcher>
#include <QIODevice> #include <QIODevice>
@ -18,16 +20,16 @@
#include <QSqlQuery> #include <QSqlQuery>
#include <QTextStream> #include <QTextStream>
#include <QTimer> #include <QTimer>
#include <QMap>
#include <QUtf8StringView>
#include <QVariant> #include <QVariant>
#include <Qt> #include <Qt>
#include <QtGlobal>
#include <QtLogging> #include <QtLogging>
#include <algorithm> #include <algorithm>
#include <cmath> #include <cmath>
#include <optional> #include <optional>
#include <utility> #include <stdexcept>
#include <vector>
using namespace Qt::Literals::StringLiterals; using namespace Qt::Literals::StringLiterals;
namespace us = unum::usearch; namespace us = unum::usearch;
@ -991,10 +993,11 @@ Database::Database(int chunkSize, QStringList extensions)
: QObject(nullptr) : QObject(nullptr)
, m_chunkSize(chunkSize) , m_chunkSize(chunkSize)
, m_scannedFileExtensions(std::move(extensions)) , m_scannedFileExtensions(std::move(extensions))
, m_scanTimer(new QTimer(this)) , m_scanIntervalTimer(new QTimer(this))
, m_watcher(new QFileSystemWatcher(this)) , m_watcher(new QFileSystemWatcher(this))
, m_embLLM(new EmbeddingLLM) , m_embLLM(new EmbeddingLLM)
, m_databaseValid(true) , m_databaseValid(true)
, m_chunkStreamer(this)
{ {
m_db = QSqlDatabase::database(QSqlDatabase::defaultConnection, false); m_db = QSqlDatabase::database(QSqlDatabase::defaultConnection, false);
if (!m_db.isValid()) if (!m_db.isValid())
@ -1080,87 +1083,345 @@ void Database::updateFolderToIndex(int folder_id, size_t countForFolder, bool se
updateGuiForCollectionItem(item); updateGuiForCollectionItem(item);
} }
void Database::handleDocumentError(const QString &errorMessage, static void handleDocumentError(const QString &errorMessage, int document_id, const QString &document_path,
int document_id, const QString &document_path, const QSqlError &error) const QSqlError &error)
{ {
qWarning() << errorMessage << document_id << document_path << error; qWarning() << errorMessage << document_id << document_path << error;
} }
size_t Database::chunkStream(QTextStream &stream, int folder_id, int document_id, const QString &embedding_model, class DocumentReader {
const QString &file, const QString &title, const QString &author, const QString &subject, const QString &keywords, public:
int page, int maxChunks) static std::unique_ptr<DocumentReader> fromDocument(const DocumentInfo &info);
const DocumentInfo &doc () const { return *m_info; }
const std::optional<QString> &word () const { return m_word; }
const std::optional<QString> &nextWord() { m_word = advance(); return m_word; }
virtual std::optional<ChunkStreamer::Status> getError() const { return std::nullopt; }
virtual int page() const { return -1; }
virtual ~DocumentReader() = default;
protected:
explicit DocumentReader(const DocumentInfo &info)
: m_info(&info) {}
void postInit() { m_word = advance(); }
virtual std::optional<QString> advance() = 0;
const DocumentInfo *m_info;
std::optional<QString> m_word;
};
namespace {
class PdfDocumentReader final : public DocumentReader {
public:
explicit PdfDocumentReader(const DocumentInfo &info)
: DocumentReader(info)
{
QString path = info.file.canonicalFilePath();
if (m_doc.load(path) != QPdfDocument::Error::None)
throw std::runtime_error(fmt::format("Failed to load PDF: {}", path));
postInit();
}
int page() const override { return m_currentPage; }
private:
std::optional<QString> advance() override
{
QString word;
do {
while (!m_stream || m_stream->atEnd()) {
if (m_currentPage >= m_doc.pageCount())
return std::nullopt;
m_pageText = m_doc.getAllText(m_currentPage++).text();
m_stream.emplace(&m_pageText);
}
*m_stream >> word;
} while (word.isEmpty());
return word;
}
QPdfDocument m_doc;
int m_currentPage = 0;
QString m_pageText;
std::optional<QTextStream> m_stream;
};
// DocumentReader that yields the whitespace-separated words of a .docx file via DuckX.
// Text is pulled run by run into m_buffer; a newline is inserted between paragraphs so that
// words in adjacent paragraphs are not glued together.
class WordDocumentReader final : public DocumentReader {
public:
    // Opens the document at the canonical path and primes the first word.
    // Throws std::runtime_error if DuckX cannot open the file.
    explicit WordDocumentReader(const DocumentInfo &info)
        : DocumentReader(info)
        , m_doc(info.file.canonicalFilePath().toStdString())
    {
        m_doc.open();
        if (!m_doc.is_open())
            throw std::runtime_error(fmt::format("Failed to open DOCX: {}", info.file.canonicalFilePath()));
        m_paragraph = &m_doc.paragraphs();
        m_run       = &m_paragraph->runs();
        postInit();
    }

protected:
    // Returns the next word, or std::nullopt when no non-space text remains.
    std::optional<QString> advance() override
    {
        // find non-space char
        qsizetype wordStart = 0;
        while (m_buffer.isEmpty() || m_buffer[wordStart].isSpace()) {
            if (m_buffer.isEmpty() && !fillBuffer())
                return std::nullopt;
            if (m_buffer[wordStart].isSpace() && ++wordStart >= m_buffer.size()) {
                m_buffer.clear();
                wordStart = 0;
            }
        }

        // find space char
        qsizetype wordEnd = wordStart + 1;
        while (wordEnd >= m_buffer.size() || !m_buffer[wordEnd].isSpace()) {
            // Running out of input here means the word that starts at wordStart is the last
            // one in the document. It must still be returned, so stop scanning instead of
            // reporting EOF (which would silently drop it). wordEnd <= m_buffer.size() holds.
            if (wordEnd >= m_buffer.size() && !fillBuffer())
                break;
            if (!m_buffer[wordEnd].isSpace())
                ++wordEnd;
        }

        // Split the buffer: [wordStart, wordEnd) is the word, the rest stays buffered.
        auto size = wordEnd - wordStart;
        QString word = std::move(m_buffer);
        m_buffer = word.sliced(wordStart + size);
        if (wordStart == 0)
            word.resize(size);
        else
            word = word.sliced(wordStart, size);
        return word;
    }

    // Appends the text of the next non-empty run to m_buffer.
    // Returns false when there are no more runs or paragraphs (m_buffer may still have gained
    // paragraph-separating newlines in that case).
    bool fillBuffer()
    {
        for (;;) {
            // get a run
            while (!m_run->has_next()) {
                // try next paragraph
                if (!m_paragraph->has_next())
                    return false;
                m_paragraph->next();
                m_buffer += u'\n';
            }
            auto &run = m_run->get_node();
            const char *text = run.child("w:t").text().get();
            // a run with no text but a <w:tab> child is treated as a tab character
            if (!*text && run.child("w:tab"))
                text = "\t";
            m_run->next();
            if (*text) {
                m_buffer += QUtf8StringView(text);
                return true;
            }
        }
    }

    duckx::Document   m_doc;
    duckx::Paragraph *m_paragraph; // points into m_doc
    duckx::Run       *m_run;       // points into *m_paragraph
    QString           m_buffer;    // text read from the document but not yet returned
};
class TxtDocumentReader final : public DocumentReader {
public:
explicit TxtDocumentReader(const DocumentInfo &info)
: DocumentReader(info)
, m_file(info.file.canonicalFilePath())
{
if (!m_file.open(QIODevice::ReadOnly))
throw std::runtime_error(fmt::format("Failed to open text file: {}", m_file.fileName()));
m_stream.setDevice(&m_file);
postInit();
}
protected:
std::optional<QString> advance() override
{
while (!m_stream.atEnd()) {
QString word;
m_stream >> word;
if (!word.isEmpty())
return word;
}
return std::nullopt;
}
std::optional<ChunkStreamer::Status> getError() const override
{
if (!m_file.error())
return std::nullopt;
return m_file.binarySeen() ? ChunkStreamer::Status::BINARY_SEEN : ChunkStreamer::Status::ERROR;
}
BinaryDetectingFile m_file;
QTextStream m_stream;
};
} // namespace
// Factory: picks the reader implementation from the document's type.
// PDFs and .docx files get dedicated readers; everything else is read as plain text.
std::unique_ptr<DocumentReader> DocumentReader::fromDocument(const DocumentInfo &doc)
{
    std::unique_ptr<DocumentReader> reader;
    if (doc.isPdf()) {
        reader = std::make_unique<PdfDocumentReader>(doc);
    } else if (doc.isDocx()) {
        reader = std::make_unique<WordDocumentReader>(doc);
    } else {
        reader = std::make_unique<TxtDocumentReader>(doc);
    }
    return reader;
}
// Stores the database pointer; no document is attached until setDocument() is called.
ChunkStreamer::ChunkStreamer(Database *database)
: m_database(database) {}
// Defined out of line, in the file where the DocumentReader class is defined.
ChunkStreamer::~ChunkStreamer() = default;
void ChunkStreamer::setDocument(const DocumentInfo &doc, int documentId, const QString &embeddingModel,
const QString &title, const QString &author, const QString &subject,
const QString &keywords)
{
auto docKey = doc.key();
if (!m_docKey || *m_docKey != docKey) {
m_docKey = docKey;
m_reader = DocumentReader::fromDocument(doc);
m_documentId = documentId;
m_embeddingModel = embeddingModel;
m_title = title;
m_author = author;
m_subject = subject;
m_keywords = keywords;
m_chunk.clear();
m_page = 0;
// make sure the document doesn't already have any chunks
QSqlQuery q(m_database->m_db);
if (!removeChunksByDocumentId(q, documentId))
handleDocumentError("ERROR: Cannot remove chunks of document", documentId, doc.file.canonicalPath(), q.lastError());
}
}
ChunkStreamer::Status ChunkStreamer::step()
{ {
int charCount = 0;
// TODO: implement line_from/line_to // TODO: implement line_from/line_to
constexpr int line_from = -1; constexpr int line_from = -1;
constexpr int line_to = -1; constexpr int line_to = -1;
QList<QString> words; const int folderId = m_reader->doc().folder;
int chunks = 0; const int maxChunkSize = m_database->m_chunkSize;
int addedWords = 0; int nChunks = 0;
int nAddedWords = 0;
Status retval;
for (;;) { for (;;) {
QString word; if (auto error = m_reader->getError())
stream >> word; return *error;
if (stream.status() && !stream.atEnd()) if (m_database->scanQueueInterrupted()) {
return -1; retval = Status::INTERRUPTED;
charCount += word.length(); break;
if (!word.isEmpty()) }
words.append(word);
if (stream.status() || charCount + words.size() - 1 >= m_chunkSize) { // get a word, if needed
if (!words.isEmpty()) { std::optional<QString> word = QString(); // empty string to disable EOF logic
const QString chunk = words.join(" "); if (m_chunk.length() < maxChunkSize + 1) {
QSqlQuery q(m_db); word = m_reader->word();
int chunk_id = 0; if (m_chunk.isEmpty())
m_page = m_reader->page(); // page number of first word
if (word) {
m_chunk += *word;
m_chunk += u' ';
m_reader->nextWord();
m_nChunkWords++;
}
}
if (!word || m_chunk.length() >= maxChunkSize + 1) { // +1 for leading space
if (!m_chunk.isEmpty()) {
int nThisChunkWords = 0;
auto chunk = m_chunk; // copy
// handle overlength chunks
if (m_chunk.length() > maxChunkSize + 1) {
// find the final space
qsizetype lastSpace = chunk.lastIndexOf(u' ', -2);
if (lastSpace < 0) {
// slice off the last word
Q_ASSERT(m_nChunkWords >= 1);
lastSpace = maxChunkSize;
nThisChunkWords = m_nChunkWords - 1;
m_nChunkWords = 1;
} else {
// slice the overlong word
nThisChunkWords = m_nChunkWords;
m_nChunkWords = 0;
}
// save the extra part
m_chunk = chunk.sliced(lastSpace + 1);
// slice
chunk.truncate(lastSpace + 1);
Q_ASSERT(chunk.length() <= maxChunkSize + 1);
} else {
nThisChunkWords = m_nChunkWords;
m_nChunkWords = 0;
}
QSqlQuery q(m_database->m_db);
int chunkId = 0;
if (!addChunk(q, if (!addChunk(q,
document_id, m_documentId,
chunk, chunk.chopped(1), // strip trailing space
file, m_reader->doc().file.canonicalFilePath(),
title, m_title,
author, m_author,
subject, m_subject,
keywords, m_keywords,
page, m_page,
line_from, line_from,
line_to, line_to,
words.size(), nThisChunkWords,
&chunk_id &chunkId
)) { )) {
qWarning() << "ERROR: Could not insert chunk into db" << q.lastError(); qWarning() << "ERROR: Could not insert chunk into db" << q.lastError();
} }
addedWords += words.size(); nAddedWords += nThisChunkWords;
EmbeddingChunk toEmbed; EmbeddingChunk toEmbed;
toEmbed.model = embedding_model; toEmbed.model = m_embeddingModel;
toEmbed.folder_id = folder_id; toEmbed.folder_id = folderId;
toEmbed.chunk_id = chunk_id; toEmbed.chunk_id = chunkId;
toEmbed.chunk = chunk; toEmbed.chunk = chunk;
appendChunk(toEmbed); m_database->appendChunk(toEmbed);
++chunks; ++nChunks;
words.clear(); m_chunk.clear();
charCount = 0;
} }
if (stream.status() || (maxChunks > 0 && chunks == maxChunks)) if (!word) {
retval = Status::DOC_COMPLETE;
break; break;
}
} }
} }
if (chunks) { if (nChunks) {
CollectionItem item = guiCollectionItem(folder_id); CollectionItem item = m_database->guiCollectionItem(folderId);
// Set the start update if we haven't done so already // Set the start update if we haven't done so already
if (item.startUpdate <= item.lastUpdate && item.currentEmbeddingsToIndex == 0) if (item.startUpdate <= item.lastUpdate && item.currentEmbeddingsToIndex == 0)
setStartUpdateTime(item); m_database->setStartUpdateTime(item);
item.currentEmbeddingsToIndex += chunks; item.currentEmbeddingsToIndex += nChunks;
item.totalEmbeddingsToIndex += chunks; item.totalEmbeddingsToIndex += nChunks;
item.totalWords += addedWords; item.totalWords += nAddedWords;
updateGuiForCollectionItem(item); m_database->updateGuiForCollectionItem(item);
} }
return stream.pos(); return retval;
} }
void Database::appendChunk(const EmbeddingChunk &chunk) void Database::appendChunk(const EmbeddingChunk &chunk)
@ -1238,83 +1499,82 @@ void Database::handleErrorGenerated(const QVector<EmbeddingChunk> &chunks, const
size_t Database::countOfDocuments(int folder_id) const size_t Database::countOfDocuments(int folder_id) const
{ {
if (!m_docsToScan.contains(folder_id)) if (auto it = m_docsToScan.find(folder_id); it != m_docsToScan.end())
return 0; return it->second.size();
return m_docsToScan.value(folder_id).size(); return 0;
} }
size_t Database::countOfBytes(int folder_id) const size_t Database::countOfBytes(int folder_id) const
{ {
if (!m_docsToScan.contains(folder_id)) if (auto it = m_docsToScan.find(folder_id); it != m_docsToScan.end()) {
return 0; size_t totalBytes = 0;
size_t totalBytes = 0; for (const DocumentInfo &f : it->second)
const QQueue<DocumentInfo> &docs = m_docsToScan.value(folder_id); totalBytes += f.file.size();
for (const DocumentInfo &f : docs) return totalBytes;
totalBytes += f.doc.size(); }
return totalBytes; return 0;
} }
DocumentInfo Database::dequeueDocument() DocumentInfo Database::dequeueDocument()
{ {
Q_ASSERT(!m_docsToScan.isEmpty()); Q_ASSERT(!m_docsToScan.empty());
const int firstKey = m_docsToScan.firstKey(); auto firstEntry = m_docsToScan.begin();
QQueue<DocumentInfo> &queue = m_docsToScan[firstKey]; auto &[firstKey, queue] = *firstEntry;
Q_ASSERT(!queue.isEmpty()); Q_ASSERT(!queue.empty());
DocumentInfo result = queue.dequeue(); DocumentInfo result = std::move(queue.front());
if (queue.isEmpty()) queue.pop_front();
m_docsToScan.remove(firstKey); if (queue.empty())
m_docsToScan.erase(firstEntry);
return result; return result;
} }
void Database::removeFolderFromDocumentQueue(int folder_id) void Database::removeFolderFromDocumentQueue(int folder_id)
{ {
if (!m_docsToScan.contains(folder_id)) if (auto it = m_docsToScan.find(folder_id); it != m_docsToScan.end())
return; m_docsToScan.erase(it);
m_docsToScan.remove(folder_id);
} }
void Database::enqueueDocumentInternal(const DocumentInfo &info, bool prepend) void Database::enqueueDocumentInternal(DocumentInfo &&info, bool prepend)
{ {
const int key = info.folder; auto &queue = m_docsToScan[info.folder];
if (!m_docsToScan.contains(key)) queue.insert(prepend ? queue.begin() : queue.end(), std::move(info));
m_docsToScan[key] = QQueue<DocumentInfo>();
if (prepend)
m_docsToScan[key].prepend(info);
else
m_docsToScan[key].enqueue(info);
} }
void Database::enqueueDocuments(int folder_id, const QVector<DocumentInfo> &infos) void Database::enqueueDocuments(int folder_id, std::list<DocumentInfo> &&infos)
{ {
for (int i = 0; i < infos.size(); ++i) // enqueue all documents
enqueueDocumentInternal(infos[i]); auto &queue = m_docsToScan[folder_id];
const size_t count = countOfDocuments(folder_id); queue.splice(queue.end(), std::move(infos));
CollectionItem item = guiCollectionItem(folder_id); CollectionItem item = guiCollectionItem(folder_id);
item.currentDocsToIndex = count; item.currentDocsToIndex = queue.size();
item.totalDocsToIndex = count; item.totalDocsToIndex = queue.size();
const size_t bytes = countOfBytes(folder_id); const size_t bytes = countOfBytes(folder_id);
item.currentBytesToIndex = bytes; item.currentBytesToIndex = bytes;
item.totalBytesToIndex = bytes; item.totalBytesToIndex = bytes;
updateGuiForCollectionItem(item); updateGuiForCollectionItem(item);
m_scanTimer->start(); m_scanIntervalTimer->start();
}
bool Database::scanQueueInterrupted() const
{
return m_scanDurationTimer.elapsed() >= 100;
} }
void Database::scanQueueBatch() void Database::scanQueueBatch()
{ {
QElapsedTimer timer; m_scanDurationTimer.start();
timer.start();
transaction(); transaction();
// scan for up to 100ms or until we run out of documents // scan for up to 100ms or until we run out of documents
while (!m_docsToScan.isEmpty() && timer.elapsed() < 100) while (!m_docsToScan.empty() && !scanQueueInterrupted())
scanQueue(); scanQueue();
commit(); commit();
if (m_docsToScan.isEmpty()) if (m_docsToScan.empty())
m_scanTimer->stop(); m_scanIntervalTimer->stop();
} }
void Database::scanQueue() void Database::scanQueue()
@ -1324,15 +1584,15 @@ void Database::scanQueue()
const int folder_id = info.folder; const int folder_id = info.folder;
// Update info // Update info
info.doc.stat(); info.file.stat();
// If the doc has since been deleted or no longer readable, then we schedule more work and return // If the doc has since been deleted or no longer readable, then we schedule more work and return
// leaving the cleanup for the cleanup handler // leaving the cleanup for the cleanup handler
if (!info.doc.exists() || !info.doc.isReadable()) if (!info.file.exists() || !info.file.isReadable())
return updateFolderToIndex(folder_id, countForFolder); return updateFolderToIndex(folder_id, countForFolder);
const qint64 document_time = info.doc.fileTime(QFile::FileModificationTime).toMSecsSinceEpoch(); const qint64 document_time = info.file.fileTime(QFile::FileModificationTime).toMSecsSinceEpoch();
const QString document_path = info.doc.canonicalFilePath(); const QString document_path = info.file.canonicalFilePath();
const bool currentlyProcessing = info.currentlyProcessing; const bool currentlyProcessing = info.currentlyProcessing;
// Check and see if we already have this document // Check and see if we already have this document
@ -1393,104 +1653,57 @@ void Database::scanQueue()
} }
Q_ASSERT(document_id != -1); Q_ASSERT(document_id != -1);
if (info.isPdf()) {
QPdfDocument doc;
if (QPdfDocument::Error::None != doc.load(info.doc.canonicalFilePath())) {
handleDocumentError("ERROR: Could not load pdf",
document_id, document_path, q.lastError());
return updateFolderToIndex(folder_id, countForFolder);
}
const size_t bytes = info.doc.size();
const size_t bytesPerPage = std::floor(bytes / doc.pageCount());
const int pageIndex = info.currentPage;
#if defined(DEBUG)
qDebug() << "scanning page" << pageIndex << "of" << doc.pageCount() << document_path;
#endif
const QPdfSelection selection = doc.getAllText(pageIndex);
QString text = selection.text();
QTextStream stream(&text);
chunkStream(stream, info.folder, document_id, embedding_model, info.doc.fileName(),
doc.metaData(QPdfDocument::MetaDataField::Title).toString(),
doc.metaData(QPdfDocument::MetaDataField::Author).toString(),
doc.metaData(QPdfDocument::MetaDataField::Subject).toString(),
doc.metaData(QPdfDocument::MetaDataField::Keywords).toString(),
pageIndex + 1
);
CollectionItem item = guiCollectionItem(info.folder);
item.currentBytesToIndex -= bytesPerPage;
updateGuiForCollectionItem(item);
if (info.currentPage < doc.pageCount()) {
info.currentPage += 1;
info.currentlyProcessing = true;
enqueueDocumentInternal(info, true /*prepend*/);
return updateFolderToIndex(folder_id, countForFolder + 1);
}
item.currentBytesToIndex -= bytes - (bytesPerPage * doc.pageCount()); {
updateGuiForCollectionItem(item); QString title, author, subject, keywords;
} else { if (info.isPdf()) {
BinaryDetectingFile file(document_path); QPdfDocument doc;
if (!file.open(QIODevice::ReadOnly)) { if (doc.load(document_path) != QPdfDocument::Error::None) {
handleDocumentError("ERROR: Cannot open file for scanning", qWarning() << "ERROR: Could not load pdf" << document_id << document_path;;
existing_id, document_path, q.lastError());
return updateFolderToIndex(folder_id, countForFolder);
}
Q_ASSERT(!file.isSequential()); // we need to seek
const size_t bytes = info.doc.size();
QTextStream stream(&file);
const size_t byteIndex = info.currentPosition;
if (byteIndex) {
/* Read the Unicode BOM to detect the encoding. Without this, QTextStream will
* always interpret the text as UTF-8 when byteIndex is nonzero. */
stream.read(1);
if (!stream.seek(byteIndex)) {
handleDocumentError("ERROR: Cannot seek to pos for scanning",
existing_id, document_path, q.lastError());
return updateFolderToIndex(folder_id, countForFolder); return updateFolderToIndex(folder_id, countForFolder);
} }
title = doc.metaData(QPdfDocument::MetaDataField::Title).toString();
author = doc.metaData(QPdfDocument::MetaDataField::Author).toString();
subject = doc.metaData(QPdfDocument::MetaDataField::Subject).toString();
keywords = doc.metaData(QPdfDocument::MetaDataField::Keywords).toString();
// TODO(jared): metadata for Word documents?
} }
#if defined(DEBUG)
qDebug() << "scanning byteIndex" << byteIndex << "of" << bytes << document_path;
#endif
int pos = chunkStream(stream, info.folder, document_id, embedding_model, info.doc.fileName(),
QString() /*title*/, QString() /*author*/, QString() /*subject*/, QString() /*keywords*/, -1 /*page*/,
100 /*maxChunks*/);
if (pos < 0) {
if (!file.binarySeen()) {
handleDocumentError(u"ERROR: Failed to read file (status %1)"_s.arg(stream.status()),
existing_id, document_path, q.lastError());
return updateFolderToIndex(folder_id, countForFolder);
}
/* When we see a binary file, we treat it like an empty file so we know not to try {
* scan it again. All existing chunks are removed, and in-progress embeddings m_chunkStreamer.setDocument(info, document_id, embedding_model, title, author, subject, keywords);
* are ignored when they complete. */ } catch (const std::runtime_error &e) {
qWarning() << "LocalDocs ERROR:" << e.what();
qInfo() << "LocalDocs: Ignoring file with binary data:" << document_path; goto dequeue;
// this will also ensure in-flight embeddings are ignored
if (!removeChunksByDocumentId(q, existing_id)) {
handleDocumentError("ERROR: Cannot remove chunks of document",
existing_id, document_path, q.lastError());
}
updateCollectionStatistics();
return updateFolderToIndex(folder_id, countForFolder);
}
file.close();
const size_t bytesChunked = pos - byteIndex;
CollectionItem item = guiCollectionItem(info.folder);
item.currentBytesToIndex -= bytesChunked;
updateGuiForCollectionItem(item);
if (info.currentPosition < bytes) {
info.currentPosition = pos;
info.currentlyProcessing = true;
enqueueDocumentInternal(info, true /*prepend*/);
return updateFolderToIndex(folder_id, countForFolder + 1);
} }
} }
switch (m_chunkStreamer.step()) {
case ChunkStreamer::Status::INTERRUPTED:
info.currentlyProcessing = true;
enqueueDocumentInternal(std::move(info), /*prepend*/ true);
return updateFolderToIndex(folder_id, countForFolder + 1);
case ChunkStreamer::Status::BINARY_SEEN:
/* When we see a binary file, we treat it like an empty file so we know not to
* scan it again. All existing chunks are removed, and in-progress embeddings
* are ignored when they complete. */
qInfo() << "LocalDocs: Ignoring file with binary data:" << document_path;
// this will also ensure in-flight embeddings are ignored
if (!removeChunksByDocumentId(q, existing_id))
handleDocumentError("ERROR: Cannot remove chunks of document", existing_id, document_path, q.lastError());
updateCollectionStatistics();
break;
case ChunkStreamer::Status::ERROR:
qWarning() << "error reading" << document_path;
break;
case ChunkStreamer::Status::DOC_COMPLETE:
;
}
dequeue:
auto item = guiCollectionItem(folder_id);
item.currentBytesToIndex -= info.file.size();
updateGuiForCollectionItem(item);
return updateFolderToIndex(folder_id, countForFolder); return updateFolderToIndex(folder_id, countForFolder);
} }
@ -1502,7 +1715,7 @@ void Database::scanDocuments(int folder_id, const QString &folder_path)
QDirIterator it(folder_path, QDir::Readable | QDir::Files | QDir::Dirs | QDir::NoDotAndDotDot, QDirIterator it(folder_path, QDir::Readable | QDir::Files | QDir::Dirs | QDir::NoDotAndDotDot,
QDirIterator::Subdirectories); QDirIterator::Subdirectories);
QVector<DocumentInfo> infos; std::list<DocumentInfo> infos;
while (it.hasNext()) { while (it.hasNext()) {
it.next(); it.next();
QFileInfo fileInfo = it.fileInfo(); QFileInfo fileInfo = it.fileInfo();
@ -1514,17 +1727,14 @@ void Database::scanDocuments(int folder_id, const QString &folder_path)
if (!m_scannedFileExtensions.contains(fileInfo.suffix(), Qt::CaseInsensitive)) if (!m_scannedFileExtensions.contains(fileInfo.suffix(), Qt::CaseInsensitive))
continue; continue;
DocumentInfo info; infos.push_back({ folder_id, fileInfo });
info.folder = folder_id;
info.doc = fileInfo;
infos.append(info);
} }
if (!infos.isEmpty()) { if (!infos.empty()) {
CollectionItem item = guiCollectionItem(folder_id); CollectionItem item = guiCollectionItem(folder_id);
item.indexing = true; item.indexing = true;
updateGuiForCollectionItem(item); updateGuiForCollectionItem(item);
enqueueDocuments(folder_id, infos); enqueueDocuments(folder_id, std::move(infos));
} else { } else {
updateFolderToIndex(folder_id, 0, false); updateFolderToIndex(folder_id, 0, false);
} }
@ -1535,7 +1745,7 @@ void Database::start()
connect(m_watcher, &QFileSystemWatcher::directoryChanged, this, &Database::directoryChanged); connect(m_watcher, &QFileSystemWatcher::directoryChanged, this, &Database::directoryChanged);
connect(m_embLLM, &EmbeddingLLM::embeddingsGenerated, this, &Database::handleEmbeddingsGenerated); connect(m_embLLM, &EmbeddingLLM::embeddingsGenerated, this, &Database::handleEmbeddingsGenerated);
connect(m_embLLM, &EmbeddingLLM::errorGenerated, this, &Database::handleErrorGenerated); connect(m_embLLM, &EmbeddingLLM::errorGenerated, this, &Database::handleErrorGenerated);
m_scanTimer->callOnTimeout(this, &Database::scanQueueBatch); m_scanIntervalTimer->callOnTimeout(this, &Database::scanQueueBatch);
const QString modelPath = MySettings::globalInstance()->modelPath(); const QString modelPath = MySettings::globalInstance()->modelPath();
QList<CollectionItem> oldCollections; QList<CollectionItem> oldCollections;

View File

@ -3,14 +3,15 @@
#include "embllm.h" // IWYU pragma: keep #include "embllm.h" // IWYU pragma: keep
#include <QByteArray>
#include <QChar>
#include <QDateTime> #include <QDateTime>
#include <QElapsedTimer>
#include <QFileInfo> #include <QFileInfo>
#include <QHash> #include <QHash>
#include <QLatin1String> #include <QLatin1String>
#include <QList> #include <QList>
#include <QMap>
#include <QObject> #include <QObject>
#include <QQueue>
#include <QSet> #include <QSet>
#include <QSqlDatabase> #include <QSqlDatabase>
#include <QString> #include <QString>
@ -18,13 +19,23 @@
#include <QThread> #include <QThread>
#include <QUrl> #include <QUrl>
#include <QVector> #include <QVector>
#include <QtGlobal>
#include <atomic>
#include <cstddef> #include <cstddef>
#include <list>
#include <map>
#include <memory>
#include <optional>
#include <utility>
#include <vector>
using namespace Qt::Literals::StringLiterals; using namespace Qt::Literals::StringLiterals;
class Database;
class DocumentReader;
class QFileSystemWatcher; class QFileSystemWatcher;
class QSqlError; class QSqlQuery;
class QTextStream; class QTextStream;
class QTimer; class QTimer;
@ -39,14 +50,16 @@ static const int LOCALDOCS_VERSION = 3;
struct DocumentInfo struct DocumentInfo
{ {
int folder; using key_type = std::pair<int, QString>;
QFileInfo doc;
int currentPage = 0; int folder;
size_t currentPosition = 0; QFileInfo file;
bool currentlyProcessing = false; bool currentlyProcessing = false;
bool isPdf() const {
return doc.suffix().compare(u"pdf"_s, Qt::CaseInsensitive) == 0; key_type key() const { return {folder, file.canonicalFilePath()}; } // for comparison
}
bool isPdf () const { return !file.suffix().compare("pdf"_L1, Qt::CaseInsensitive); }
bool isDocx() const { return !file.suffix().compare("docx"_L1, Qt::CaseInsensitive); }
}; };
struct ResultInfo { struct ResultInfo {
@ -141,6 +154,36 @@ struct CollectionItem {
}; };
Q_DECLARE_METATYPE(CollectionItem) Q_DECLARE_METATYPE(CollectionItem)
// Incrementally splits one document at a time into chunks.
// setDocument() selects the document; step() does the chunking and reports a Status.
class ChunkStreamer {
public:
    // Outcome of one step() call.
    enum class Status { DOC_COMPLETE, INTERRUPTED, ERROR, BINARY_SEEN };

    explicit ChunkStreamer(Database *database);
    ~ChunkStreamer();

    // Attaches the streamer to `doc` along with the metadata stored with each of its chunks.
    void setDocument(const DocumentInfo &doc, int documentId, const QString &embeddingModel, const QString &title,
                     const QString &author, const QString &subject, const QString &keywords);

    Status step();

private:
    Database                              *m_database;
    std::optional<DocumentInfo::key_type>  m_docKey;
    std::unique_ptr<DocumentReader>        m_reader; // may be invalid, always compare key first
    int                                    m_documentId = -1; // set by setDocument()
    QString                                m_embeddingModel;
    QString                                m_title;
    QString                                m_author;
    QString                                m_subject;
    QString                                m_keywords;
    bool                                   m_atStart = true; // initialized so it never holds an indeterminate value

    // working state
    QString                                m_chunk; // has a trailing space for convenience
    int                                    m_nChunkWords = 0;
    int                                    m_page = 0;
};
class Database : public QObject class Database : public QObject
{ {
Q_OBJECT Q_OBJECT
@ -152,6 +195,7 @@ public:
public Q_SLOTS: public Q_SLOTS:
void start(); void start();
bool scanQueueInterrupted() const;
void scanQueueBatch(); void scanQueueBatch();
void scanDocuments(int folder_id, const QString &folder_path); void scanDocuments(int folder_id, const QString &folder_path);
void forceIndexing(const QString &collection, const QString &embedding_model); void forceIndexing(const QString &collection, const QString &embedding_model);
@ -194,14 +238,12 @@ private:
void appendChunk(const EmbeddingChunk &chunk); void appendChunk(const EmbeddingChunk &chunk);
void sendChunkList(); void sendChunkList();
void updateFolderToIndex(int folder_id, size_t countForFolder, bool sendChunks = true); void updateFolderToIndex(int folder_id, size_t countForFolder, bool sendChunks = true);
void handleDocumentError(const QString &errorMessage,
int document_id, const QString &document_path, const QSqlError &error);
size_t countOfDocuments(int folder_id) const; size_t countOfDocuments(int folder_id) const;
size_t countOfBytes(int folder_id) const; size_t countOfBytes(int folder_id) const;
DocumentInfo dequeueDocument(); DocumentInfo dequeueDocument();
void removeFolderFromDocumentQueue(int folder_id); void removeFolderFromDocumentQueue(int folder_id);
void enqueueDocumentInternal(const DocumentInfo &info, bool prepend = false); void enqueueDocumentInternal(DocumentInfo &&info, bool prepend = false);
void enqueueDocuments(int folder_id, const QVector<DocumentInfo> &infos); void enqueueDocuments(int folder_id, std::list<DocumentInfo> &&infos);
void scanQueue(); void scanQueue();
bool cleanDB(); bool cleanDB();
void addFolderToWatch(const QString &path); void addFolderToWatch(const QString &path);
@ -240,8 +282,9 @@ private:
QSqlDatabase m_db; QSqlDatabase m_db;
int m_chunkSize; int m_chunkSize;
QStringList m_scannedFileExtensions; QStringList m_scannedFileExtensions;
QTimer *m_scanTimer; QTimer *m_scanIntervalTimer;
QMap<int, QQueue<DocumentInfo>> m_docsToScan; QElapsedTimer m_scanDurationTimer;
std::map<int, std::list<DocumentInfo>> m_docsToScan;
QList<ResultInfo> m_retrieve; QList<ResultInfo> m_retrieve;
QThread m_dbThread; QThread m_dbThread;
QFileSystemWatcher *m_watcher; QFileSystemWatcher *m_watcher;
@ -250,6 +293,9 @@ private:
QVector<EmbeddingChunk> m_chunkList; QVector<EmbeddingChunk> m_chunkList;
QHash<int, CollectionItem> m_collectionMap; // used only for tracking indexing/embedding progress QHash<int, CollectionItem> m_collectionMap; // used only for tracking indexing/embedding progress
std::atomic<bool> m_databaseValid; std::atomic<bool> m_databaseValid;
ChunkStreamer m_chunkStreamer;
friend class ChunkStreamer;
}; };
#endif // DATABASE_H #endif // DATABASE_H

View File

@ -55,7 +55,7 @@ static const QVariantMap basicDefaults {
{ "localdocs/chunkSize", 512 }, { "localdocs/chunkSize", 512 },
{ "localdocs/retrievalSize", 3 }, { "localdocs/retrievalSize", 3 },
{ "localdocs/showReferences", true }, { "localdocs/showReferences", true },
{ "localdocs/fileExtensions", QStringList { "txt", "pdf", "md", "rst" } }, { "localdocs/fileExtensions", QStringList { "docx", "pdf", "txt", "md", "rst" } },
{ "localdocs/useRemoteEmbed", false }, { "localdocs/useRemoteEmbed", false },
{ "localdocs/nomicAPIKey", "" }, { "localdocs/nomicAPIKey", "" },
{ "localdocs/embedDevice", "Auto" }, { "localdocs/embedDevice", "Auto" },

View File

@ -3,8 +3,8 @@
#include "chat.h" #include "chat.h"
#include "modellist.h" #include "modellist.h"
#include "mysettings.h" #include "mysettings.h"
#include "utils.h"
#include <fmt/base.h>
#include <fmt/format.h> #include <fmt/format.h>
#include <QByteArray> #include <QByteArray>
@ -25,9 +25,9 @@
#include <QVariant> #include <QVariant>
#include <Qt> #include <Qt>
#include <QtCborCommon> #include <QtCborCommon>
#include <QtGlobal>
#include <QtLogging> #include <QtLogging>
#include <algorithm>
#include <cstdint> #include <cstdint>
#include <iostream> #include <iostream>
#include <optional> #include <optional>
@ -37,26 +37,12 @@
#include <unordered_map> #include <unordered_map>
#include <utility> #include <utility>
namespace ranges = std::ranges;
using namespace std::string_literals; using namespace std::string_literals;
using namespace Qt::Literals::StringLiterals; using namespace Qt::Literals::StringLiterals;
//#define DEBUG //#define DEBUG
/*
 * MAKE_FORMATTER(type, conversion)
 *
 * Specializes fmt::formatter<type, char> by inheriting from the std::string
 * formatter, so all std::string format specs apply. `conversion` is an
 * expression that may refer to `value` (a `const type &`) and must yield
 * something the std::string formatter accepts.
 *
 * `typename` is spelled out on the dependent return type so the macro does not
 * rely on the C++20 "optional typename" rule (P0634), which older Clang
 * releases do not implement.
 */
#define MAKE_FORMATTER(type, conversion)                                          \
    template <>                                                                   \
    struct fmt::formatter<type, char>: fmt::formatter<std::string, char> {        \
        template <typename FmtContext>                                            \
        typename FmtContext::iterator format(const type &value, FmtContext &ctx) const \
        {                                                                         \
            return formatter<std::string, char>::format(conversion, ctx);         \
        }                                                                         \
    }

MAKE_FORMATTER(QString,  value.toStdString()           );
MAKE_FORMATTER(QVariant, value.toString().toStdString());
namespace { namespace {
class InvalidRequestError: public std::invalid_argument { class InvalidRequestError: public std::invalid_argument {

25
gpt4all-chat/src/utils.h Normal file
View File

@ -0,0 +1,25 @@
#pragma once
#include <fmt/base.h>
#include <fmt/format.h>
#include <QString>
#include <QVariant>
#include <string>
// fmtlib formatters for QString and QVariant

/*
 * MAKE_FORMATTER(type, conversion)
 *
 * Specializes fmt::formatter<type, char> by inheriting from the std::string
 * formatter, so all std::string format specs apply. `conversion` is an
 * expression that may refer to `value` (a `const type &`) and must yield
 * something the std::string formatter accepts.
 *
 * `typename` is spelled out on the dependent return type so the macro does not
 * rely on the C++20 "optional typename" rule (P0634), which older Clang
 * releases do not implement.
 */
#define MAKE_FORMATTER(type, conversion)                                          \
    template <>                                                                   \
    struct fmt::formatter<type, char>: fmt::formatter<std::string, char> {        \
        template <typename FmtContext>                                            \
        typename FmtContext::iterator format(const type &value, FmtContext &ctx) const \
        {                                                                         \
            return formatter<std::string, char>::format(conversion, ctx);         \
        }                                                                         \
    }

// QString is formatted via toStdString(); QVariant via toString().toStdString().
MAKE_FORMATTER(QString,  value.toStdString()           );
MAKE_FORMATTER(QVariant, value.toString().toStdString());