mirror of
https://github.com/nomic-ai/gpt4all.git
synced 2024-10-01 01:06:10 -04:00
localdocs: implement .docx support (#2986)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
This commit is contained in:
parent
ea1ade8668
commit
e190fd0204
3
.gitmodules
vendored
3
.gitmodules
vendored
@ -11,3 +11,6 @@
|
||||
[submodule "gpt4all-chat/deps/fmt"]
|
||||
path = gpt4all-chat/deps/fmt
|
||||
url = https://github.com/fmtlib/fmt.git
|
||||
[submodule "gpt4all-chat/deps/DuckX"]
|
||||
path = gpt4all-chat/deps/DuckX
|
||||
url = https://github.com/nomic-ai/DuckX.git
|
||||
|
@ -8,6 +8,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
|
||||
|
||||
### Added
|
||||
- Add bm25 hybrid search to localdocs ([#2969](https://github.com/nomic-ai/gpt4all/pull/2969))
|
||||
- LocalDocs support for .docx files ([#2986](https://github.com/nomic-ai/gpt4all/pull/2986))
|
||||
|
||||
### Changed
|
||||
- Rebase llama.cpp on latest upstream as of September 26th ([#2998](https://github.com/nomic-ai/gpt4all/pull/2998))
|
||||
|
@ -88,12 +88,7 @@ message(STATUS "Qt 6 root directory: ${Qt6_ROOT_DIR}")
|
||||
|
||||
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
|
||||
|
||||
set(FMT_INSTALL OFF)
|
||||
set(BUILD_SHARED_LIBS_SAVED "${BUILD_SHARED_LIBS}")
|
||||
set(BUILD_SHARED_LIBS OFF)
|
||||
add_subdirectory(deps/fmt)
|
||||
set(BUILD_SHARED_LIBS "${BUILD_SHARED_LIBS_SAVED}")
|
||||
|
||||
add_subdirectory(deps)
|
||||
add_subdirectory(../gpt4all-backend llmodel)
|
||||
|
||||
set(CHAT_EXE_RESOURCES)
|
||||
@ -133,9 +128,6 @@ if (APPLE)
|
||||
list(APPEND CHAT_EXE_RESOURCES "${LOCAL_EMBEDDING_MODEL_PATH}")
|
||||
endif()
|
||||
|
||||
set(QAPPLICATION_CLASS QGuiApplication)
|
||||
add_subdirectory(deps/SingleApplication)
|
||||
|
||||
if (DEFINED GGML_METALLIB)
|
||||
set_source_files_properties("${GGML_METALLIB}" PROPERTIES GENERATED ON)
|
||||
endif()
|
||||
@ -335,7 +327,7 @@ target_include_directories(chat PRIVATE deps/usearch/include
|
||||
target_link_libraries(chat
|
||||
PRIVATE Qt6::Core Qt6::HttpServer Qt6::Pdf Qt6::Quick Qt6::Sql Qt6::Svg)
|
||||
target_link_libraries(chat
|
||||
PRIVATE llmodel SingleApplication fmt::fmt)
|
||||
PRIVATE llmodel SingleApplication fmt::fmt duckx::duckx)
|
||||
|
||||
|
||||
# -- install --
|
||||
|
10
gpt4all-chat/deps/CMakeLists.txt
Normal file
10
gpt4all-chat/deps/CMakeLists.txt
Normal file
@ -0,0 +1,10 @@
|
||||
# Vendored third-party dependencies of the chat application.
# BUILD_SHARED_LIBS is CMake's switch for the default library type; turning it off here makes
# add_library() calls without an explicit type in the subdirectories below produce static libraries.
set(BUILD_SHARED_LIBS OFF)
|
||||
|
||||
# fmt -- NOTE(review): FMT_INSTALL presumably disables fmt's own install rules; confirm in fmt's CMakeLists.
set(FMT_INSTALL OFF)
|
||||
add_subdirectory(fmt)
|
||||
|
||||
# SingleApplication -- NOTE(review): QAPPLICATION_CLASS presumably selects the Qt application base class
# the library derives from; confirm against the SingleApplication README.
set(QAPPLICATION_CLASS QGuiApplication)
|
||||
add_subdirectory(SingleApplication)
|
||||
|
||||
# DuckX (.docx reader) -- NOTE(review): DUCKX_INSTALL presumably disables DuckX's install rules; confirm.
set(DUCKX_INSTALL OFF)
|
||||
add_subdirectory(DuckX)
|
1
gpt4all-chat/deps/DuckX
Submodule
1
gpt4all-chat/deps/DuckX
Submodule
@ -0,0 +1 @@
|
||||
Subproject commit 6e31dfb280e2107fbf4f6a15098c38b014f1bbcc
|
@ -1 +1 @@
|
||||
Subproject commit 22cfa3bd00ea542132ee826cdb220f9d6434bd43
|
||||
Subproject commit 1f0618a86f9dbb7386237241cee96cc425dd7b55
|
@ -70,7 +70,7 @@ MySettingsTab {
|
||||
/* Blacklist common unsupported file extensions. We only support plain text, PDFs, and .docx files, and although we
|
||||
* reject binary data, we don't want to waste time trying to index files that we don't support. */
|
||||
exts = exts.filter(e => ![
|
||||
/* Microsoft documents */ "rtf", "docx", "ppt", "pptx", "xls", "xlsx",
|
||||
/* Microsoft documents */ "rtf", "ppt", "pptx", "xls", "xlsx",
|
||||
/* OpenOffice */ "odt", "ods", "odp", "odg",
|
||||
/* photos */ "jpg", "jpeg", "png", "gif", "bmp", "tif", "tiff", "webp",
|
||||
/* audio */ "mp3", "wma", "m4a", "wav", "flac",
|
||||
|
@ -1,13 +1,15 @@
|
||||
#include "database.h"
|
||||
|
||||
#include "mysettings.h"
|
||||
#include "utils.h"
|
||||
|
||||
#include <duckx/duckx.hpp>
|
||||
#include <fmt/format.h>
|
||||
#include <usearch/index_plugins.hpp>
|
||||
|
||||
#include <QDebug>
|
||||
#include <QDir>
|
||||
#include <QDirIterator>
|
||||
#include <QElapsedTimer>
|
||||
#include <QFile>
|
||||
#include <QFileSystemWatcher>
|
||||
#include <QIODevice>
|
||||
@ -18,16 +20,16 @@
|
||||
#include <QSqlQuery>
|
||||
#include <QTextStream>
|
||||
#include <QTimer>
|
||||
#include <QMap>
|
||||
#include <QUtf8StringView>
|
||||
#include <QVariant>
|
||||
#include <Qt>
|
||||
#include <QtGlobal>
|
||||
#include <QtLogging>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <optional>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
#include <stdexcept>
|
||||
|
||||
using namespace Qt::Literals::StringLiterals;
|
||||
namespace us = unum::usearch;
|
||||
@ -991,10 +993,11 @@ Database::Database(int chunkSize, QStringList extensions)
|
||||
: QObject(nullptr)
|
||||
, m_chunkSize(chunkSize)
|
||||
, m_scannedFileExtensions(std::move(extensions))
|
||||
, m_scanTimer(new QTimer(this))
|
||||
, m_scanIntervalTimer(new QTimer(this))
|
||||
, m_watcher(new QFileSystemWatcher(this))
|
||||
, m_embLLM(new EmbeddingLLM)
|
||||
, m_databaseValid(true)
|
||||
, m_chunkStreamer(this)
|
||||
{
|
||||
m_db = QSqlDatabase::database(QSqlDatabase::defaultConnection, false);
|
||||
if (!m_db.isValid())
|
||||
@ -1080,87 +1083,345 @@ void Database::updateFolderToIndex(int folder_id, size_t countForFolder, bool se
|
||||
updateGuiForCollectionItem(item);
|
||||
}
|
||||
|
||||
void Database::handleDocumentError(const QString &errorMessage,
|
||||
int document_id, const QString &document_path, const QSqlError &error)
|
||||
static void handleDocumentError(const QString &errorMessage, int document_id, const QString &document_path,
|
||||
const QSqlError &error)
|
||||
{
|
||||
qWarning() << errorMessage << document_id << document_path << error;
|
||||
}
|
||||
|
||||
size_t Database::chunkStream(QTextStream &stream, int folder_id, int document_id, const QString &embedding_model,
|
||||
const QString &file, const QString &title, const QString &author, const QString &subject, const QString &keywords,
|
||||
int page, int maxChunks)
|
||||
class DocumentReader {
|
||||
public:
|
||||
static std::unique_ptr<DocumentReader> fromDocument(const DocumentInfo &info);
|
||||
|
||||
const DocumentInfo &doc () const { return *m_info; }
|
||||
const std::optional<QString> &word () const { return m_word; }
|
||||
const std::optional<QString> &nextWord() { m_word = advance(); return m_word; }
|
||||
virtual std::optional<ChunkStreamer::Status> getError() const { return std::nullopt; }
|
||||
virtual int page() const { return -1; }
|
||||
|
||||
virtual ~DocumentReader() = default;
|
||||
|
||||
protected:
|
||||
explicit DocumentReader(const DocumentInfo &info)
|
||||
: m_info(&info) {}
|
||||
|
||||
void postInit() { m_word = advance(); }
|
||||
|
||||
virtual std::optional<QString> advance() = 0;
|
||||
|
||||
const DocumentInfo *m_info;
|
||||
std::optional<QString> m_word;
|
||||
};
|
||||
|
||||
namespace {
|
||||
|
||||
class PdfDocumentReader final : public DocumentReader {
|
||||
public:
|
||||
explicit PdfDocumentReader(const DocumentInfo &info)
|
||||
: DocumentReader(info)
|
||||
{
|
||||
QString path = info.file.canonicalFilePath();
|
||||
if (m_doc.load(path) != QPdfDocument::Error::None)
|
||||
throw std::runtime_error(fmt::format("Failed to load PDF: {}", path));
|
||||
postInit();
|
||||
}
|
||||
|
||||
int page() const override { return m_currentPage; }
|
||||
|
||||
private:
|
||||
std::optional<QString> advance() override
|
||||
{
|
||||
QString word;
|
||||
do {
|
||||
while (!m_stream || m_stream->atEnd()) {
|
||||
if (m_currentPage >= m_doc.pageCount())
|
||||
return std::nullopt;
|
||||
m_pageText = m_doc.getAllText(m_currentPage++).text();
|
||||
m_stream.emplace(&m_pageText);
|
||||
}
|
||||
*m_stream >> word;
|
||||
} while (word.isEmpty());
|
||||
return word;
|
||||
}
|
||||
|
||||
QPdfDocument m_doc;
|
||||
int m_currentPage = 0;
|
||||
QString m_pageText;
|
||||
std::optional<QTextStream> m_stream;
|
||||
};
|
||||
|
||||
class WordDocumentReader final : public DocumentReader {
|
||||
public:
|
||||
explicit WordDocumentReader(const DocumentInfo &info)
|
||||
: DocumentReader(info)
|
||||
, m_doc(info.file.canonicalFilePath().toStdString())
|
||||
{
|
||||
m_doc.open();
|
||||
if (!m_doc.is_open())
|
||||
throw std::runtime_error(fmt::format("Failed to open DOCX: {}", info.file.canonicalFilePath()));
|
||||
|
||||
m_paragraph = &m_doc.paragraphs();
|
||||
m_run = &m_paragraph->runs();
|
||||
postInit();
|
||||
}
|
||||
|
||||
protected:
|
||||
std::optional<QString> advance() override
|
||||
{
|
||||
// find non-space char
|
||||
qsizetype wordStart = 0;
|
||||
while (m_buffer.isEmpty() || m_buffer[wordStart].isSpace()) {
|
||||
if (m_buffer.isEmpty() && !fillBuffer())
|
||||
return std::nullopt;
|
||||
if (m_buffer[wordStart].isSpace() && ++wordStart >= m_buffer.size()) {
|
||||
m_buffer.clear();
|
||||
wordStart = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// find space char
|
||||
qsizetype wordEnd = wordStart + 1;
|
||||
while (wordEnd >= m_buffer.size() || !m_buffer[wordEnd].isSpace()) {
|
||||
if (wordEnd >= m_buffer.size() && !fillBuffer())
|
||||
return std::nullopt;
|
||||
if (!m_buffer[wordEnd].isSpace())
|
||||
++wordEnd;
|
||||
}
|
||||
|
||||
auto size = wordEnd - wordStart;
|
||||
QString word = std::move(m_buffer);
|
||||
m_buffer = word.sliced(wordStart + size);
|
||||
if (wordStart == 0)
|
||||
word.resize(size);
|
||||
else
|
||||
word = word.sliced(wordStart, size);
|
||||
|
||||
return word;
|
||||
}
|
||||
|
||||
bool fillBuffer()
|
||||
{
|
||||
for (;;) {
|
||||
// get a run
|
||||
while (!m_run->has_next()) {
|
||||
// try next paragraph
|
||||
if (!m_paragraph->has_next())
|
||||
return false;
|
||||
m_paragraph->next();
|
||||
m_buffer += u'\n';
|
||||
}
|
||||
auto &run = m_run->get_node();
|
||||
const char *text = run.child("w:t").text().get();
|
||||
if (!*text && run.child("w:tab"))
|
||||
text = "\t";
|
||||
m_run->next();
|
||||
if (*text) {
|
||||
m_buffer += QUtf8StringView(text);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
duckx::Document m_doc;
|
||||
duckx::Paragraph *m_paragraph;
|
||||
duckx::Run *m_run;
|
||||
QString m_buffer;
|
||||
};
|
||||
|
||||
class TxtDocumentReader final : public DocumentReader {
|
||||
public:
|
||||
explicit TxtDocumentReader(const DocumentInfo &info)
|
||||
: DocumentReader(info)
|
||||
, m_file(info.file.canonicalFilePath())
|
||||
{
|
||||
if (!m_file.open(QIODevice::ReadOnly))
|
||||
throw std::runtime_error(fmt::format("Failed to open text file: {}", m_file.fileName()));
|
||||
|
||||
m_stream.setDevice(&m_file);
|
||||
postInit();
|
||||
}
|
||||
|
||||
protected:
|
||||
std::optional<QString> advance() override
|
||||
{
|
||||
while (!m_stream.atEnd()) {
|
||||
QString word;
|
||||
m_stream >> word;
|
||||
if (!word.isEmpty())
|
||||
return word;
|
||||
}
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
std::optional<ChunkStreamer::Status> getError() const override
|
||||
{
|
||||
if (!m_file.error())
|
||||
return std::nullopt;
|
||||
return m_file.binarySeen() ? ChunkStreamer::Status::BINARY_SEEN : ChunkStreamer::Status::ERROR;
|
||||
}
|
||||
|
||||
BinaryDetectingFile m_file;
|
||||
QTextStream m_stream;
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
std::unique_ptr<DocumentReader> DocumentReader::fromDocument(const DocumentInfo &doc)
|
||||
{
|
||||
if (doc.isPdf())
|
||||
return std::make_unique<PdfDocumentReader>(doc);
|
||||
if (doc.isDocx())
|
||||
return std::make_unique<WordDocumentReader>(doc);
|
||||
return std::make_unique<TxtDocumentReader>(doc);
|
||||
}
|
||||
|
||||
ChunkStreamer::ChunkStreamer(Database *database)
|
||||
: m_database(database) {}
|
||||
|
||||
ChunkStreamer::~ChunkStreamer() = default;
|
||||
|
||||
void ChunkStreamer::setDocument(const DocumentInfo &doc, int documentId, const QString &embeddingModel,
|
||||
const QString &title, const QString &author, const QString &subject,
|
||||
const QString &keywords)
|
||||
{
|
||||
auto docKey = doc.key();
|
||||
if (!m_docKey || *m_docKey != docKey) {
|
||||
m_docKey = docKey;
|
||||
m_reader = DocumentReader::fromDocument(doc);
|
||||
m_documentId = documentId;
|
||||
m_embeddingModel = embeddingModel;
|
||||
m_title = title;
|
||||
m_author = author;
|
||||
m_subject = subject;
|
||||
m_keywords = keywords;
|
||||
m_chunk.clear();
|
||||
m_page = 0;
|
||||
|
||||
// make sure the document doesn't already have any chunks
|
||||
QSqlQuery q(m_database->m_db);
|
||||
if (!removeChunksByDocumentId(q, documentId))
|
||||
handleDocumentError("ERROR: Cannot remove chunks of document", documentId, doc.file.canonicalPath(), q.lastError());
|
||||
}
|
||||
}
|
||||
|
||||
/// Read words from the current document and emit chunks of at most m_chunkSize characters until
/// the document ends, the reader reports an error, or the scan time budget is used up.
///
/// Each emitted chunk is inserted into the database and queued for embedding. m_chunk carries a
/// partially built chunk (words each followed by one space) across calls, so an interrupted scan
/// continues seamlessly on the next call.
///
/// @return DOC_COMPLETE when every word was consumed, INTERRUPTED when the time budget ran out,
///         or the reader's error status (ERROR / BINARY_SEEN).
ChunkStreamer::Status ChunkStreamer::step()
{
    // TODO: implement line_from/line_to
    constexpr int line_from = -1;
    constexpr int line_to = -1;
    const int folderId = m_reader->doc().folder;
    const int maxChunkSize = m_database->m_chunkSize;
    // hard-split position for a single overlong word; at least 1 so that a split always makes progress
    const qsizetype splitSize = std::max<qsizetype>(maxChunkSize, 1);
    int nChunks = 0;
    int nAddedWords = 0;
    Status retval;

    for (;;) {
        if (auto error = m_reader->getError())
            return *error;
        if (m_database->scanQueueInterrupted()) {
            retval = Status::INTERRUPTED;
            break;
        }

        // get a word, if needed
        std::optional<QString> word = QString(); // empty string to disable EOF logic
        if (m_chunk.length() < maxChunkSize + 1) {
            word = m_reader->word();
            if (m_chunk.isEmpty())
                m_page = m_reader->page(); // page number of first word

            if (word) {
                m_chunk += *word;
                m_chunk += u' ';
                m_reader->nextWord();
                m_nChunkWords++;
            }
        }

        if (!word || m_chunk.length() >= maxChunkSize + 1) { // +1 for trailing space
            if (!m_chunk.isEmpty()) {
                int nThisChunkWords = 0;
                QString chunk; // text to emit, without trailing space

                if (m_chunk.length() > maxChunkSize + 1) {
                    // overlength: look for the space in front of the last word (skipping the trailing one)
                    const qsizetype lastSpace = m_chunk.lastIndexOf(u' ', -2);
                    if (lastSpace >= 0) {
                        // slice off the last word and keep it (with its trailing space) for the next chunk
                        Q_ASSERT(m_nChunkWords >= 1);
                        chunk = m_chunk.sliced(0, lastSpace);
                        m_chunk = m_chunk.sliced(lastSpace + 1);
                        nThisChunkWords = m_nChunkWords - 1;
                        m_nChunkWords = 1;
                    } else {
                        // a single overlong word: cut it and keep the rest; the word is counted once, here
                        chunk = m_chunk.sliced(0, splitSize);
                        m_chunk = m_chunk.sliced(splitSize);
                        nThisChunkWords = m_nChunkWords;
                        m_nChunkWords = 0;
                    }
                } else {
                    // everything fits: emit it all, nothing is carried over
                    chunk = m_chunk.chopped(1); // strip trailing space
                    m_chunk.clear();
                    nThisChunkWords = m_nChunkWords;
                    m_nChunkWords = 0;
                }
                Q_ASSERT(maxChunkSize < 1 || chunk.length() <= maxChunkSize);

                QSqlQuery q(m_database->m_db);
                int chunkId = 0;
                if (!addChunk(q,
                    m_documentId,
                    chunk,
                    m_reader->doc().file.canonicalFilePath(),
                    m_title,
                    m_author,
                    m_subject,
                    m_keywords,
                    m_page,
                    line_from,
                    line_to,
                    nThisChunkWords,
                    &chunkId
                )) {
                    qWarning() << "ERROR: Could not insert chunk into db" << q.lastError();
                }

                nAddedWords += nThisChunkWords;

                // queue the same text that was stored in the database for embedding
                EmbeddingChunk toEmbed;
                toEmbed.model = m_embeddingModel;
                toEmbed.folder_id = folderId;
                toEmbed.chunk_id = chunkId;
                toEmbed.chunk = chunk;
                m_database->appendChunk(toEmbed);
                ++nChunks;
            }

            if (!word) {
                retval = Status::DOC_COMPLETE;
                break;
            }
        }
    }

    if (nChunks) {
        CollectionItem item = m_database->guiCollectionItem(folderId);

        // Set the start update if we haven't done so already
        if (item.startUpdate <= item.lastUpdate && item.currentEmbeddingsToIndex == 0)
            m_database->setStartUpdateTime(item);

        item.currentEmbeddingsToIndex += nChunks;
        item.totalEmbeddingsToIndex += nChunks;
        item.totalWords += nAddedWords;
        m_database->updateGuiForCollectionItem(item);
    }

    return retval;
}
|
||||
|
||||
void Database::appendChunk(const EmbeddingChunk &chunk)
|
||||
@ -1238,83 +1499,82 @@ void Database::handleErrorGenerated(const QVector<EmbeddingChunk> &chunks, const
|
||||
|
||||
size_t Database::countOfDocuments(int folder_id) const
|
||||
{
|
||||
if (!m_docsToScan.contains(folder_id))
|
||||
if (auto it = m_docsToScan.find(folder_id); it != m_docsToScan.end())
|
||||
return it->second.size();
|
||||
return 0;
|
||||
return m_docsToScan.value(folder_id).size();
|
||||
}
|
||||
|
||||
size_t Database::countOfBytes(int folder_id) const
|
||||
{
|
||||
if (!m_docsToScan.contains(folder_id))
|
||||
return 0;
|
||||
if (auto it = m_docsToScan.find(folder_id); it != m_docsToScan.end()) {
|
||||
size_t totalBytes = 0;
|
||||
const QQueue<DocumentInfo> &docs = m_docsToScan.value(folder_id);
|
||||
for (const DocumentInfo &f : docs)
|
||||
totalBytes += f.doc.size();
|
||||
for (const DocumentInfo &f : it->second)
|
||||
totalBytes += f.file.size();
|
||||
return totalBytes;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/// Remove and return the first queued document of the first folder in m_docsToScan.
/// The folder's entry is erased once its queue becomes empty.
/// Precondition: at least one document is queued.
DocumentInfo Database::dequeueDocument()
{
    Q_ASSERT(!m_docsToScan.empty());
    auto firstEntry = m_docsToScan.begin();
    auto &[firstKey, queue] = *firstEntry;
    Q_ASSERT(!queue.empty());
    DocumentInfo result = std::move(queue.front());
    queue.pop_front();
    if (queue.empty())
        m_docsToScan.erase(firstEntry); // do not leave empty queues behind
    return result;
}
|
||||
|
||||
void Database::removeFolderFromDocumentQueue(int folder_id)
|
||||
{
|
||||
if (!m_docsToScan.contains(folder_id))
|
||||
return;
|
||||
m_docsToScan.remove(folder_id);
|
||||
if (auto it = m_docsToScan.find(folder_id); it != m_docsToScan.end())
|
||||
m_docsToScan.erase(it);
|
||||
}
|
||||
|
||||
void Database::enqueueDocumentInternal(const DocumentInfo &info, bool prepend)
|
||||
void Database::enqueueDocumentInternal(DocumentInfo &&info, bool prepend)
|
||||
{
|
||||
const int key = info.folder;
|
||||
if (!m_docsToScan.contains(key))
|
||||
m_docsToScan[key] = QQueue<DocumentInfo>();
|
||||
if (prepend)
|
||||
m_docsToScan[key].prepend(info);
|
||||
else
|
||||
m_docsToScan[key].enqueue(info);
|
||||
auto &queue = m_docsToScan[info.folder];
|
||||
queue.insert(prepend ? queue.begin() : queue.end(), std::move(info));
|
||||
}
|
||||
|
||||
void Database::enqueueDocuments(int folder_id, const QVector<DocumentInfo> &infos)
|
||||
void Database::enqueueDocuments(int folder_id, std::list<DocumentInfo> &&infos)
|
||||
{
|
||||
for (int i = 0; i < infos.size(); ++i)
|
||||
enqueueDocumentInternal(infos[i]);
|
||||
const size_t count = countOfDocuments(folder_id);
|
||||
// enqueue all documents
|
||||
auto &queue = m_docsToScan[folder_id];
|
||||
queue.splice(queue.end(), std::move(infos));
|
||||
|
||||
CollectionItem item = guiCollectionItem(folder_id);
|
||||
item.currentDocsToIndex = count;
|
||||
item.totalDocsToIndex = count;
|
||||
item.currentDocsToIndex = queue.size();
|
||||
item.totalDocsToIndex = queue.size();
|
||||
const size_t bytes = countOfBytes(folder_id);
|
||||
item.currentBytesToIndex = bytes;
|
||||
item.totalBytesToIndex = bytes;
|
||||
updateGuiForCollectionItem(item);
|
||||
m_scanTimer->start();
|
||||
m_scanIntervalTimer->start();
|
||||
}
|
||||
|
||||
bool Database::scanQueueInterrupted() const
|
||||
{
|
||||
return m_scanDurationTimer.elapsed() >= 100;
|
||||
}
|
||||
|
||||
void Database::scanQueueBatch()
|
||||
{
|
||||
QElapsedTimer timer;
|
||||
timer.start();
|
||||
m_scanDurationTimer.start();
|
||||
|
||||
transaction();
|
||||
|
||||
// scan for up to 100ms or until we run out of documents
|
||||
while (!m_docsToScan.isEmpty() && timer.elapsed() < 100)
|
||||
while (!m_docsToScan.empty() && !scanQueueInterrupted())
|
||||
scanQueue();
|
||||
|
||||
commit();
|
||||
|
||||
if (m_docsToScan.isEmpty())
|
||||
m_scanTimer->stop();
|
||||
if (m_docsToScan.empty())
|
||||
m_scanIntervalTimer->stop();
|
||||
}
|
||||
|
||||
void Database::scanQueue()
|
||||
@ -1324,15 +1584,15 @@ void Database::scanQueue()
|
||||
const int folder_id = info.folder;
|
||||
|
||||
// Update info
|
||||
info.doc.stat();
|
||||
info.file.stat();
|
||||
|
||||
// If the doc has since been deleted or no longer readable, then we schedule more work and return
|
||||
// leaving the cleanup for the cleanup handler
|
||||
if (!info.doc.exists() || !info.doc.isReadable())
|
||||
if (!info.file.exists() || !info.file.isReadable())
|
||||
return updateFolderToIndex(folder_id, countForFolder);
|
||||
|
||||
const qint64 document_time = info.doc.fileTime(QFile::FileModificationTime).toMSecsSinceEpoch();
|
||||
const QString document_path = info.doc.canonicalFilePath();
|
||||
const qint64 document_time = info.file.fileTime(QFile::FileModificationTime).toMSecsSinceEpoch();
|
||||
const QString document_path = info.file.canonicalFilePath();
|
||||
const bool currentlyProcessing = info.currentlyProcessing;
|
||||
|
||||
// Check and see if we already have this document
|
||||
@ -1393,104 +1653,57 @@ void Database::scanQueue()
|
||||
}
|
||||
|
||||
Q_ASSERT(document_id != -1);
|
||||
|
||||
{
|
||||
QString title, author, subject, keywords;
|
||||
if (info.isPdf()) {
|
||||
QPdfDocument doc;
|
||||
if (QPdfDocument::Error::None != doc.load(info.doc.canonicalFilePath())) {
|
||||
handleDocumentError("ERROR: Could not load pdf",
|
||||
document_id, document_path, q.lastError());
|
||||
if (doc.load(document_path) != QPdfDocument::Error::None) {
|
||||
qWarning() << "ERROR: Could not load pdf" << document_id << document_path;;
|
||||
return updateFolderToIndex(folder_id, countForFolder);
|
||||
}
|
||||
const size_t bytes = info.doc.size();
|
||||
const size_t bytesPerPage = std::floor(bytes / doc.pageCount());
|
||||
const int pageIndex = info.currentPage;
|
||||
#if defined(DEBUG)
|
||||
qDebug() << "scanning page" << pageIndex << "of" << doc.pageCount() << document_path;
|
||||
#endif
|
||||
const QPdfSelection selection = doc.getAllText(pageIndex);
|
||||
QString text = selection.text();
|
||||
QTextStream stream(&text);
|
||||
chunkStream(stream, info.folder, document_id, embedding_model, info.doc.fileName(),
|
||||
doc.metaData(QPdfDocument::MetaDataField::Title).toString(),
|
||||
doc.metaData(QPdfDocument::MetaDataField::Author).toString(),
|
||||
doc.metaData(QPdfDocument::MetaDataField::Subject).toString(),
|
||||
doc.metaData(QPdfDocument::MetaDataField::Keywords).toString(),
|
||||
pageIndex + 1
|
||||
);
|
||||
CollectionItem item = guiCollectionItem(info.folder);
|
||||
item.currentBytesToIndex -= bytesPerPage;
|
||||
updateGuiForCollectionItem(item);
|
||||
if (info.currentPage < doc.pageCount()) {
|
||||
info.currentPage += 1;
|
||||
title = doc.metaData(QPdfDocument::MetaDataField::Title).toString();
|
||||
author = doc.metaData(QPdfDocument::MetaDataField::Author).toString();
|
||||
subject = doc.metaData(QPdfDocument::MetaDataField::Subject).toString();
|
||||
keywords = doc.metaData(QPdfDocument::MetaDataField::Keywords).toString();
|
||||
// TODO(jared): metadata for Word documents?
|
||||
}
|
||||
|
||||
try {
|
||||
m_chunkStreamer.setDocument(info, document_id, embedding_model, title, author, subject, keywords);
|
||||
} catch (const std::runtime_error &e) {
|
||||
qWarning() << "LocalDocs ERROR:" << e.what();
|
||||
goto dequeue;
|
||||
}
|
||||
}
|
||||
|
||||
switch (m_chunkStreamer.step()) {
|
||||
case ChunkStreamer::Status::INTERRUPTED:
|
||||
info.currentlyProcessing = true;
|
||||
enqueueDocumentInternal(info, true /*prepend*/);
|
||||
enqueueDocumentInternal(std::move(info), /*prepend*/ true);
|
||||
return updateFolderToIndex(folder_id, countForFolder + 1);
|
||||
}
|
||||
|
||||
item.currentBytesToIndex -= bytes - (bytesPerPage * doc.pageCount());
|
||||
updateGuiForCollectionItem(item);
|
||||
} else {
|
||||
BinaryDetectingFile file(document_path);
|
||||
if (!file.open(QIODevice::ReadOnly)) {
|
||||
handleDocumentError("ERROR: Cannot open file for scanning",
|
||||
existing_id, document_path, q.lastError());
|
||||
return updateFolderToIndex(folder_id, countForFolder);
|
||||
}
|
||||
Q_ASSERT(!file.isSequential()); // we need to seek
|
||||
|
||||
const size_t bytes = info.doc.size();
|
||||
QTextStream stream(&file);
|
||||
const size_t byteIndex = info.currentPosition;
|
||||
if (byteIndex) {
|
||||
/* Read the Unicode BOM to detect the encoding. Without this, QTextStream will
|
||||
* always interpret the text as UTF-8 when byteIndex is nonzero. */
|
||||
stream.read(1);
|
||||
|
||||
if (!stream.seek(byteIndex)) {
|
||||
handleDocumentError("ERROR: Cannot seek to pos for scanning",
|
||||
existing_id, document_path, q.lastError());
|
||||
return updateFolderToIndex(folder_id, countForFolder);
|
||||
}
|
||||
}
|
||||
#if defined(DEBUG)
|
||||
qDebug() << "scanning byteIndex" << byteIndex << "of" << bytes << document_path;
|
||||
#endif
|
||||
int pos = chunkStream(stream, info.folder, document_id, embedding_model, info.doc.fileName(),
|
||||
QString() /*title*/, QString() /*author*/, QString() /*subject*/, QString() /*keywords*/, -1 /*page*/,
|
||||
100 /*maxChunks*/);
|
||||
if (pos < 0) {
|
||||
if (!file.binarySeen()) {
|
||||
handleDocumentError(u"ERROR: Failed to read file (status %1)"_s.arg(stream.status()),
|
||||
existing_id, document_path, q.lastError());
|
||||
return updateFolderToIndex(folder_id, countForFolder);
|
||||
}
|
||||
|
||||
case ChunkStreamer::Status::BINARY_SEEN:
|
||||
/* When we see a binary file, we treat it like an empty file so we know not to
|
||||
* scan it again. All existing chunks are removed, and in-progress embeddings
|
||||
* are ignored when they complete. */
|
||||
|
||||
qInfo() << "LocalDocs: Ignoring file with binary data:" << document_path;
|
||||
|
||||
// this will also ensure in-flight embeddings are ignored
|
||||
if (!removeChunksByDocumentId(q, existing_id)) {
|
||||
handleDocumentError("ERROR: Cannot remove chunks of document",
|
||||
existing_id, document_path, q.lastError());
|
||||
}
|
||||
if (!removeChunksByDocumentId(q, existing_id))
|
||||
handleDocumentError("ERROR: Cannot remove chunks of document", existing_id, document_path, q.lastError());
|
||||
updateCollectionStatistics();
|
||||
return updateFolderToIndex(folder_id, countForFolder);
|
||||
}
|
||||
file.close();
|
||||
const size_t bytesChunked = pos - byteIndex;
|
||||
CollectionItem item = guiCollectionItem(info.folder);
|
||||
item.currentBytesToIndex -= bytesChunked;
|
||||
updateGuiForCollectionItem(item);
|
||||
if (info.currentPosition < bytes) {
|
||||
info.currentPosition = pos;
|
||||
info.currentlyProcessing = true;
|
||||
enqueueDocumentInternal(info, true /*prepend*/);
|
||||
return updateFolderToIndex(folder_id, countForFolder + 1);
|
||||
}
|
||||
break;
|
||||
case ChunkStreamer::Status::ERROR:
|
||||
qWarning() << "error reading" << document_path;
|
||||
break;
|
||||
case ChunkStreamer::Status::DOC_COMPLETE:
|
||||
;
|
||||
}
|
||||
|
||||
dequeue:
|
||||
auto item = guiCollectionItem(folder_id);
|
||||
item.currentBytesToIndex -= info.file.size();
|
||||
updateGuiForCollectionItem(item);
|
||||
return updateFolderToIndex(folder_id, countForFolder);
|
||||
}
|
||||
|
||||
@ -1502,7 +1715,7 @@ void Database::scanDocuments(int folder_id, const QString &folder_path)
|
||||
|
||||
QDirIterator it(folder_path, QDir::Readable | QDir::Files | QDir::Dirs | QDir::NoDotAndDotDot,
|
||||
QDirIterator::Subdirectories);
|
||||
QVector<DocumentInfo> infos;
|
||||
std::list<DocumentInfo> infos;
|
||||
while (it.hasNext()) {
|
||||
it.next();
|
||||
QFileInfo fileInfo = it.fileInfo();
|
||||
@ -1514,17 +1727,14 @@ void Database::scanDocuments(int folder_id, const QString &folder_path)
|
||||
if (!m_scannedFileExtensions.contains(fileInfo.suffix(), Qt::CaseInsensitive))
|
||||
continue;
|
||||
|
||||
DocumentInfo info;
|
||||
info.folder = folder_id;
|
||||
info.doc = fileInfo;
|
||||
infos.append(info);
|
||||
infos.push_back({ folder_id, fileInfo });
|
||||
}
|
||||
|
||||
if (!infos.isEmpty()) {
|
||||
if (!infos.empty()) {
|
||||
CollectionItem item = guiCollectionItem(folder_id);
|
||||
item.indexing = true;
|
||||
updateGuiForCollectionItem(item);
|
||||
enqueueDocuments(folder_id, infos);
|
||||
enqueueDocuments(folder_id, std::move(infos));
|
||||
} else {
|
||||
updateFolderToIndex(folder_id, 0, false);
|
||||
}
|
||||
@ -1535,7 +1745,7 @@ void Database::start()
|
||||
connect(m_watcher, &QFileSystemWatcher::directoryChanged, this, &Database::directoryChanged);
|
||||
connect(m_embLLM, &EmbeddingLLM::embeddingsGenerated, this, &Database::handleEmbeddingsGenerated);
|
||||
connect(m_embLLM, &EmbeddingLLM::errorGenerated, this, &Database::handleErrorGenerated);
|
||||
m_scanTimer->callOnTimeout(this, &Database::scanQueueBatch);
|
||||
m_scanIntervalTimer->callOnTimeout(this, &Database::scanQueueBatch);
|
||||
|
||||
const QString modelPath = MySettings::globalInstance()->modelPath();
|
||||
QList<CollectionItem> oldCollections;
|
||||
|
@ -3,14 +3,15 @@
|
||||
|
||||
#include "embllm.h" // IWYU pragma: keep
|
||||
|
||||
#include <QByteArray>
|
||||
#include <QChar>
|
||||
#include <QDateTime>
|
||||
#include <QElapsedTimer>
|
||||
#include <QFileInfo>
|
||||
#include <QHash>
|
||||
#include <QLatin1String>
|
||||
#include <QList>
|
||||
#include <QMap>
|
||||
#include <QObject>
|
||||
#include <QQueue>
|
||||
#include <QSet>
|
||||
#include <QSqlDatabase>
|
||||
#include <QString>
|
||||
@ -18,13 +19,23 @@
|
||||
#include <QThread>
|
||||
#include <QUrl>
|
||||
#include <QVector>
|
||||
#include <QtGlobal>
|
||||
|
||||
#include <atomic>
|
||||
#include <cstddef>
|
||||
#include <list>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <optional>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
using namespace Qt::Literals::StringLiterals;
|
||||
|
||||
class Database;
|
||||
class DocumentReader;
|
||||
class QFileSystemWatcher;
|
||||
class QSqlError;
|
||||
class QSqlQuery;
|
||||
class QTextStream;
|
||||
class QTimer;
|
||||
|
||||
@ -39,14 +50,16 @@ static const int LOCALDOCS_VERSION = 3;
|
||||
|
||||
// Record describing a single file together with the integer id of the folder it belongs to.
// NOTE(review): this view interleaves removed and added diff lines, so two QFileInfo members
// (`doc`, `file`) and two isPdf() definitions appear; confirm against the real header which survive.
struct DocumentInfo
|
||||
{
|
||||
// Identity of a document: (folder, canonical file path), as produced by key().
using key_type = std::pair<int, QString>;
|
||||
|
||||
int folder;
|
||||
QFileInfo doc;
|
||||
int currentPage = 0;
|
||||
size_t currentPosition = 0;
|
||||
QFileInfo file;
|
||||
bool currentlyProcessing = false;
|
||||
// True when the suffix of `doc` equals "pdf", compared case-insensitively.
bool isPdf() const {
|
||||
return doc.suffix().compare(u"pdf"_s, Qt::CaseInsensitive) == 0;
|
||||
}
|
||||
|
||||
// Builds the key from the folder id and the canonical path of `file`.
key_type key() const { return {folder, file.canonicalFilePath()}; } // for comparison
|
||||
|
||||
// Suffix checks on `file`. QString::compare() returns 0 when the strings are equal, so the
// negation yields true on a case-insensitive match.
bool isPdf () const { return !file.suffix().compare("pdf"_L1, Qt::CaseInsensitive); }
|
||||
bool isDocx() const { return !file.suffix().compare("docx"_L1, Qt::CaseInsensitive); }
|
||||
};
|
||||
|
||||
struct ResultInfo {
|
||||
@ -141,6 +154,36 @@ struct CollectionItem {
|
||||
};
|
||||
Q_DECLARE_METATYPE(CollectionItem)
|
||||
|
||||
// Holds the state needed to turn one document into chunks: a pointer to the Database, the key
// and reader of the current document, the document's metadata strings, and the chunk being built.
// NOTE(review): only the declaration is visible here; the behavior of setDocument() and step()
// must be confirmed against the implementation file.
class ChunkStreamer {
|
||||
public:
|
||||
// Outcome reported by step().
// NOTE(review): the meaning of each value is inferred from its name only - confirm in the implementation.
enum class Status { DOC_COMPLETE, INTERRUPTED, ERROR, BINARY_SEEN };
|
||||
|
||||
// `database` is kept as a raw, non-owning-looking pointer (m_database); presumably the Database
// that embeds this object (it declares a ChunkStreamer member and befriends this class) - TODO confirm.
explicit ChunkStreamer(Database *database);
|
||||
~ChunkStreamer();
|
||||
|
||||
// Selects the document to process along with its database id, embedding model name and
// metadata (title, author, subject, keywords).
// NOTE(review): presumably these are stored in the identically named m_* members - confirm.
void setDocument(const DocumentInfo &doc, int documentId, const QString &embeddingModel, const QString &title,
|
||||
const QString &author, const QString &subject, const QString &keywords);
|
||||
|
||||
// Performs work on the current document and returns a Status.
// NOTE(review): presumably processes a bounded slice and may be called repeatedly until
// DOC_COMPLETE - verify against the caller.
Status step();
|
||||
|
||||
private:
|
||||
Database *m_database;
|
||||
// (folder, canonical path) key of the current document; see DocumentInfo::key_type.
// NOTE(review): presumably empty when no document has been set - confirm.
std::optional<DocumentInfo::key_type> m_docKey;
|
||||
std::unique_ptr<DocumentReader> m_reader; // may be invalid, always compare key first
|
||||
int m_documentId;
|
||||
QString m_embeddingModel;
|
||||
QString m_title;
|
||||
QString m_author;
|
||||
QString m_subject;
|
||||
QString m_keywords;
|
||||
bool m_atStart;
|
||||
|
||||
// working state
|
||||
QString m_chunk; // has a trailing space for convenience
|
||||
// Both counters below start at zero via in-class initializers.
int m_nChunkWords = 0;
|
||||
int m_page = 0;
|
||||
};
|
||||
|
||||
class Database : public QObject
|
||||
{
|
||||
Q_OBJECT
|
||||
@ -152,6 +195,7 @@ public:
|
||||
|
||||
public Q_SLOTS:
|
||||
void start();
|
||||
bool scanQueueInterrupted() const;
|
||||
void scanQueueBatch();
|
||||
void scanDocuments(int folder_id, const QString &folder_path);
|
||||
void forceIndexing(const QString &collection, const QString &embedding_model);
|
||||
@ -194,14 +238,12 @@ private:
|
||||
void appendChunk(const EmbeddingChunk &chunk);
|
||||
void sendChunkList();
|
||||
void updateFolderToIndex(int folder_id, size_t countForFolder, bool sendChunks = true);
|
||||
void handleDocumentError(const QString &errorMessage,
|
||||
int document_id, const QString &document_path, const QSqlError &error);
|
||||
size_t countOfDocuments(int folder_id) const;
|
||||
size_t countOfBytes(int folder_id) const;
|
||||
DocumentInfo dequeueDocument();
|
||||
void removeFolderFromDocumentQueue(int folder_id);
|
||||
void enqueueDocumentInternal(const DocumentInfo &info, bool prepend = false);
|
||||
void enqueueDocuments(int folder_id, const QVector<DocumentInfo> &infos);
|
||||
void enqueueDocumentInternal(DocumentInfo &&info, bool prepend = false);
|
||||
void enqueueDocuments(int folder_id, std::list<DocumentInfo> &&infos);
|
||||
void scanQueue();
|
||||
bool cleanDB();
|
||||
void addFolderToWatch(const QString &path);
|
||||
@ -240,8 +282,9 @@ private:
|
||||
QSqlDatabase m_db;
|
||||
int m_chunkSize;
|
||||
QStringList m_scannedFileExtensions;
|
||||
QTimer *m_scanTimer;
|
||||
QMap<int, QQueue<DocumentInfo>> m_docsToScan;
|
||||
QTimer *m_scanIntervalTimer;
|
||||
QElapsedTimer m_scanDurationTimer;
|
||||
std::map<int, std::list<DocumentInfo>> m_docsToScan;
|
||||
QList<ResultInfo> m_retrieve;
|
||||
QThread m_dbThread;
|
||||
QFileSystemWatcher *m_watcher;
|
||||
@ -250,6 +293,9 @@ private:
|
||||
QVector<EmbeddingChunk> m_chunkList;
|
||||
QHash<int, CollectionItem> m_collectionMap; // used only for tracking indexing/embedding progress
|
||||
std::atomic<bool> m_databaseValid;
|
||||
ChunkStreamer m_chunkStreamer;
|
||||
|
||||
friend class ChunkStreamer;
|
||||
};
|
||||
|
||||
#endif // DATABASE_H
|
||||
|
@ -55,7 +55,7 @@ static const QVariantMap basicDefaults {
|
||||
{ "localdocs/chunkSize", 512 },
|
||||
{ "localdocs/retrievalSize", 3 },
|
||||
{ "localdocs/showReferences", true },
|
||||
{ "localdocs/fileExtensions", QStringList { "txt", "pdf", "md", "rst" } },
|
||||
{ "localdocs/fileExtensions", QStringList { "docx", "pdf", "txt", "md", "rst" } },
|
||||
{ "localdocs/useRemoteEmbed", false },
|
||||
{ "localdocs/nomicAPIKey", "" },
|
||||
{ "localdocs/embedDevice", "Auto" },
|
||||
|
@ -3,8 +3,8 @@
|
||||
#include "chat.h"
|
||||
#include "modellist.h"
|
||||
#include "mysettings.h"
|
||||
#include "utils.h"
|
||||
|
||||
#include <fmt/base.h>
|
||||
#include <fmt/format.h>
|
||||
|
||||
#include <QByteArray>
|
||||
@ -25,9 +25,9 @@
|
||||
#include <QVariant>
|
||||
#include <Qt>
|
||||
#include <QtCborCommon>
|
||||
#include <QtGlobal>
|
||||
#include <QtLogging>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdint>
|
||||
#include <iostream>
|
||||
#include <optional>
|
||||
@ -37,26 +37,12 @@
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
|
||||
namespace ranges = std::ranges;
|
||||
using namespace std::string_literals;
|
||||
using namespace Qt::Literals::StringLiterals;
|
||||
|
||||
//#define DEBUG
|
||||
|
||||
|
||||
// MAKE_FORMATTER(type, conversion) specializes fmt::formatter<type, char> by deriving from the
// std::string formatter; its format() evaluates `conversion` (an expression that may refer to the
// parameter named `value`) and forwards the result to the std::string formatter.
// Keep comments out of the macro body: every line in it ends with a line continuation.
#define MAKE_FORMATTER(type, conversion) \
|
||||
template <> \
|
||||
struct fmt::formatter<type, char>: fmt::formatter<std::string, char> { \
|
||||
template <typename FmtContext> \
|
||||
FmtContext::iterator format(const type &value, FmtContext &ctx) const \
|
||||
{ \
|
||||
return formatter<std::string, char>::format(conversion, ctx); \
|
||||
} \
|
||||
}
|
||||
|
||||
// QString is formatted through toStdString(); QVariant is first converted with toString().
MAKE_FORMATTER(QString, value.toStdString() );
|
||||
MAKE_FORMATTER(QVariant, value.toString().toStdString());
|
||||
|
||||
namespace {
|
||||
|
||||
class InvalidRequestError: public std::invalid_argument {
|
||||
|
25
gpt4all-chat/src/utils.h
Normal file
25
gpt4all-chat/src/utils.h
Normal file
@ -0,0 +1,25 @@
|
||||
#pragma once
|
||||
|
||||
#include <fmt/base.h>
|
||||
#include <fmt/format.h>
|
||||
|
||||
#include <QString>
|
||||
#include <QVariant>
|
||||
|
||||
#include <string>
|
||||
|
||||
|
||||
// fmtlib formatters for QString and QVariant
|
||||
|
||||
// MAKE_FORMATTER(type, conversion) specializes fmt::formatter<type, char> by deriving from the
// std::string formatter; its format() evaluates `conversion` (an expression that may refer to the
// parameter named `value`) and forwards the result to the std::string formatter.
// Keep comments out of the macro body: every line in it ends with a line continuation.
#define MAKE_FORMATTER(type, conversion) \
|
||||
template <> \
|
||||
struct fmt::formatter<type, char>: fmt::formatter<std::string, char> { \
|
||||
template <typename FmtContext> \
|
||||
FmtContext::iterator format(const type &value, FmtContext &ctx) const \
|
||||
{ \
|
||||
return formatter<std::string, char>::format(conversion, ctx); \
|
||||
} \
|
||||
}
|
||||
|
||||
// QString is formatted through toStdString(); QVariant is first converted with toString().
MAKE_FORMATTER(QString, value.toStdString() );
|
||||
MAKE_FORMATTER(QVariant, value.toString().toStdString());
|
Loading…
Reference in New Issue
Block a user