2023-05-22 22:13:42 -04:00
|
|
|
#ifndef DATABASE_H
|
|
|
|
#define DATABASE_H
|
|
|
|
|
2024-06-06 11:59:28 -04:00
|
|
|
#include "embllm.h" // IWYU pragma: keep

#include <QByteArray>
#include <QDateTime>
#include <QFileInfo>
#include <QHash>
#include <QLatin1String>
#include <QList>
#include <QMap>
#include <QObject>
#include <QQueue>
#include <QSet>
#include <QSqlDatabase>
#include <QString>
#include <QStringList>
#include <QThread>
#include <QUrl>
#include <QVector>

#include <atomic>
#include <cstddef>
#include <vector>
|
2024-01-22 12:36:01 -05:00
|
|
|
|
2024-06-24 18:49:23 -04:00
|
|
|
using namespace Qt::Literals::StringLiterals;
|
|
|
|
|
2024-06-04 14:47:11 -04:00
|
|
|
class QFileSystemWatcher;
|
|
|
|
class QSqlError;
|
|
|
|
class QTextStream;
|
2024-04-25 13:16:52 -04:00
|
|
|
class QTimer;
|
|
|
|
|
2024-06-24 18:49:23 -04:00
|
|
|
/* Version 0: GPT4All v2.4.3, full-text search
|
|
|
|
* Version 1: GPT4All v2.5.3, embeddings in hnswlib
|
|
|
|
* Version 2: GPT4All v3.0.0, embeddings in sqlite */
|
|
|
|
|
|
|
|
// Minimum supported version.
// `inline constexpr` gives a single definition shared by every translation
// unit that includes this header (instead of one internal-linkage copy each).
inline constexpr int LOCALDOCS_MIN_VER = 1;

// Current version.
inline constexpr int LOCALDOCS_VERSION = 2;
|
|
|
|
|
2023-05-22 22:13:42 -04:00
|
|
|
struct DocumentInfo
|
|
|
|
{
|
|
|
|
int folder;
|
|
|
|
QFileInfo doc;
|
2023-10-24 12:13:32 -04:00
|
|
|
int currentPage = 0;
|
|
|
|
size_t currentPosition = 0;
|
|
|
|
bool currentlyProcessing = false;
|
|
|
|
bool isPdf() const {
|
2024-06-24 18:49:23 -04:00
|
|
|
return doc.suffix() == u"pdf"_s;
|
2023-10-24 12:13:32 -04:00
|
|
|
}
|
2023-05-22 22:13:42 -04:00
|
|
|
};
|
|
|
|
|
2023-05-24 14:49:43 -04:00
|
|
|
struct ResultInfo {
|
2024-06-26 14:48:02 -04:00
|
|
|
Q_GADGET
|
|
|
|
Q_PROPERTY(QString collection MEMBER collection)
|
|
|
|
Q_PROPERTY(QString path MEMBER path)
|
|
|
|
Q_PROPERTY(QString file MEMBER file)
|
|
|
|
Q_PROPERTY(QString title MEMBER title)
|
|
|
|
Q_PROPERTY(QString author MEMBER author)
|
|
|
|
Q_PROPERTY(QString date MEMBER date)
|
|
|
|
Q_PROPERTY(QString text MEMBER text)
|
|
|
|
Q_PROPERTY(int page MEMBER page)
|
|
|
|
Q_PROPERTY(int from MEMBER from)
|
|
|
|
Q_PROPERTY(int to MEMBER to)
|
|
|
|
Q_PROPERTY(QString fileUri READ fileUri STORED false)
|
|
|
|
|
|
|
|
public:
|
2024-06-24 18:49:23 -04:00
|
|
|
QString collection; // [Required] The name of the collection
|
|
|
|
QString path; // [Required] The full path
|
|
|
|
QString file; // [Required] The name of the file, but not the full path
|
|
|
|
QString title; // [Optional] The title of the document
|
|
|
|
QString author; // [Optional] The author of the document
|
|
|
|
QString date; // [Required] The creation or the last modification date whichever is latest
|
|
|
|
QString text; // [Required] The text actually used in the augmented context
|
|
|
|
int page = -1; // [Optional] The page where the text was found
|
|
|
|
int from = -1; // [Optional] The line number where the text begins
|
|
|
|
int to = -1; // [Optional] The line number where the text ends
|
|
|
|
|
2024-06-26 14:48:02 -04:00
|
|
|
QString fileUri() const {
|
|
|
|
// QUrl reserved chars that are not UNSAFE_PATH according to glib/gconvert.c
|
|
|
|
static const QByteArray s_exclude = "!$&'()*+,/:=@~"_ba;
|
|
|
|
|
|
|
|
Q_ASSERT(!QFileInfo(path).isRelative());
|
|
|
|
#ifdef Q_OS_WINDOWS
|
|
|
|
Q_ASSERT(!path.contains('\\')); // Qt normally uses forward slash as path separator
|
|
|
|
#endif
|
|
|
|
|
|
|
|
auto escaped = QString::fromUtf8(QUrl::toPercentEncoding(path, s_exclude));
|
|
|
|
if (escaped.front() != '/')
|
|
|
|
escaped = '/' + escaped;
|
|
|
|
return u"file://"_s + escaped;
|
|
|
|
}
|
|
|
|
|
2024-06-24 18:49:23 -04:00
|
|
|
bool operator==(const ResultInfo &other) const {
|
|
|
|
return file == other.file &&
|
|
|
|
title == other.title &&
|
|
|
|
author == other.author &&
|
|
|
|
date == other.date &&
|
|
|
|
text == other.text &&
|
|
|
|
page == other.page &&
|
|
|
|
from == other.from &&
|
|
|
|
to == other.to;
|
|
|
|
}
|
|
|
|
bool operator!=(const ResultInfo &other) const {
|
|
|
|
return !(*this == other);
|
|
|
|
}
|
2023-05-24 14:49:43 -04:00
|
|
|
};
|
|
|
|
|
2024-06-24 18:49:23 -04:00
|
|
|
Q_DECLARE_METATYPE(ResultInfo)
|
|
|
|
|
2023-05-22 22:13:42 -04:00
|
|
|
struct CollectionItem {
|
2024-06-24 18:49:23 -04:00
|
|
|
// -- Fields persisted to database --
|
|
|
|
|
|
|
|
int collection_id = -1;
|
|
|
|
int folder_id = -1;
|
2023-05-22 22:13:42 -04:00
|
|
|
QString collection;
|
|
|
|
QString folder_path;
|
2024-06-24 18:49:23 -04:00
|
|
|
QString embeddingModel;
|
|
|
|
|
|
|
|
// -- Transient fields --
|
|
|
|
|
2023-06-03 10:08:59 -04:00
|
|
|
bool installed = false;
|
2023-10-24 12:13:32 -04:00
|
|
|
bool indexing = false;
|
2024-06-24 18:49:23 -04:00
|
|
|
bool forceIndexing = false;
|
2024-01-22 12:36:01 -05:00
|
|
|
QString error;
|
2024-06-24 18:49:23 -04:00
|
|
|
|
|
|
|
// progress
|
2023-10-24 12:13:32 -04:00
|
|
|
int currentDocsToIndex = 0;
|
|
|
|
int totalDocsToIndex = 0;
|
|
|
|
size_t currentBytesToIndex = 0;
|
|
|
|
size_t totalBytesToIndex = 0;
|
2024-01-22 12:36:01 -05:00
|
|
|
size_t currentEmbeddingsToIndex = 0;
|
|
|
|
size_t totalEmbeddingsToIndex = 0;
|
2024-06-24 18:49:23 -04:00
|
|
|
|
|
|
|
// statistics
|
|
|
|
size_t totalDocs = 0;
|
|
|
|
size_t totalWords = 0;
|
|
|
|
size_t totalTokens = 0;
|
|
|
|
QDateTime startUpdate;
|
|
|
|
QDateTime lastUpdate;
|
|
|
|
QString fileCurrentlyProcessing;
|
2023-05-22 22:13:42 -04:00
|
|
|
};
|
|
|
|
Q_DECLARE_METATYPE(CollectionItem)
|
|
|
|
|
|
|
|
class Database : public QObject
|
|
|
|
{
|
|
|
|
Q_OBJECT
|
|
|
|
public:
|
2024-06-24 18:49:23 -04:00
|
|
|
Database(int chunkSize, QStringList extensions);
|
|
|
|
~Database() override;
|
|
|
|
|
|
|
|
bool isValid() const { return m_databaseValid; }
|
2023-05-22 22:13:42 -04:00
|
|
|
|
|
|
|
public Q_SLOTS:
|
2024-04-25 13:16:52 -04:00
|
|
|
void start();
|
2024-06-24 18:49:23 -04:00
|
|
|
void scanQueueBatch();
|
|
|
|
void scanDocuments(int folder_id, const QString &folder_path);
|
|
|
|
void forceIndexing(const QString &collection, const QString &embedding_model);
|
|
|
|
void forceRebuildFolder(const QString &path);
|
|
|
|
bool addFolder(const QString &collection, const QString &path, const QString &embedding_model);
|
2023-05-22 22:13:42 -04:00
|
|
|
void removeFolder(const QString &collection, const QString &path);
|
2023-06-01 14:13:12 -04:00
|
|
|
void retrieveFromDB(const QList<QString> &collections, const QString &text, int retrievalSize, QList<ResultInfo> *results);
|
2023-05-23 20:26:31 -04:00
|
|
|
void changeChunkSize(int chunkSize);
|
2024-06-24 18:49:23 -04:00
|
|
|
void changeFileExtensions(const QStringList &extensions);
|
2023-05-22 22:13:42 -04:00
|
|
|
|
|
|
|
Q_SIGNALS:
|
2024-06-24 18:49:23 -04:00
|
|
|
// Signals for the gui only
|
|
|
|
void requestUpdateGuiForCollectionItem(const CollectionItem &item);
|
|
|
|
void requestAddGuiCollectionItem(const CollectionItem &item);
|
|
|
|
void requestRemoveGuiFolderById(const QString &collection, int folder_id);
|
|
|
|
void requestGuiCollectionListUpdated(const QList<CollectionItem> &collectionList);
|
|
|
|
void databaseValidChanged();
|
2023-05-22 22:13:42 -04:00
|
|
|
|
|
|
|
private Q_SLOTS:
|
|
|
|
void directoryChanged(const QString &path);
|
2024-06-24 18:49:23 -04:00
|
|
|
void addCurrentFolders();
|
2024-01-22 12:36:01 -05:00
|
|
|
void handleEmbeddingsGenerated(const QVector<EmbeddingResult> &embeddings);
|
2024-06-24 18:49:23 -04:00
|
|
|
void handleErrorGenerated(const QVector<EmbeddingChunk> &chunks, const QString &error);
|
2023-05-22 22:13:42 -04:00
|
|
|
|
|
|
|
private:
|
2024-06-24 18:49:23 -04:00
|
|
|
void transaction();
|
|
|
|
void commit();
|
|
|
|
void rollback();
|
|
|
|
|
|
|
|
bool hasContent();
|
|
|
|
// not found -> 0, , exists and has content -> 1, error -> -1
|
|
|
|
int openDatabase(const QString &modelPath, bool create = true, int ver = LOCALDOCS_VERSION);
|
|
|
|
bool openLatestDb(const QString &modelPath, QList<CollectionItem> &oldCollections);
|
|
|
|
bool initDb(const QString &modelPath, const QList<CollectionItem> &oldCollections);
|
|
|
|
int checkAndAddFolderToDB(const QString &path);
|
|
|
|
bool removeFolderInternal(const QString &collection, int folder_id, const QString &path);
|
|
|
|
size_t chunkStream(QTextStream &stream, int folder_id, int document_id, const QString &embedding_model,
|
|
|
|
const QString &file, const QString &title, const QString &author, const QString &subject,
|
|
|
|
const QString &keywords, int page, int maxChunks = -1);
|
|
|
|
void appendChunk(const EmbeddingChunk &chunk);
|
|
|
|
void sendChunkList();
|
|
|
|
void updateFolderToIndex(int folder_id, size_t countForFolder, bool sendChunks = true);
|
2023-10-24 12:13:32 -04:00
|
|
|
void handleDocumentError(const QString &errorMessage,
|
2023-05-22 22:13:42 -04:00
|
|
|
int document_id, const QString &document_path, const QSqlError &error);
|
2023-10-24 12:13:32 -04:00
|
|
|
size_t countOfDocuments(int folder_id) const;
|
|
|
|
size_t countOfBytes(int folder_id) const;
|
|
|
|
DocumentInfo dequeueDocument();
|
|
|
|
void removeFolderFromDocumentQueue(int folder_id);
|
|
|
|
void enqueueDocumentInternal(const DocumentInfo &info, bool prepend = false);
|
|
|
|
void enqueueDocuments(int folder_id, const QVector<DocumentInfo> &infos);
|
2024-06-24 18:49:23 -04:00
|
|
|
void scanQueue();
|
|
|
|
bool cleanDB();
|
|
|
|
void addFolderToWatch(const QString &path);
|
|
|
|
void removeFolderFromWatch(const QString &path);
|
|
|
|
QList<int> searchEmbeddings(const std::vector<float> &query, const QList<QString> &collections, int nNeighbors);
|
|
|
|
|
|
|
|
void setStartUpdateTime(CollectionItem &item);
|
|
|
|
void setLastUpdateTime(CollectionItem &item);
|
|
|
|
|
|
|
|
CollectionItem guiCollectionItem(int folder_id) const;
|
|
|
|
void updateGuiForCollectionItem(const CollectionItem &item);
|
|
|
|
void addGuiCollectionItem(const CollectionItem &item);
|
|
|
|
void removeGuiFolderById(const QString &collection, int folder_id);
|
|
|
|
void guiCollectionListUpdated(const QList<CollectionItem> &collectionList);
|
|
|
|
void scheduleUncompletedEmbeddings();
|
|
|
|
void updateCollectionStatistics();
|
2023-05-22 22:13:42 -04:00
|
|
|
|
|
|
|
private:
|
2024-06-24 18:49:23 -04:00
|
|
|
QSqlDatabase m_db;
|
2023-05-23 20:26:31 -04:00
|
|
|
int m_chunkSize;
|
2024-06-24 18:49:23 -04:00
|
|
|
QStringList m_scannedFileExtensions;
|
2024-04-25 13:16:52 -04:00
|
|
|
QTimer *m_scanTimer;
|
2023-10-24 12:13:32 -04:00
|
|
|
QMap<int, QQueue<DocumentInfo>> m_docsToScan;
|
2023-05-24 14:49:43 -04:00
|
|
|
QList<ResultInfo> m_retrieve;
|
2023-05-22 22:13:42 -04:00
|
|
|
QThread m_dbThread;
|
|
|
|
QFileSystemWatcher *m_watcher;
|
2024-06-24 18:49:23 -04:00
|
|
|
QSet<QString> m_watchedPaths;
|
2023-10-24 12:13:32 -04:00
|
|
|
EmbeddingLLM *m_embLLM;
|
2024-06-24 18:49:23 -04:00
|
|
|
QVector<EmbeddingChunk> m_chunkList;
|
|
|
|
QHash<int, CollectionItem> m_collectionMap; // used only for tracking indexing/embedding progress
|
|
|
|
std::atomic<bool> m_databaseValid;
|
2023-05-22 22:13:42 -04:00
|
|
|
};
|
|
|
|
|
|
|
|
#endif // DATABASE_H
|