2023-05-22 22:13:42 -04:00
|
|
|
#ifndef DATABASE_H
|
|
|
|
#define DATABASE_H
|
|
|
|
|
|
|
|
#include <QObject>
|
|
|
|
#include <QtSql>
|
|
|
|
#include <QQueue>
|
|
|
|
#include <QFileInfo>
|
|
|
|
#include <QThread>
|
|
|
|
#include <QFileSystemWatcher>
|
|
|
|
|
2023-10-24 12:13:32 -04:00
|
|
|
class Embeddings;
|
|
|
|
class EmbeddingLLM;
|
2023-05-22 22:13:42 -04:00
|
|
|
struct DocumentInfo
|
|
|
|
{
|
|
|
|
int folder;
|
|
|
|
QFileInfo doc;
|
2023-10-24 12:13:32 -04:00
|
|
|
int currentPage = 0;
|
|
|
|
size_t currentPosition = 0;
|
|
|
|
bool currentlyProcessing = false;
|
|
|
|
bool isPdf() const {
|
|
|
|
return doc.suffix() == QLatin1String("pdf");
|
|
|
|
}
|
2023-05-22 22:13:42 -04:00
|
|
|
};
|
|
|
|
|
2023-05-24 14:49:43 -04:00
|
|
|
struct ResultInfo {
|
|
|
|
QString file; // [Required] The name of the file, but not the full path
|
|
|
|
QString title; // [Optional] The title of the document
|
|
|
|
QString author; // [Optional] The author of the document
|
|
|
|
QString date; // [Required] The creation or the last modification date whichever is latest
|
|
|
|
QString text; // [Required] The text actually used in the augmented context
|
|
|
|
int page = -1; // [Optional] The page where the text was found
|
|
|
|
int from = -1; // [Optional] The line number where the text begins
|
|
|
|
int to = -1; // [Optional] The line number where the text ends
|
|
|
|
};
|
|
|
|
|
2023-05-22 22:13:42 -04:00
|
|
|
struct CollectionItem {
|
|
|
|
QString collection;
|
|
|
|
QString folder_path;
|
|
|
|
int folder_id = -1;
|
2023-06-03 10:08:59 -04:00
|
|
|
bool installed = false;
|
2023-10-24 12:13:32 -04:00
|
|
|
bool indexing = false;
|
|
|
|
int currentDocsToIndex = 0;
|
|
|
|
int totalDocsToIndex = 0;
|
|
|
|
size_t currentBytesToIndex = 0;
|
|
|
|
size_t totalBytesToIndex = 0;
|
2023-05-22 22:13:42 -04:00
|
|
|
};
|
|
|
|
Q_DECLARE_METATYPE(CollectionItem)
|
|
|
|
|
|
|
|
class Database : public QObject
|
|
|
|
{
|
|
|
|
Q_OBJECT
|
|
|
|
public:
|
2023-05-23 20:26:31 -04:00
|
|
|
Database(int chunkSize);
|
2023-10-24 12:13:32 -04:00
|
|
|
virtual ~Database();
|
2023-05-22 22:13:42 -04:00
|
|
|
|
|
|
|
public Q_SLOTS:
|
|
|
|
void scanQueue();
|
|
|
|
void scanDocuments(int folder_id, const QString &folder_path);
|
|
|
|
void addFolder(const QString &collection, const QString &path);
|
|
|
|
void removeFolder(const QString &collection, const QString &path);
|
2023-06-01 14:13:12 -04:00
|
|
|
void retrieveFromDB(const QList<QString> &collections, const QString &text, int retrievalSize, QList<ResultInfo> *results);
|
2023-05-22 22:13:42 -04:00
|
|
|
void cleanDB();
|
2023-05-23 20:26:31 -04:00
|
|
|
void changeChunkSize(int chunkSize);
|
2023-05-22 22:13:42 -04:00
|
|
|
|
|
|
|
Q_SIGNALS:
|
|
|
|
void docsToScanChanged();
|
2023-10-24 12:13:32 -04:00
|
|
|
void updateInstalled(int folder_id, bool b);
|
|
|
|
void updateIndexing(int folder_id, bool b);
|
|
|
|
void updateCurrentDocsToIndex(int folder_id, size_t currentDocsToIndex);
|
|
|
|
void updateTotalDocsToIndex(int folder_id, size_t totalDocsToIndex);
|
|
|
|
void subtractCurrentBytesToIndex(int folder_id, size_t subtractedBytes);
|
|
|
|
void updateCurrentBytesToIndex(int folder_id, size_t currentBytesToIndex);
|
|
|
|
void updateTotalBytesToIndex(int folder_id, size_t totalBytesToIndex);
|
|
|
|
void addCollectionItem(const CollectionItem &item);
|
|
|
|
void removeFolderById(int folder_id);
|
|
|
|
void removeCollectionItem(const QString &collectionName);
|
2023-05-22 22:13:42 -04:00
|
|
|
void collectionListUpdated(const QList<CollectionItem> &collectionList);
|
|
|
|
|
|
|
|
private Q_SLOTS:
|
|
|
|
void start();
|
|
|
|
void directoryChanged(const QString &path);
|
|
|
|
bool addFolderToWatch(const QString &path);
|
|
|
|
bool removeFolderFromWatch(const QString &path);
|
|
|
|
void addCurrentFolders();
|
|
|
|
|
|
|
|
private:
|
|
|
|
void removeFolderInternal(const QString &collection, int folder_id, const QString &path);
|
2023-10-24 12:13:32 -04:00
|
|
|
size_t chunkStream(QTextStream &stream, int document_id, const QString &file,
|
|
|
|
const QString &title, const QString &author, const QString &subject, const QString &keywords, int page,
|
|
|
|
int maxChunks = -1);
|
|
|
|
void removeEmbeddingsByDocumentId(int document_id);
|
|
|
|
void scheduleNext(int folder_id, size_t countForFolder);
|
|
|
|
void handleDocumentError(const QString &errorMessage,
|
2023-05-22 22:13:42 -04:00
|
|
|
int document_id, const QString &document_path, const QSqlError &error);
|
2023-10-24 12:13:32 -04:00
|
|
|
size_t countOfDocuments(int folder_id) const;
|
|
|
|
size_t countOfBytes(int folder_id) const;
|
|
|
|
DocumentInfo dequeueDocument();
|
|
|
|
void removeFolderFromDocumentQueue(int folder_id);
|
|
|
|
void enqueueDocumentInternal(const DocumentInfo &info, bool prepend = false);
|
|
|
|
void enqueueDocuments(int folder_id, const QVector<DocumentInfo> &infos);
|
2023-05-22 22:13:42 -04:00
|
|
|
|
|
|
|
private:
|
2023-05-23 20:26:31 -04:00
|
|
|
int m_chunkSize;
|
2023-10-24 12:13:32 -04:00
|
|
|
QMap<int, QQueue<DocumentInfo>> m_docsToScan;
|
2023-05-24 14:49:43 -04:00
|
|
|
QList<ResultInfo> m_retrieve;
|
2023-05-22 22:13:42 -04:00
|
|
|
QThread m_dbThread;
|
|
|
|
QFileSystemWatcher *m_watcher;
|
2023-10-24 12:13:32 -04:00
|
|
|
EmbeddingLLM *m_embLLM;
|
|
|
|
Embeddings *m_embeddings;
|
2023-05-22 22:13:42 -04:00
|
|
|
};
|
|
|
|
|
|
|
|
#endif // DATABASE_H
|