mirror of
https://github.com/nomic-ai/gpt4all.git
synced 2024-10-01 01:06:10 -04:00
Cleanup of the database, better chunking, better matching.
This commit is contained in:
parent
5f533e76a1
commit
54fc980cb5
@ -113,20 +113,20 @@ QStringList generateGrams(const QString &input, int N)
|
|||||||
for (int i = 0; i < words.size() - (N - 1); ++i) {
|
for (int i = 0; i < words.size() - (N - 1); ++i) {
|
||||||
QStringList currentNgram;
|
QStringList currentNgram;
|
||||||
for (int j = 0; j < N; ++j) {
|
for (int j = 0; j < N; ++j) {
|
||||||
currentNgram.append(words[i + j]);
|
currentNgram.append("\"" + words[i + j] + "\"");
|
||||||
}
|
}
|
||||||
ngrams.append("\"" + currentNgram.join(" ") + "\"");
|
ngrams.append("NEAR(" + currentNgram.join(" ") + ", " + QString::number(N) + ")");
|
||||||
}
|
}
|
||||||
return ngrams;
|
return ngrams;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool selectChunk(QSqlQuery &q, const QList<QString> &collection_names, const QString &chunk_text)
|
bool selectChunk(QSqlQuery &q, const QList<QString> &collection_names, const QString &chunk_text)
|
||||||
{
|
{
|
||||||
for (int N = 5; N > 1; N--) {
|
const int N_WORDS = chunk_text.split(QRegularExpression("\\s+")).size();
|
||||||
|
for (int N = N_WORDS; N > 2; N--) {
|
||||||
// first try trigrams
|
// first try trigrams
|
||||||
QList<QString> text = generateGrams(chunk_text, N);
|
QList<QString> text = generateGrams(chunk_text, N);
|
||||||
QString orText = text.join(" OR ");
|
QString orText = text.join(" OR ");
|
||||||
qDebug() << "before" << chunk_text << "after" << orText;
|
|
||||||
const QString collection_names_str = collection_names.join("', '");
|
const QString collection_names_str = collection_names.join("', '");
|
||||||
const QString formatted_query = SELECT_SQL.arg("'" + collection_names_str + "'");
|
const QString formatted_query = SELECT_SQL.arg("'" + collection_names_str + "'");
|
||||||
if (!q.prepare(formatted_query))
|
if (!q.prepare(formatted_query))
|
||||||
@ -135,6 +135,9 @@ bool selectChunk(QSqlQuery &q, const QList<QString> &collection_names, const QSt
|
|||||||
bool success = q.exec();
|
bool success = q.exec();
|
||||||
if (!success) return false;
|
if (!success) return false;
|
||||||
if (q.next()) {
|
if (q.next()) {
|
||||||
|
#if defined(DEBUG)
|
||||||
|
qDebug() << "hit on" << N << "before" << chunk_text << "after" << orText;
|
||||||
|
#endif
|
||||||
q.previous();
|
q.previous();
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@ -175,6 +178,10 @@ const auto SELECT_COLLECTIONS_FROM_FOLDER_SQL = QLatin1String(R"(
|
|||||||
select collection_name from collections where folder_id = ?;
|
select collection_name from collections where folder_id = ?;
|
||||||
)");
|
)");
|
||||||
|
|
||||||
|
// Query returning the collection_name and folder_id columns of every row in the collections table.
const auto SELECT_COLLECTIONS_SQL = QLatin1String(R"(
|
||||||
|
select collection_name, folder_id from collections;
|
||||||
|
)");
|
||||||
|
|
||||||
bool addCollection(QSqlQuery &q, const QString &collection_name, int folder_id)
|
bool addCollection(QSqlQuery &q, const QString &collection_name, int folder_id)
|
||||||
{
|
{
|
||||||
if (!q.prepare(INSERT_COLLECTION_SQL))
|
if (!q.prepare(INSERT_COLLECTION_SQL))
|
||||||
@ -215,6 +222,16 @@ bool selectCollectionsFromFolder(QSqlQuery &q, int folder_id, QList<QString> *co
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Appends one (collection_name, folder_id) pair to *collections for every row
// returned by SELECT_COLLECTIONS_SQL, in the order the rows are returned.
//
// q           - query object used to run the statement; left positioned after
//               the last row on success.
// collections - output list; existing entries are kept, new ones are appended.
//
// Returns false if the statement could not be prepared or executed (the caller
// can inspect q.lastError()); true otherwise, even when no rows were found.
bool selectAllFromCollections(QSqlQuery &q, QList<QPair<QString, int>> *collections) {
    const bool executed = q.prepare(SELECT_COLLECTIONS_SQL) && q.exec();
    if (!executed)
        return false;

    while (q.next()) {
        const QString collectionName = q.value(0).toString();
        const int folderId = q.value(1).toInt();
        collections->append(qMakePair(collectionName, folderId));
    }
    return true;
}
|
||||||
|
|
||||||
const auto INSERT_FOLDERS_SQL = QLatin1String(R"(
|
const auto INSERT_FOLDERS_SQL = QLatin1String(R"(
|
||||||
insert into folders(folder_path) values(?);
|
insert into folders(folder_path) values(?);
|
||||||
)");
|
)");
|
||||||
@ -223,10 +240,14 @@ const auto DELETE_FOLDERS_SQL = QLatin1String(R"(
|
|||||||
delete from folders where id = ?;
|
delete from folders where id = ?;
|
||||||
)");
|
)");
|
||||||
|
|
||||||
const auto SELECT_FOLDERS_SQL = QLatin1String(R"(
|
const auto SELECT_FOLDERS_FROM_PATH_SQL = QLatin1String(R"(
|
||||||
select id from folders where folder_path = ?;
|
select id from folders where folder_path = ?;
|
||||||
)");
|
)");
|
||||||
|
|
||||||
|
// Query looking up the folder_path of the folders row whose id matches the single bound parameter.
const auto SELECT_FOLDERS_FROM_ID_SQL = QLatin1String(R"(
|
||||||
|
select folder_path from folders where id = ?;
|
||||||
|
)");
|
||||||
|
|
||||||
const auto FOLDERS_SQL = QLatin1String(R"(
|
const auto FOLDERS_SQL = QLatin1String(R"(
|
||||||
create table folders(id integer primary key, folder_path varchar unique);
|
create table folders(id integer primary key, folder_path varchar unique);
|
||||||
)");
|
)");
|
||||||
@ -250,7 +271,7 @@ bool removeFolderFromDB(QSqlQuery &q, int folder_id) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
bool selectFolder(QSqlQuery &q, const QString &folder_path, int *id) {
|
bool selectFolder(QSqlQuery &q, const QString &folder_path, int *id) {
|
||||||
if (!q.prepare(SELECT_FOLDERS_SQL))
|
if (!q.prepare(SELECT_FOLDERS_FROM_PATH_SQL))
|
||||||
return false;
|
return false;
|
||||||
q.addBindValue(folder_path);
|
q.addBindValue(folder_path);
|
||||||
if (!q.exec())
|
if (!q.exec())
|
||||||
@ -261,6 +282,18 @@ bool selectFolder(QSqlQuery &q, const QString &folder_path, int *id) {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Looks up the path stored for folder `id` using SELECT_FOLDERS_FROM_ID_SQL.
//
// q           - query object used to run the statement.
// id          - value bound to the statement's single placeholder.
// folder_path - output; overwritten with the first column of the first row
//               when a row is found, otherwise left untouched.
//
// Returns false only when the statement fails to prepare or execute. A lookup
// that matches no row still returns true, so callers that need to distinguish
// "not found" must check whether *folder_path was filled in.
bool selectFolder(QSqlQuery &q, int id, QString *folder_path) {
    if (!q.prepare(SELECT_FOLDERS_FROM_ID_SQL))
        return false;

    q.addBindValue(id);
    const bool executed = q.exec();
    if (!executed)
        return false;

    // At most one row is expected for a given id.
    // NOTE(review): QSqlQuery::size() reports -1 on drivers that cannot
    // determine the result size, in which case this check is vacuous - confirm
    // against the driver in use.
    Q_ASSERT(q.size() < 2);

    const bool found = q.next();
    if (found)
        *folder_path = q.value(0).toString();
    return true;
}
|
||||||
|
|
||||||
const auto INSERT_DOCUMENTS_SQL = QLatin1String(R"(
|
const auto INSERT_DOCUMENTS_SQL = QLatin1String(R"(
|
||||||
insert into documents(folder_id, document_time, document_path) values(?, ?, ?);
|
insert into documents(folder_id, document_time, document_path) values(?, ?, ?);
|
||||||
)");
|
)");
|
||||||
@ -285,6 +318,10 @@ const auto SELECT_DOCUMENTS_SQL = QLatin1String(R"(
|
|||||||
select id from documents where folder_id = ?;
|
select id from documents where folder_id = ?;
|
||||||
)");
|
)");
|
||||||
|
|
||||||
|
// Query returning the id and document_path columns of every row in the documents table.
const auto SELECT_ALL_DOCUMENTS_SQL = QLatin1String(R"(
|
||||||
|
select id, document_path from documents;
|
||||||
|
)");
|
||||||
|
|
||||||
bool addDocument(QSqlQuery &q, int folder_id, qint64 document_time, const QString &document_path, int *document_id)
|
bool addDocument(QSqlQuery &q, int folder_id, qint64 document_time, const QString &document_path, int *document_id)
|
||||||
{
|
{
|
||||||
if (!q.prepare(INSERT_DOCUMENTS_SQL))
|
if (!q.prepare(INSERT_DOCUMENTS_SQL))
|
||||||
@ -441,22 +478,30 @@ void Database::handleDocumentErrorAndScheduleNext(const QString &errorMessage,
|
|||||||
|
|
||||||
// Splits the text in `stream` into word-aligned chunks and stores each one in
// the database via addChunk().
//
// Words are read one whitespace-delimited token at a time and accumulated
// until the chunk (words joined by single spaces) reaches chunkSize
// characters, or the stream is exhausted; the accumulated words are then
// written as one chunk. Chunk ids are numbered from 1 within the document.
//
// stream      - text source; consumed until atEnd().
// document_id - id passed through to addChunk() for every chunk.
//
// A failed insert is logged with qWarning() and chunking continues with the
// next chunk.
void Database::chunkStream(QTextStream &stream, int document_id)
{
    const int chunkSize = 256;
    int chunk_id = 0;
    int charCount = 0;
    QList<QString> words;

    while (!stream.atEnd()) {
        QString word;
        stream >> word;

        // operator>> skips leading whitespace, so when only whitespace remains
        // (e.g. a trailing newline) it yields an empty token. Do not let that
        // token add a trailing space to the last chunk.
        if (!word.isEmpty()) {
            charCount += word.length();
            words.append(word);
        }

        // Nothing accumulated (whitespace-only input, or the previous word
        // just flushed a chunk): never store an empty chunk.
        if (words.isEmpty())
            continue;

        // words.size() - 1 accounts for the single spaces inserted by join().
        if (charCount + words.size() - 1 >= chunkSize || stream.atEnd()) {
            const QString chunk = words.join(" ");
            QSqlQuery q;
            if (!addChunk(q,
                document_id,
                ++chunk_id,
                chunk,
                0 /*embedding_id*/,
                QString() /*embedding_path*/
            )) {
                qWarning() << "ERROR: Could not insert chunk into db" << q.lastError();
            }
            words.clear();
            charCount = 0;
        }
    }
}
|
||||||
@ -466,7 +511,18 @@ void Database::scanQueue()
|
|||||||
if (m_docsToScan.isEmpty())
|
if (m_docsToScan.isEmpty())
|
||||||
return;
|
return;
|
||||||
|
|
||||||
const DocumentInfo info = m_docsToScan.dequeue();
|
DocumentInfo info = m_docsToScan.dequeue();
|
||||||
|
|
||||||
|
// Update info
|
||||||
|
info.doc.stat();
|
||||||
|
|
||||||
|
// If the doc has since been deleted or no longer readable, then we schedule more work and return
|
||||||
|
// leaving the cleanup for the cleanup handler
|
||||||
|
if (!info.doc.exists() || !info.doc.isReadable()) {
|
||||||
|
if (!m_docsToScan.isEmpty()) QTimer::singleShot(0, this, &Database::scanQueue);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
const int folder_id = info.folder;
|
const int folder_id = info.folder;
|
||||||
const qint64 document_time = info.doc.fileTime(QFile::FileModificationTime).toMSecsSinceEpoch();
|
const qint64 document_time = info.doc.fileTime(QFile::FileModificationTime).toMSecsSinceEpoch();
|
||||||
const QString document_path = info.doc.canonicalFilePath();
|
const QString document_path = info.doc.canonicalFilePath();
|
||||||
@ -565,7 +621,6 @@ void Database::scanDocuments(int folder_id, const QString &folder_path)
|
|||||||
while (it.hasNext()) {
|
while (it.hasNext()) {
|
||||||
it.next();
|
it.next();
|
||||||
QFileInfo fileInfo = it.fileInfo();
|
QFileInfo fileInfo = it.fileInfo();
|
||||||
fileInfo.setCaching(false);
|
|
||||||
if (fileInfo.isDir()) {
|
if (fileInfo.isDir()) {
|
||||||
addFolderToWatch(fileInfo.canonicalFilePath());
|
addFolderToWatch(fileInfo.canonicalFilePath());
|
||||||
continue;
|
continue;
|
||||||
@ -663,7 +718,13 @@ void Database::removeFolder(const QString &collection, const QString &path)
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
removeFolderInternal(collection, folder_id, path);
|
||||||
|
}
|
||||||
|
|
||||||
|
void Database::removeFolderInternal(const QString &collection, int folder_id, const QString &path)
|
||||||
|
{
|
||||||
// Determine if the folder is used by more than one collection
|
// Determine if the folder is used by more than one collection
|
||||||
|
QSqlQuery q;
|
||||||
QList<QString> collections;
|
QList<QString> collections;
|
||||||
if (!selectCollectionsFromFolder(q, folder_id, &collections)) {
|
if (!selectCollectionsFromFolder(q, folder_id, &collections)) {
|
||||||
qWarning() << "ERROR: Cannot select collections from folder" << folder_id << q.lastError();
|
qWarning() << "ERROR: Cannot select collections from folder" << folder_id << q.lastError();
|
||||||
@ -771,6 +832,73 @@ void Database::retrieveFromDB(const QList<QString> &collections, const QString &
|
|||||||
emit retrieveResult(results);
|
emit retrieveResult(results);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void Database::cleanDB()
|
||||||
|
{
|
||||||
|
#if defined(DEBUG)
|
||||||
|
qDebug() << "cleanDB";
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Scan all folders in db to make sure they still exist
|
||||||
|
QSqlQuery q;
|
||||||
|
QList<QPair<QString, int>> collections;
|
||||||
|
if (!selectAllFromCollections(q, &collections)) {
|
||||||
|
qWarning() << "ERROR: Cannot select collections" << q.lastError();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (auto pair : collections) {
|
||||||
|
// Find the path for the folder
|
||||||
|
QString collection = pair.first;
|
||||||
|
int folder_id = pair.second;
|
||||||
|
QString folder_path;
|
||||||
|
if (!selectFolder(q, folder_id, &folder_path)) {
|
||||||
|
qWarning() << "ERROR: Cannot select folder from id" << folder_id << q.lastError();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
QFileInfo info(folder_path);
|
||||||
|
if (!info.exists() || !info.isReadable()) {
|
||||||
|
#if defined(DEBUG)
|
||||||
|
qDebug() << "clean db removing folder" << folder_id << folder_path;
|
||||||
|
#endif
|
||||||
|
removeFolderInternal(collection, folder_id, folder_path);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Scan all documents in db to make sure they still exist
|
||||||
|
if (!q.prepare(SELECT_ALL_DOCUMENTS_SQL)) {
|
||||||
|
qWarning() << "ERROR: Cannot prepare sql for select all documents" << q.lastError();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!q.exec()) {
|
||||||
|
qWarning() << "ERROR: Cannot exec sql for select all documents" << q.lastError();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
while (q.next()) {
|
||||||
|
int document_id = q.value(0).toInt();
|
||||||
|
QString document_path = q.value(1).toString();
|
||||||
|
QFileInfo info(document_path);
|
||||||
|
if (info.exists() && info.isReadable())
|
||||||
|
continue;
|
||||||
|
|
||||||
|
#if defined(DEBUG)
|
||||||
|
qDebug() << "clean db removing document" << document_id << document_path;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Remove all chunks and documents that either don't exist or have become unreadable
|
||||||
|
QSqlQuery query;
|
||||||
|
if (!removeChunksByDocumentId(query, document_id)) {
|
||||||
|
qWarning() << "ERROR: Cannot remove chunks of document_id" << document_id << query.lastError();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!removeDocument(query, document_id)) {
|
||||||
|
qWarning() << "ERROR: Cannot remove document_id" << document_id << query.lastError();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void Database::directoryChanged(const QString &path)
|
void Database::directoryChanged(const QString &path)
|
||||||
{
|
{
|
||||||
#if defined(DEBUG)
|
#if defined(DEBUG)
|
||||||
@ -794,6 +922,9 @@ void Database::directoryChanged(const QString &path)
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Clean the database
|
||||||
|
cleanDB();
|
||||||
|
|
||||||
// Rescan the documents associated with the folder
|
// Rescan the documents associated with the folder
|
||||||
scanDocuments(folder_id, path);
|
scanDocuments(folder_id, path);
|
||||||
}
|
}
|
||||||
|
@ -26,6 +26,7 @@ public Q_SLOTS:
|
|||||||
void addFolder(const QString &collection, const QString &path);
|
void addFolder(const QString &collection, const QString &path);
|
||||||
void removeFolder(const QString &collection, const QString &path);
|
void removeFolder(const QString &collection, const QString &path);
|
||||||
void retrieveFromDB(const QList<QString> &collections, const QString &text);
|
void retrieveFromDB(const QList<QString> &collections, const QString &text);
|
||||||
|
void cleanDB();
|
||||||
|
|
||||||
Q_SIGNALS:
|
Q_SIGNALS:
|
||||||
void docsToScanChanged();
|
void docsToScanChanged();
|
||||||
@ -38,6 +39,7 @@ private Q_SLOTS:
|
|||||||
bool removeFolderFromWatch(const QString &path);
|
bool removeFolderFromWatch(const QString &path);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
void removeFolderInternal(const QString &collection, int folder_id, const QString &path);
|
||||||
void chunkStream(QTextStream &stream, int document_id);
|
void chunkStream(QTextStream &stream, int document_id);
|
||||||
void handleDocumentErrorAndScheduleNext(const QString &errorMessage,
|
void handleDocumentErrorAndScheduleNext(const QString &errorMessage,
|
||||||
int document_id, const QString &document_path, const QSqlError &error);
|
int document_id, const QString &document_path, const QSqlError &error);
|
||||||
|
Loading…
Reference in New Issue
Block a user