mirror of
https://github.com/nomic-ai/gpt4all.git
synced 2024-10-01 01:06:10 -04:00
Cleanup of the database, better chunking, better matching.
This commit is contained in:
parent
5f533e76a1
commit
54fc980cb5
@ -113,20 +113,20 @@ QStringList generateGrams(const QString &input, int N)
|
||||
for (int i = 0; i < words.size() - (N - 1); ++i) {
|
||||
QStringList currentNgram;
|
||||
for (int j = 0; j < N; ++j) {
|
||||
currentNgram.append(words[i + j]);
|
||||
currentNgram.append("\"" + words[i + j] + "\"");
|
||||
}
|
||||
ngrams.append("\"" + currentNgram.join(" ") + "\"");
|
||||
ngrams.append("NEAR(" + currentNgram.join(" ") + ", " + QString::number(N) + ")");
|
||||
}
|
||||
return ngrams;
|
||||
}
|
||||
|
||||
bool selectChunk(QSqlQuery &q, const QList<QString> &collection_names, const QString &chunk_text)
|
||||
{
|
||||
for (int N = 5; N > 1; N--) {
|
||||
const int N_WORDS = chunk_text.split(QRegularExpression("\\s+")).size();
|
||||
for (int N = N_WORDS; N > 2; N--) {
|
||||
// first try trigrams
|
||||
QList<QString> text = generateGrams(chunk_text, N);
|
||||
QString orText = text.join(" OR ");
|
||||
qDebug() << "before" << chunk_text << "after" << orText;
|
||||
const QString collection_names_str = collection_names.join("', '");
|
||||
const QString formatted_query = SELECT_SQL.arg("'" + collection_names_str + "'");
|
||||
if (!q.prepare(formatted_query))
|
||||
@ -135,6 +135,9 @@ bool selectChunk(QSqlQuery &q, const QList<QString> &collection_names, const QSt
|
||||
bool success = q.exec();
|
||||
if (!success) return false;
|
||||
if (q.next()) {
|
||||
#if defined(DEBUG)
|
||||
qDebug() << "hit on" << N << "before" << chunk_text << "after" << orText;
|
||||
#endif
|
||||
q.previous();
|
||||
return true;
|
||||
}
|
||||
@ -175,6 +178,10 @@ const auto SELECT_COLLECTIONS_FROM_FOLDER_SQL = QLatin1String(R"(
|
||||
select collection_name from collections where folder_id = ?;
|
||||
)");
|
||||
|
||||
const auto SELECT_COLLECTIONS_SQL = QLatin1String(R"(
|
||||
select collection_name, folder_id from collections;
|
||||
)");
|
||||
|
||||
bool addCollection(QSqlQuery &q, const QString &collection_name, int folder_id)
|
||||
{
|
||||
if (!q.prepare(INSERT_COLLECTION_SQL))
|
||||
@ -215,6 +222,16 @@ bool selectCollectionsFromFolder(QSqlQuery &q, int folder_id, QList<QString> *co
|
||||
return true;
|
||||
}
|
||||
|
||||
bool selectAllFromCollections(QSqlQuery &q, QList<QPair<QString, int>> *collections) {
|
||||
if (!q.prepare(SELECT_COLLECTIONS_SQL))
|
||||
return false;
|
||||
if (!q.exec())
|
||||
return false;
|
||||
while (q.next())
|
||||
collections->append(qMakePair(q.value(0).toString(), q.value(1).toInt()));
|
||||
return true;
|
||||
}
|
||||
|
||||
const auto INSERT_FOLDERS_SQL = QLatin1String(R"(
|
||||
insert into folders(folder_path) values(?);
|
||||
)");
|
||||
@ -223,10 +240,14 @@ const auto DELETE_FOLDERS_SQL = QLatin1String(R"(
|
||||
delete from folders where id = ?;
|
||||
)");
|
||||
|
||||
const auto SELECT_FOLDERS_SQL = QLatin1String(R"(
|
||||
const auto SELECT_FOLDERS_FROM_PATH_SQL = QLatin1String(R"(
|
||||
select id from folders where folder_path = ?;
|
||||
)");
|
||||
|
||||
const auto SELECT_FOLDERS_FROM_ID_SQL = QLatin1String(R"(
|
||||
select folder_path from folders where id = ?;
|
||||
)");
|
||||
|
||||
const auto FOLDERS_SQL = QLatin1String(R"(
|
||||
create table folders(id integer primary key, folder_path varchar unique);
|
||||
)");
|
||||
@ -250,7 +271,7 @@ bool removeFolderFromDB(QSqlQuery &q, int folder_id) {
|
||||
}
|
||||
|
||||
bool selectFolder(QSqlQuery &q, const QString &folder_path, int *id) {
|
||||
if (!q.prepare(SELECT_FOLDERS_SQL))
|
||||
if (!q.prepare(SELECT_FOLDERS_FROM_PATH_SQL))
|
||||
return false;
|
||||
q.addBindValue(folder_path);
|
||||
if (!q.exec())
|
||||
@ -261,6 +282,18 @@ bool selectFolder(QSqlQuery &q, const QString &folder_path, int *id) {
|
||||
return true;
|
||||
}
|
||||
|
||||
bool selectFolder(QSqlQuery &q, int id, QString *folder_path) {
|
||||
if (!q.prepare(SELECT_FOLDERS_FROM_ID_SQL))
|
||||
return false;
|
||||
q.addBindValue(id);
|
||||
if (!q.exec())
|
||||
return false;
|
||||
Q_ASSERT(q.size() < 2);
|
||||
if (q.next())
|
||||
*folder_path = q.value(0).toString();
|
||||
return true;
|
||||
}
|
||||
|
||||
const auto INSERT_DOCUMENTS_SQL = QLatin1String(R"(
|
||||
insert into documents(folder_id, document_time, document_path) values(?, ?, ?);
|
||||
)");
|
||||
@ -285,6 +318,10 @@ const auto SELECT_DOCUMENTS_SQL = QLatin1String(R"(
|
||||
select id from documents where folder_id = ?;
|
||||
)");
|
||||
|
||||
const auto SELECT_ALL_DOCUMENTS_SQL = QLatin1String(R"(
|
||||
select id, document_path from documents;
|
||||
)");
|
||||
|
||||
bool addDocument(QSqlQuery &q, int folder_id, qint64 document_time, const QString &document_path, int *document_id)
|
||||
{
|
||||
if (!q.prepare(INSERT_DOCUMENTS_SQL))
|
||||
@ -441,22 +478,30 @@ void Database::handleDocumentErrorAndScheduleNext(const QString &errorMessage,
|
||||
|
||||
void Database::chunkStream(QTextStream &stream, int document_id)
|
||||
{
|
||||
QString text = stream.readAll();
|
||||
int chunkSize = 256;
|
||||
int overlap = 25;
|
||||
const int chunkSize = 256;
|
||||
int chunk_id = 0;
|
||||
int charCount = 0;
|
||||
QList<QString> words;
|
||||
|
||||
for (int i = 0; i + chunkSize < text.length(); i += (chunkSize - overlap)) {
|
||||
QString chunk = text.mid(i, chunkSize);
|
||||
QSqlQuery q;
|
||||
if (!addChunk(q,
|
||||
document_id,
|
||||
++chunk_id,
|
||||
chunk,
|
||||
0 /*embedding_id*/,
|
||||
QString() /*embedding_path*/
|
||||
)) {
|
||||
qWarning() << "ERROR: Could not insert chunk into db" << q.lastError();
|
||||
while (!stream.atEnd()) {
|
||||
QString word;
|
||||
stream >> word;
|
||||
charCount += word.length();
|
||||
words.append(word);
|
||||
if (charCount + words.size() - 1 >= chunkSize || stream.atEnd()) {
|
||||
const QString chunk = words.join(" ");
|
||||
QSqlQuery q;
|
||||
if (!addChunk(q,
|
||||
document_id,
|
||||
++chunk_id,
|
||||
chunk,
|
||||
0 /*embedding_id*/,
|
||||
QString() /*embedding_path*/
|
||||
)) {
|
||||
qWarning() << "ERROR: Could not insert chunk into db" << q.lastError();
|
||||
}
|
||||
words.clear();
|
||||
charCount = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -466,7 +511,18 @@ void Database::scanQueue()
|
||||
if (m_docsToScan.isEmpty())
|
||||
return;
|
||||
|
||||
const DocumentInfo info = m_docsToScan.dequeue();
|
||||
DocumentInfo info = m_docsToScan.dequeue();
|
||||
|
||||
// Update info
|
||||
info.doc.stat();
|
||||
|
||||
// If the doc has since been deleted or no longer readable, then we schedule more work and return
|
||||
// leaving the cleanup for the cleanup handler
|
||||
if (!info.doc.exists() || !info.doc.isReadable()) {
|
||||
if (!m_docsToScan.isEmpty()) QTimer::singleShot(0, this, &Database::scanQueue);
|
||||
return;
|
||||
}
|
||||
|
||||
const int folder_id = info.folder;
|
||||
const qint64 document_time = info.doc.fileTime(QFile::FileModificationTime).toMSecsSinceEpoch();
|
||||
const QString document_path = info.doc.canonicalFilePath();
|
||||
@ -565,7 +621,6 @@ void Database::scanDocuments(int folder_id, const QString &folder_path)
|
||||
while (it.hasNext()) {
|
||||
it.next();
|
||||
QFileInfo fileInfo = it.fileInfo();
|
||||
fileInfo.setCaching(false);
|
||||
if (fileInfo.isDir()) {
|
||||
addFolderToWatch(fileInfo.canonicalFilePath());
|
||||
continue;
|
||||
@ -663,7 +718,13 @@ void Database::removeFolder(const QString &collection, const QString &path)
|
||||
return;
|
||||
}
|
||||
|
||||
removeFolderInternal(collection, folder_id, path);
|
||||
}
|
||||
|
||||
void Database::removeFolderInternal(const QString &collection, int folder_id, const QString &path)
|
||||
{
|
||||
// Determine if the folder is used by more than one collection
|
||||
QSqlQuery q;
|
||||
QList<QString> collections;
|
||||
if (!selectCollectionsFromFolder(q, folder_id, &collections)) {
|
||||
qWarning() << "ERROR: Cannot select collections from folder" << folder_id << q.lastError();
|
||||
@ -771,6 +832,73 @@ void Database::retrieveFromDB(const QList<QString> &collections, const QString &
|
||||
emit retrieveResult(results);
|
||||
}
|
||||
|
||||
void Database::cleanDB()
|
||||
{
|
||||
#if defined(DEBUG)
|
||||
qDebug() << "cleanDB";
|
||||
#endif
|
||||
|
||||
// Scan all folders in db to make sure they still exist
|
||||
QSqlQuery q;
|
||||
QList<QPair<QString, int>> collections;
|
||||
if (!selectAllFromCollections(q, &collections)) {
|
||||
qWarning() << "ERROR: Cannot select collections" << q.lastError();
|
||||
return;
|
||||
}
|
||||
|
||||
for (auto pair : collections) {
|
||||
// Find the path for the folder
|
||||
QString collection = pair.first;
|
||||
int folder_id = pair.second;
|
||||
QString folder_path;
|
||||
if (!selectFolder(q, folder_id, &folder_path)) {
|
||||
qWarning() << "ERROR: Cannot select folder from id" << folder_id << q.lastError();
|
||||
return;
|
||||
}
|
||||
|
||||
QFileInfo info(folder_path);
|
||||
if (!info.exists() || !info.isReadable()) {
|
||||
#if defined(DEBUG)
|
||||
qDebug() << "clean db removing folder" << folder_id << folder_path;
|
||||
#endif
|
||||
removeFolderInternal(collection, folder_id, folder_path);
|
||||
}
|
||||
}
|
||||
|
||||
// Scan all documents in db to make sure they still exist
|
||||
if (!q.prepare(SELECT_ALL_DOCUMENTS_SQL)) {
|
||||
qWarning() << "ERROR: Cannot prepare sql for select all documents" << q.lastError();
|
||||
return;
|
||||
}
|
||||
|
||||
if (!q.exec()) {
|
||||
qWarning() << "ERROR: Cannot exec sql for select all documents" << q.lastError();
|
||||
return;
|
||||
}
|
||||
|
||||
while (q.next()) {
|
||||
int document_id = q.value(0).toInt();
|
||||
QString document_path = q.value(1).toString();
|
||||
QFileInfo info(document_path);
|
||||
if (info.exists() && info.isReadable())
|
||||
continue;
|
||||
|
||||
#if defined(DEBUG)
|
||||
qDebug() << "clean db removing document" << document_id << document_path;
|
||||
#endif
|
||||
|
||||
// Remove all chunks and documents that either don't exist or have become unreadable
|
||||
QSqlQuery query;
|
||||
if (!removeChunksByDocumentId(query, document_id)) {
|
||||
qWarning() << "ERROR: Cannot remove chunks of document_id" << document_id << query.lastError();
|
||||
}
|
||||
|
||||
if (!removeDocument(query, document_id)) {
|
||||
qWarning() << "ERROR: Cannot remove document_id" << document_id << query.lastError();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void Database::directoryChanged(const QString &path)
|
||||
{
|
||||
#if defined(DEBUG)
|
||||
@ -794,6 +922,9 @@ void Database::directoryChanged(const QString &path)
|
||||
return;
|
||||
}
|
||||
|
||||
// Clean the database
|
||||
cleanDB();
|
||||
|
||||
// Rescan the documents associated with the folder
|
||||
scanDocuments(folder_id, path);
|
||||
}
|
||||
|
@ -26,6 +26,7 @@ public Q_SLOTS:
|
||||
void addFolder(const QString &collection, const QString &path);
|
||||
void removeFolder(const QString &collection, const QString &path);
|
||||
void retrieveFromDB(const QList<QString> &collections, const QString &text);
|
||||
void cleanDB();
|
||||
|
||||
Q_SIGNALS:
|
||||
void docsToScanChanged();
|
||||
@ -38,6 +39,7 @@ private Q_SLOTS:
|
||||
bool removeFolderFromWatch(const QString &path);
|
||||
|
||||
private:
|
||||
void removeFolderInternal(const QString &collection, int folder_id, const QString &path);
|
||||
void chunkStream(QTextStream &stream, int document_id);
|
||||
void handleDocumentErrorAndScheduleNext(const QString &errorMessage,
|
||||
int document_id, const QString &document_path, const QSqlError &error);
|
||||
|
Loading…
Reference in New Issue
Block a user