llamamodel: fix BERT tokenization after llama.cpp update (#2381)

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
This commit is contained in:
Jared Van Bortel 2024-05-28 13:11:57 -04:00 committed by GitHub
parent 0b63ad5eff
commit f1b4092ca6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 6 additions and 6 deletions

View File

@@ -920,11 +920,11 @@ void LLamaModel::embedInternal(
int32_t n_tokens = llama_tokenize(d_ptr->model, text.c_str(), text.length(), tokens.data(), tokens.size(), wantBOS, false); int32_t n_tokens = llama_tokenize(d_ptr->model, text.c_str(), text.length(), tokens.data(), tokens.size(), wantBOS, false);
if (n_tokens) { if (n_tokens) {
(void)eos_token; (void)eos_token;
assert(useEOS == (eos_token != -1 && tokens[n_tokens - 1] == eos_token)); assert((useEOS && wantBOS) == (eos_token != -1 && tokens[n_tokens - 1] == eos_token));
tokens.resize(n_tokens - useEOS); // erase EOS/SEP if (useEOS && wantBOS)
} else { n_tokens--; // erase EOS/SEP
tokens.clear();
} }
tokens.resize(n_tokens);
}; };
// tokenize the texts // tokenize the texts

View File

@@ -938,7 +938,7 @@ void Database::start()
connect(m_embLLM, &EmbeddingLLM::errorGenerated, this, &Database::handleErrorGenerated); connect(m_embLLM, &EmbeddingLLM::errorGenerated, this, &Database::handleErrorGenerated);
m_scanTimer->callOnTimeout(this, &Database::scanQueue); m_scanTimer->callOnTimeout(this, &Database::scanQueue);
if (!QSqlDatabase::drivers().contains("QSQLITE")) { if (!QSqlDatabase::drivers().contains("QSQLITE")) {
qWarning() << "ERROR: missing sqllite driver"; qWarning() << "ERROR: missing sqlite driver";
} else { } else {
QSqlError err = initDb(); QSqlError err = initDb();
if (err.type() != QSqlError::NoError) if (err.type() != QSqlError::NoError)

View File

@@ -229,7 +229,7 @@ Raw Data:
- Explorer: https://atlas.nomic.ai/map/gpt4all_data_clean - Explorer: https://atlas.nomic.ai/map/gpt4all_data_clean
- [GPT4All-J Dataset](https://huggingface.co/datasets/nomic-ai/gpt4all-j-prompt-generations) - [GPT4All-J Dataset](https://huggingface.co/datasets/nomic-ai/gpt4all-j-prompt-generations)
- Explorer Indexed on Prompts: https://atlas.nomic.ai/map/gpt4all-j-prompts-curated - Explorer Indexed on Prompts: https://atlas.nomic.ai/map/gpt4all-j-prompts-curated
- Exporer Indexed on Responses: https://atlas.nomic.ai/map/gpt4all-j-response-curated - Explorer Indexed on Responses: https://atlas.nomic.ai/map/gpt4all-j-response-curated
We are not distributing a LLaMa 7B checkpoint. We are not distributing a LLaMa 7B checkpoint.