From 459289b94cb4a0db4bf89e3b1ec4aab9d1684c04 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Fri, 12 Apr 2024 10:54:15 -0400
Subject: [PATCH] embed4all: small fixes related to nomic client local
 embeddings (#2213)

* actually submit larger batches with increased n_ctx

* fix crash when llama_tokenize returns no tokens

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
---
 gpt4all-backend/llamamodel.cpp   | 21 +++++++++++++++------
 gpt4all-bindings/python/setup.py |  2 +-
 2 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/gpt4all-backend/llamamodel.cpp b/gpt4all-backend/llamamodel.cpp
index 277f44e7..c8bfaa77 100644
--- a/gpt4all-backend/llamamodel.cpp
+++ b/gpt4all-backend/llamamodel.cpp
@@ -325,7 +325,7 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
     bool isEmbedding = is_embedding_arch(llama_model_arch(d_ptr->model));
     const int n_ctx_train = llama_n_ctx_train(d_ptr->model);
     if (isEmbedding) {
-        d_ptr->ctx_params.n_batch = n_ctx_train;
+        d_ptr->ctx_params.n_batch = n_ctx;
     } else {
         if (n_ctx > n_ctx_train) {
             std::cerr << "warning: model was trained on only " << n_ctx_train << " context tokens ("
@@ -734,7 +734,7 @@ void LLamaModel::embedInternal(
 ) {
     typedef std::vector<LLModel::Token> TokenString;
     static constexpr int32_t atlasMaxLength = 8192;
-    static constexpr int chunkOverlap = 8; // Atlas overlaps n_batch-sized chunks of input by 8 tokens
+    static constexpr int chunkOverlap = 8; // Atlas overlaps chunks of input by 8 tokens
 
     const llama_token bos_token = llama_token_bos(d_ptr->model);
     const llama_token eos_token = llama_token_eos(d_ptr->model);
@@ -751,8 +751,12 @@ void LLamaModel::embedInternal(
         tokens.resize(text.length()+4);
         int32_t n_tokens = llama_tokenize(d_ptr->model, text.c_str(), text.length(),
                                           tokens.data(), tokens.size(), wantBOS, false);
-        assert(useEOS == (eos_token != -1 && tokens[n_tokens - 1] == eos_token));
-        tokens.resize(n_tokens - useEOS); // erase EOS/SEP
+        if (n_tokens) {
+            assert(useEOS == (eos_token != -1 && tokens[n_tokens - 1] == eos_token));
+            tokens.resize(n_tokens - useEOS); // erase EOS/SEP
+        } else {
+            tokens.clear();
+        }
     };
 
     // tokenize the texts
@@ -786,9 +790,14 @@ void LLamaModel::embedInternal(
         tokenize(prefix + ':', prefixTokens, true);
     }
 
+    // n_ctx_train: max sequence length of model (RoPE scaling not implemented)
+    const uint32_t n_ctx_train = llama_n_ctx_train(d_ptr->model);
+    // n_batch (equals n_ctx): max tokens per call to llama_decode (one or more sequences)
     const uint32_t n_batch = llama_n_batch(d_ptr->ctx);
-    const uint32_t max_len = n_batch - (prefixTokens.size() + useEOS); // minus BOS/CLS and EOS/SEP
-    if (chunkOverlap >= max_len) {
+
+    // effective sequence length minus prefix and SEP token
+    const uint32_t max_len = std::min(n_ctx_train, n_batch) - (prefixTokens.size() + useEOS);
+    if (max_len <= chunkOverlap) {
         throw std::logic_error("max chunk length of " + std::to_string(max_len) +
                                " is smaller than overlap of " + std::to_string(chunkOverlap) +
                                " tokens");
diff --git a/gpt4all-bindings/python/setup.py b/gpt4all-bindings/python/setup.py
index fc44b256..e632fd97 100644
--- a/gpt4all-bindings/python/setup.py
+++ b/gpt4all-bindings/python/setup.py
@@ -68,7 +68,7 @@ def get_long_description():
 
 setup(
     name=package_name,
-    version="2.4.0",
+    version="2.4.1",
     description="Python bindings for GPT4All",
     long_description=get_long_description(),
     long_description_content_type="text/markdown",
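
Note on the empty-tokenization guard: n_tokens is a signed int32_t and useEOS is a bool, so when llama_tokenize returns no tokens the old code computed tokens.resize(0 - true), i.e. resize(-1), which converts to SIZE_MAX as a size_t and typically throws std::length_error. The following is a minimal standalone sketch of the failure mode and the guarded fix; it is illustrative code, not part of the patch.

#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
    std::vector<int32_t> tokens(16);
    int32_t n_tokens = 0;  // llama_tokenize produced no tokens
    bool useEOS = true;    // WPM vocabs append a trailing SEP token

    // Old behavior: tokens.resize(n_tokens - useEOS) computes -1, which
    // converts to SIZE_MAX when passed to std::vector::resize() and
    // typically throws std::length_error.
    // Guarded behavior, mirroring the patch:
    if (n_tokens) {
        tokens.resize(n_tokens - useEOS); // erase EOS/SEP
    } else {
        tokens.clear();
    }
    std::cout << "tokens.size() = " << tokens.size() << '\n'; // prints 0
}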
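
Note on the new chunk bound: because n_batch now tracks the requested n_ctx rather than n_ctx_train, the usable chunk length is the smaller of the model's trained context and the batch size, minus the prefix tokens and the trailing SEP. A worked example with assumed values follows; the numbers (trained context, prefix length) are illustrative, not taken from the patch.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <string>

int main() {
    const uint32_t n_ctx_train  = 2048; // assumed trained context of an embedding model
    const uint32_t n_batch      = 8192; // n_batch now follows the requested n_ctx
    const uint32_t prefixLen    = 5;    // assumed token count of a "search_document:" prefix
    const uint32_t useEOS       = 1;    // WPM vocab appends SEP
    const uint32_t chunkOverlap = 8;    // Atlas overlap between chunks

    // Effective sequence length minus prefix and SEP token, as in the patch.
    const uint32_t max_len = std::min(n_ctx_train, n_batch) - (prefixLen + useEOS);
    if (max_len <= chunkOverlap)
        throw std::logic_error("max chunk length of " + std::to_string(max_len) +
                               " is smaller than overlap of " + std::to_string(chunkOverlap) +
                               " tokens");
    std::cout << "max chunk length: " << max_len << '\n'; // prints 2042
}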