backend: rebase llama.cpp on upstream as of Sep 26th (#2998)

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-10-01 01:06:10 -04:00 · 2024-09-27 12:05:59 -04:00 · 2024-09-27 12:05:59 -04:00 · f9d6be8afb
commit f9d6be8afb
parent 8bd937eb68
16 changed files with 165 additions and 600 deletions
--- a/gpt4all-backend/deps/llama.cpp-mainline
+++ b/gpt4all-backend/deps/llama.cpp-mainline
@ -1 +1 @@
-Subproject commit ced74fbad4b258507f3ec06e77eec9445583511a
+Subproject commit b3b5c0571eda3065035a7f25f7b84640b159d821
--- a/gpt4all-backend/include/gpt4all-backend/llmodel.h
+++ b/gpt4all-backend/include/gpt4all-backend/llmodel.h
@ -7,6 +7,7 @@
 #include <cstdint>
 #include <functional>
 #include <optional>
+#include <span>
 #include <stdexcept>
 #include <string>
 #include <string_view>
@ -149,9 +150,9 @@ public:
    virtual bool isEmbeddingModel(const std::string &modelPath) const { (void)modelPath; return false; }
    virtual bool isModelLoaded() const = 0;
    virtual size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) = 0;
-    virtual size_t stateSize() const { return 0; }
-    virtual size_t saveState(uint8_t *dest) const { (void)dest; return 0; }
-    virtual size_t restoreState(const uint8_t *src) { (void)src; return 0; }
+    virtual size_t stateSize() const = 0;
+    virtual size_t saveState(std::span<uint8_t> dest) const = 0;
+    virtual size_t restoreState(std::span<const uint8_t> src) = 0;

    // This method requires the model to return true from supportsCompletion otherwise it will throw
    // an error
@ -215,7 +216,8 @@ protected:
    virtual std::vector<Token> tokenize(PromptContext &ctx, std::string_view str, bool special = false) = 0;
    virtual bool isSpecialToken(Token id) const = 0;
    virtual std::string tokenToString(Token id) const = 0;
-    virtual Token sampleToken(PromptContext &ctx) const = 0;
+    virtual void initSampler(PromptContext &ctx) = 0;
+    virtual Token sampleToken() const = 0;
    virtual bool evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const = 0;
    virtual void shiftContext(PromptContext &promptCtx) = 0;
    virtual int32_t contextLength() const = 0;
--- a/gpt4all-backend/include/gpt4all-backend/llmodel_c.h
+++ b/gpt4all-backend/include/gpt4all-backend/llmodel_c.h
@ -148,18 +148,20 @@ uint64_t llmodel_get_state_size(llmodel_model model);
 * NOTE: This state data is specific to the type of model you have created.
 * @param model A pointer to the llmodel_model instance.
 * @param dest A pointer to the destination.
- * @return the number of bytes copied
+ * @param size The size of the destination buffer.
+ * @return the number of bytes copied, or zero on error.
 */
-uint64_t llmodel_save_state_data(llmodel_model model, uint8_t *dest);
+uint64_t llmodel_save_state_data(llmodel_model model, uint8_t *dest, uint64_t size);

 /**
 * Restores the internal state of the model using data from the specified address.
 * NOTE: This state data is specific to the type of model you have created.
 * @param model A pointer to the llmodel_model instance.
- * @param src A pointer to the src.
- * @return the number of bytes read
+ * @param src A pointer to the state data.
+ * @param size The size of the source data.
+ * @return The number of bytes read, or zero on error.
 */
-uint64_t llmodel_restore_state_data(llmodel_model model, const uint8_t *src);
+uint64_t llmodel_restore_state_data(llmodel_model model, const uint8_t *src, size_t size);

 /**
 * Generate a response using the model.
--- a/gpt4all-backend/llama.cpp.cmake
+++ b/gpt4all-backend/llama.cpp.cmake
@ -978,10 +978,13 @@ function(include_ggml SUFFIX)

    add_library(llama${SUFFIX} STATIC
                ${DIRECTORY}/include/llama.h
+                ${DIRECTORY}/src/llama-grammar.cpp
+                ${DIRECTORY}/src/llama-sampling.cpp
+                ${DIRECTORY}/src/llama-vocab.cpp
                ${DIRECTORY}/src/llama.cpp
-                ${DIRECTORY}/src/unicode.h
-                ${DIRECTORY}/src/unicode.cpp
                ${DIRECTORY}/src/unicode-data.cpp
+                ${DIRECTORY}/src/unicode.cpp
+                ${DIRECTORY}/src/unicode.h
                )

    target_include_directories(llama${SUFFIX} PUBLIC  ${DIRECTORY}/include ${DIRECTORY}/ggml/include)
--- a/gpt4all-backend/src/llamamodel.cpp
+++ b/gpt4all-backend/src/llamamodel.cpp
@ -2,6 +2,7 @@
 #include "llamamodel_impl.h"

 #include "llmodel.h"
+#include "utils.h"

 #include <ggml.h>
 #include <llama.h>
@ -103,26 +104,34 @@ static bool llama_verbose()
    return var && *var;
 }

-static void llama_log_callback(enum ggml_log_level level, const char *text, void *userdata)
+static void llama_log_callback(ggml_log_level level, const char *text, void *userdata, bool warn)
 {
    (void)userdata;
-    if (llama_verbose() || level <= GGML_LOG_LEVEL_ERROR) {
-        fputs(text, stderr);
+
+    static ggml_log_level lastlevel = GGML_LOG_LEVEL_NONE;
+    if (!llama_verbose()) {
+        auto efflevel = level == GGML_LOG_LEVEL_CONT ? lastlevel : level;
+        lastlevel = efflevel;
+        switch (efflevel) {
+            case GGML_LOG_LEVEL_CONT:
+                UNREACHABLE();
+                break;
+            case GGML_LOG_LEVEL_WARN:
+                if (warn) break;
+                [[fallthrough]];
+            case GGML_LOG_LEVEL_NONE: // not used?
+            case GGML_LOG_LEVEL_INFO:
+            case GGML_LOG_LEVEL_DEBUG:
+                return; // suppress
+            case GGML_LOG_LEVEL_ERROR:
+                ;
        }
    }

-#ifdef GGML_USE_CUDA
-static void cuda_log_callback(enum ggml_log_level level, const char *text, void *userdata)
-{
-    (void)userdata;
-    if (llama_verbose() || level <= GGML_LOG_LEVEL_WARN) {
    fputs(text, stderr);
 }
-}
-#endif

 struct gpt_params {
-    int32_t seed          = -1;   // RNG seed
    int32_t n_keep        = 0;    // number of tokens to keep from initial prompt

    // sampling parameters
@ -137,44 +146,6 @@ struct gpt_params {
    bool use_mlock         = false; // use mlock to keep model in memory
 };

-static llama_token llama_sample_top_p_top_k(
-        llama_context *ctx,
-        const llama_token *last_n_tokens_data,
-        int last_n_tokens_size,
-        int top_k,
-        float top_p,
-        float min_p,
-        float temp,
-        float repeat_penalty) {
-    auto logits = llama_get_logits_ith(ctx, -1);
-    auto n_vocab = llama_n_vocab(llama_get_model(ctx));
-    // Populate initial list of all candidates
-    std::vector<llama_token_data> candidates;
-    candidates.reserve(n_vocab);
-    for (int token_id = 0; token_id < n_vocab; token_id++) {
-        candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
-    }
-    llama_token_data_array candidates_p = {candidates.data(), candidates.size(), false};
-    // Sample repeat penalty
-    llama_sample_repetition_penalties(nullptr, &candidates_p, last_n_tokens_data, last_n_tokens_size, repeat_penalty, 0.0f, 0.0f);
-
-    llama_token id;
-    if (temp == 0.0) {
-        // greedy sampling, no probs
-        id = llama_sample_token_greedy(ctx, &candidates_p);
-    } else {
-        // temperature sampling
-        llama_sample_top_k(ctx, &candidates_p, top_k, 1);
-        llama_sample_tail_free(ctx, &candidates_p, 1.0f, 1);
-        llama_sample_typical(ctx, &candidates_p, 1.0f, 1);
-        llama_sample_top_p(ctx, &candidates_p, top_p, 1);
-        llama_sample_min_p(ctx, &candidates_p, min_p, 1);
-        llama_sample_temp(ctx, &candidates_p, temp);
-        id = llama_sample_token(ctx, &candidates_p);
-    }
-    return id;
-}
-
 const char *get_arch_name(gguf_context *ctx_gguf)
 {
    const int kid = gguf_find_key(ctx_gguf, "general.architecture");
@ -241,21 +212,26 @@ cleanup:
 }

 struct LLamaPrivate {
-    const std::string modelPath;
    bool                         modelLoaded  = false;
    int                          device       = -1;
    std::string                  deviceName;
+    int64_t                      n_threads    = 0;
+    std::vector<LLModel::Token>  end_tokens;
+    const char                  *backend_name = nullptr;
+
    llama_model          *model        = nullptr;
    llama_context        *ctx          = nullptr;
    llama_model_params    model_params;
    llama_context_params  ctx_params;
-    int64_t n_threads = 0;
-    std::vector<LLModel::Token> end_tokens;
-    const char *backend_name = nullptr;
+    llama_sampler        *sampler_chain;
 };

 LLamaModel::LLamaModel()
-    : d_ptr(new LLamaPrivate) {}
+    : d_ptr(std::make_unique<LLamaPrivate>())
+{
+    auto sparams = llama_sampler_chain_default_params();
+    d_ptr->sampler_chain = llama_sampler_chain_init(sparams);
+}

 // default hparams (LLaMA 7B)
 struct llama_file_hparams {
@ -445,7 +421,6 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
    }

    d_ptr->ctx_params.n_ctx  = n_ctx;
-    d_ptr->ctx_params.seed    = params.seed;
    d_ptr->ctx_params.type_k = params.kv_type;
    d_ptr->ctx_params.type_v = params.kv_type;

@ -513,6 +488,7 @@ LLamaModel::~LLamaModel()
        llama_free(d_ptr->ctx);
    }
    llama_free_model(d_ptr->model);
+    llama_sampler_free(d_ptr->sampler_chain);
 }

 bool LLamaModel::isModelLoaded() const
@ -522,18 +498,17 @@ bool LLamaModel::isModelLoaded() const

 size_t LLamaModel::stateSize() const
 {
-    return llama_get_state_size(d_ptr->ctx);
+    return llama_state_get_size(d_ptr->ctx);
 }

-size_t LLamaModel::saveState(uint8_t *dest) const
+size_t LLamaModel::saveState(std::span<uint8_t> dest) const
 {
-    return llama_copy_state_data(d_ptr->ctx, dest);
+    return llama_state_get_data(d_ptr->ctx, dest.data(), dest.size());
 }

-size_t LLamaModel::restoreState(const uint8_t *src)
+size_t LLamaModel::restoreState(std::span<const uint8_t> src)
 {
-    // const_cast is required, see: https://github.com/ggerganov/llama.cpp/pull/1540
-    return llama_set_state_data(d_ptr->ctx, const_cast<uint8_t*>(src));
+    return llama_state_set_data(d_ptr->ctx, src.data(), src.size());
 }

 std::vector<LLModel::Token> LLamaModel::tokenize(PromptContext &ctx, std::string_view str, bool special)
@ -573,13 +548,50 @@ std::string LLamaModel::tokenToString(Token id) const
    return std::string(result.data(), result.size());
 }

-LLModel::Token LLamaModel::sampleToken(PromptContext &promptCtx) const
+void LLamaModel::initSampler(PromptContext &promptCtx)
 {
-    const size_t n_prev_toks = std::min((size_t) promptCtx.repeat_last_n, promptCtx.tokens.size());
-    return llama_sample_top_p_top_k(d_ptr->ctx,
-        promptCtx.tokens.data() + promptCtx.tokens.size() - n_prev_toks,
-        n_prev_toks, promptCtx.top_k, promptCtx.top_p, promptCtx.min_p, promptCtx.temp,
-        promptCtx.repeat_penalty);
+    auto *model = d_ptr->model;
+    auto *chain = d_ptr->sampler_chain;
+
+    // clear sampler chain
+    for (int i = llama_sampler_chain_n(chain) - 1; i >= 0; i--) {
+        auto *smpl = llama_sampler_chain_remove(chain, i);
+        llama_sampler_free(smpl);
+    }
+
+    // build new chain
+    llama_sampler_chain_add(chain,
+        llama_sampler_init_penalties(
+            llama_n_vocab(model),
+            llama_token_eos(model),
+            llama_token_nl(model),
+            promptCtx.repeat_last_n,
+            promptCtx.repeat_penalty,
+            // TODO(jared): consider making the below configurable
+            /*penalty_freq*/    0.0f,
+            /*penalty_present*/ 0.0f,
+            /*penalize_nl*/     true,
+            /*ignore_eos*/      false
+        )
+    );
+    if (promptCtx.temp == 0.0f) {
+        llama_sampler_chain_add(chain, llama_sampler_init_greedy());
+    } else {
+        struct llama_sampler *samplers[] = {
+            llama_sampler_init_top_k(promptCtx.top_k),
+            llama_sampler_init_top_p(promptCtx.top_p, 1),
+            llama_sampler_init_min_p(promptCtx.min_p, 1),
+            llama_sampler_init_temp(promptCtx.temp),
+            llama_sampler_init_dist(LLAMA_DEFAULT_SEED)
+        };
+        for (auto *smpl : samplers)
+            llama_sampler_chain_add(chain, smpl);
+    }
+}
+
+LLModel::Token LLamaModel::sampleToken() const
+{
+    return llama_sampler_sample(d_ptr->sampler_chain, d_ptr->ctx, -1);
 }

 bool LLamaModel::evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const
@ -1227,9 +1239,9 @@ DLL_EXPORT bool is_arch_supported(const char *arch)

 DLL_EXPORT LLModel *construct()
 {
-    llama_log_set(llama_log_callback, nullptr);
+    llama_log_set([](auto l, auto t, auto u) { llama_log_callback(l, t, u, false); }, nullptr);
 #ifdef GGML_USE_CUDA
-    ggml_backend_cuda_log_set_callback(cuda_log_callback, nullptr);
+    ggml_backend_cuda_log_set_callback([](auto l, auto t, auto u) { llama_log_callback(l, t, u, true); }, nullptr);
 #endif
    return new LLamaModel;
 }
--- a/gpt4all-backend/src/llamamodel_impl.h
+++ b/gpt4all-backend/src/llamamodel_impl.h
@ -7,6 +7,7 @@
 #include "llmodel.h"

 #include <memory>
+#include <span>
 #include <string>
 #include <string_view>
 #include <vector>
@ -27,8 +28,8 @@ public:
    bool isModelLoaded() const override;
    size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) override;
    size_t stateSize() const override;
-    size_t saveState(uint8_t *dest) const override;
-    size_t restoreState(const uint8_t *src) override;
+    size_t saveState(std::span<uint8_t> dest) const override;
+    size_t restoreState(std::span<const uint8_t> src) override;
    void setThreadCount(int32_t n_threads) override;
    int32_t threadCount() const override;
    std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired = 0) const override;
@ -56,7 +57,8 @@ protected:
    std::vector<Token> tokenize(PromptContext &ctx, std::string_view str, bool special) override;
    bool isSpecialToken(Token id) const override;
    std::string tokenToString(Token id) const override;
-    Token sampleToken(PromptContext &ctx) const override;
+    void initSampler(PromptContext &ctx) override;
+    Token sampleToken() const override;
    bool evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const override;
    void shiftContext(PromptContext &promptCtx) override;
    int32_t contextLength() const override;
--- a/gpt4all-backend/src/llmodel_c.cpp
+++ b/gpt4all-backend/src/llmodel_c.cpp
@ -91,16 +91,16 @@ uint64_t llmodel_get_state_size(llmodel_model model)
    return wrapper->llModel->stateSize();
 }

-uint64_t llmodel_save_state_data(llmodel_model model, uint8_t *dest)
+uint64_t llmodel_save_state_data(llmodel_model model, uint8_t *dest, uint64_t size)
 {
    auto *wrapper = static_cast<LLModelWrapper *>(model);
-    return wrapper->llModel->saveState(dest);
+    return wrapper->llModel->saveState({dest, size_t(size)});
 }

-uint64_t llmodel_restore_state_data(llmodel_model model, const uint8_t *src)
+uint64_t llmodel_restore_state_data(llmodel_model model, const uint8_t *src, uint64_t size)
 {
    auto *wrapper = static_cast<LLModelWrapper *>(model);
-    return wrapper->llModel->restoreState(src);
+    return wrapper->llModel->restoreState({src, size_t(size)});
 }

 void llmodel_prompt(llmodel_model model, const char *prompt,
--- a/gpt4all-backend/src/llmodel_shared.cpp
+++ b/gpt4all-backend/src/llmodel_shared.cpp
@ -244,6 +244,8 @@ void LLModel::generateResponse(std::function<bool(int32_t, const std::string&)>
        return;
    }

+    initSampler(promptCtx);
+
    std::string cachedResponse;
    std::vector<Token> cachedTokens;
    int n_predicted = 0;
@ -251,7 +253,7 @@ void LLModel::generateResponse(std::function<bool(int32_t, const std::string&)>
    // Predict next tokens
    for (bool stop = false; !stop;) {
        // Sample next token
-        std::optional<Token> new_tok = sampleToken(promptCtx);
+        std::optional<Token> new_tok = sampleToken();
        std::string new_piece = tokenToString(new_tok.value());
        cachedTokens.push_back(new_tok.value());
        cachedResponse += new_piece;
--- a/gpt4all-backend/src/llmodel_shared.h
+++ b/gpt4all-backend/src/llmodel_shared.h
@ -1,49 +0,0 @@
-#pragma once
-
-#include <ggml.h>
-
-#include <cstddef>
-#include <cstdint>
-#include <vector>
-
-struct llm_buffer {
-    uint8_t * addr = NULL;
-    size_t size = 0;
-
-    void resize(size_t size) {
-        delete[] addr;
-        addr = new uint8_t[size];
-        this->size = size;
-    }
-
-    ~llm_buffer() {
-        delete[] addr;
-    }
-};
-
-struct llm_kv_cache {
-    struct ggml_tensor * k;
-    struct ggml_tensor * v;
-
-    struct ggml_context * ctx = NULL;
-
-    llm_buffer buf;
-
-    int n; // number of tokens currently in the cache
-
-    ~llm_kv_cache() {
-        if (ctx) {
-            ggml_free(ctx);
-        }
-    }
-};
-
-inline void ggml_graph_compute_g4a(llm_buffer& buf, ggml_cgraph * graph, int n_threads)
-{
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
-    if (plan.work_size > 0) {
-        buf.resize(plan.work_size);
-        plan.work_data = buf.addr;
-    }
-    ggml_graph_compute(graph, &plan);
-}
--- a/gpt4all-backend/src/utils.cpp
+++ b/gpt4all-backend/src/utils.cpp
@ -1,339 +0,0 @@
-#include "utils.h"
-
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-#include <fstream>
-#include <iterator>
-#include <regex>
-#include <utility>
-
-void replace(std::string & str, const std::string & needle, const std::string & replacement)
-{
-    size_t pos = 0;
-    while ((pos = str.find(needle, pos)) != std::string::npos) {
-        str.replace(pos, needle.length(), replacement);
-        pos += replacement.length();
-    }
-}
-
-std::map<std::string, int32_t> json_parse(const std::string & fname)
-{
-    std::map<std::string, int32_t> result;
-
-    // read file into string
-    std::string json;
-    {
-        std::ifstream ifs(fname);
-        if (!ifs) {
-            fprintf(stderr, "Failed to open %s\n", fname.c_str());
-            exit(1);
-        }
-
-        json = std::string((std::istreambuf_iterator<char>(ifs)),
-                (std::istreambuf_iterator<char>()));
-    }
-
-    if (json[0] != '{') {
-        return result;
-    }
-
-    // parse json
-    {
-        bool has_key  = false;
-        bool in_token = false;
-
-        std::string str_key = "";
-        std::string str_val = "";
-
-        int n = json.size();
-        for (int i = 1; i < n; ++i) {
-            if (!in_token) {
-                if (json[i] == ' ') continue;
-                if (json[i] == '"') {
-                    in_token = true;
-                    continue;
-                }
-            } else {
-                if (json[i] == '\\' && i+1 < n) {
-                    if (has_key == false) {
-                        str_key += json[i];
-                    } else {
-                        str_val += json[i];
-                    }
-                    ++i;
-                } else if (json[i] == '"') {
-                    if (has_key == false) {
-                        has_key = true;
-                        ++i;
-                        while (json[i] == ' ') ++i;
-                        ++i; // :
-                        while (json[i] == ' ') ++i;
-                        if (json[i] != '\"') {
-                            while (json[i] != ',' && json[i] != '}') {
-                                str_val += json[i++];
-                            }
-                            has_key = false;
-                        } else {
-                            in_token = true;
-                            continue;
-                        }
-                    } else {
-                        has_key = false;
-                    }
-
-                    ::replace(str_key, "\\u0120", " " ); // \u0120 -> space
-                    ::replace(str_key, "\\u010a", "\n"); // \u010a -> new line
-                    ::replace(str_key, "\\\"",    "\""); // \\\"   -> "
-
-                    try {
-                        result[str_key] = std::stoi(str_val);
-                    } catch (...) {
-                        //fprintf(stderr, "%s: ignoring key '%s' with value '%s'\n", fname.c_str(), str_key.c_str(), str_val.c_str());
-
-                    }
-                    str_key = "";
-                    str_val = "";
-                    in_token = false;
-                    continue;
-                }
-                if (has_key == false) {
-                    str_key += json[i];
-                } else {
-                    str_val += json[i];
-                }
-            }
-        }
-    }
-
-    return result;
-}
-
-std::vector<gpt_vocab::id> gpt_tokenize_inner(const gpt_vocab & vocab, const std::string & text)
-{
-    std::vector<std::string> words;
-
-    // first split the text into words
-    {
-        std::string str = text;
-        std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
-
-        std::regex re(pat);
-        std::smatch m;
-
-        while (std::regex_search(str, m, re)) {
-            for (auto x : m) {
-                words.push_back(x);
-            }
-            str = m.suffix();
-        }
-    }
-
-    // find the longest tokens that form the words:
-    std::vector<gpt_vocab::id> tokens;
-    for (const auto & word : words) {
-        if (word.size() == 0) continue;
-
-        int i = 0;
-        int n = word.size();
-        while (i < n) {
-            int j = n;
-            while (j > i) {
-                auto it = vocab.token_to_id.find(word.substr(i, j-i));
-                if (it != vocab.token_to_id.end()) {
-                    tokens.push_back(it->second);
-                    i = j;
-                    break;
-                }
-                --j;
-            }
-            if (i == n) {
-                break;
-            }
-            if (j == i) {
-                auto sub = word.substr(i, 1);
-                if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) {
-                    tokens.push_back(vocab.token_to_id.at(sub));
-                } else {
-                    fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data());
-                }
-                ++i;
-            }
-        }
-    }
-
-    return tokens;
-}
-
-std::string regex_escape(const std::string &s)
-{
-  static const std::regex metacharacters(R"([\.\^\$\-\+\(\)\[\]\{\}\|\?\*])");
-  return std::regex_replace(s, metacharacters, "\\$&");
-}
-
-std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text)
-{
-    // Generate the subpattern from the special_tokens vector if it's not empty
-    if (!vocab.special_tokens.empty()) {
-        std::vector<gpt_vocab::id> out;
-        std::vector<std::string> chunks;
-        std::string str = text;
-        std::string special_tokens_subpattern;
-        for (const auto &token : vocab.special_tokens) {
-            if (!special_tokens_subpattern.empty()) {
-                special_tokens_subpattern += "|";
-            }
-            special_tokens_subpattern += regex_escape(token);
-        }
-        std::regex re(special_tokens_subpattern);
-        std::smatch m;
-        while (std::regex_search(str, m, re)) {
-            auto tok = vocab.token_to_id.find(m.str());
-            if (tok != vocab.token_to_id.end()) {
-                auto tokid = tok->second;
-                auto pfxtoks = gpt_tokenize_inner(vocab, m.prefix());
-                out.insert(out.end(), pfxtoks.begin(), pfxtoks.end());
-                out.push_back(tokid);
-                str = m.suffix();
-            }
-        }
-        if (!str.empty()) {
-            auto tokrest = gpt_tokenize_inner(vocab, str);
-            out.insert(out.end(), tokrest.begin(), tokrest.end());
-        }
-        return out;
-    } else {
-        return gpt_tokenize_inner(vocab, text);
-    }
-}
-
-
-bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab)
-{
-    printf("%s: loading vocab from '%s'\n", __func__, fname.c_str());
-
-    vocab.token_to_id = ::json_parse(fname);
-
-    for (const auto & kv : vocab.token_to_id) {
-        vocab.id_to_token[kv.second] = kv.first;
-    }
-
-    printf("%s: vocab size = %d\n", __func__, (int) vocab.token_to_id.size());
-
-    // print the vocabulary
-    //for (auto kv : vocab.token_to_id) {
-    //    printf("'%s' -> %d\n", kv.first.data(), kv.second);
-    //}
-
-    return true;
-}
-
-gpt_vocab::id gpt_sample_top_k_top_p(
-        const size_t actualVocabSize,
-        const int32_t * last_n_tokens_data,
-        int   last_n_tokens_size,
-        const std::vector<float> logits,
-        int    top_k,
-        double top_p,
-        double temp,
-        float repeat_penalty,
-        std::mt19937 & rng) {
-    int n_logits = actualVocabSize;
-
-    const auto last_n_tokens = std::vector<int32_t>(last_n_tokens_data, last_n_tokens_data + last_n_tokens_size);
-    const auto * plogits = logits.data();
-
-    if (temp <= 0) {
-        // select the token with the highest logit directly
-        float max_logit = plogits[0];
-        gpt_vocab::id max_id = 0;
-
-        for (int i = 1; i < n_logits; ++i) {
-            if (plogits[i] > max_logit) {
-                max_logit = plogits[i];
-                max_id = i;
-            }
-        }
-        return max_id;
-    }
-    std::vector<std::pair<double, gpt_vocab::id>> logits_id;
-    logits_id.reserve(n_logits);
-
-    {
-        const float scale = 1.0f/temp;
-        for (int i = 0; i < n_logits; ++i) {
-            // repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858)
-            // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main
-            if (std::find(last_n_tokens.begin(), last_n_tokens.end(), i) != last_n_tokens.end()) {
-                // if score < 0 then repetition penalty has to multiplied to reduce the previous token probability
-                if (plogits[i] < 0.0f) {
-                    logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i));
-                } else {
-                    logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i));
-                }
-            } else {
-                logits_id.push_back(std::make_pair(plogits[i]*scale, i));
-            }
-        }
-    }
-
-    // find the top K tokens
-    std::partial_sort(
-            logits_id.begin(),
-            logits_id.begin() + top_k, logits_id.end(),
-            [](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
-        return a.first > b.first;
-    });
-
-    logits_id.resize(top_k);
-
-    double maxl = -INFINITY;
-    for (const auto & kv : logits_id) {
-        maxl = std::max(maxl, kv.first);
-    }
-
-    // compute probs for the top K tokens
-    std::vector<double> probs;
-    probs.reserve(logits_id.size());
-
-    double sum = 0.0;
-    for (const auto & kv : logits_id) {
-        double p = exp(kv.first - maxl);
-        probs.push_back(p);
-        sum += p;
-    }
-
-    // normalize the probs
-    for (auto & p : probs) {
-        p /= sum;
-    }
-
-    if (top_p < 1.0f) {
-        double cumsum = 0.0f;
-        for (int i = 0; i < top_k; i++) {
-            cumsum += probs[i];
-            if (cumsum >= top_p) {
-                top_k = i + 1;
-                probs.resize(top_k);
-                logits_id.resize(top_k);
-                break;
-            }
-        }
-
-        cumsum = 1.0/cumsum;
-        for (int i = 0; i < (int) probs.size(); i++) {
-            probs[i] *= cumsum;
-        }
-    }
-
-    //printf("\n");
-    //for (int i = 0; i < (int) probs.size(); i++) {
-    //    printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);
-    //}
-    //exit(0);
-
-    std::discrete_distribution<> dist(probs.begin(), probs.end());
-    int idx = dist(rng);
-
-    return logits_id[idx].second;
-}
--- a/gpt4all-backend/src/utils.h
+++ b/gpt4all-backend/src/utils.h
@ -1,101 +1,17 @@
-// Various helper functions and utilities
-
 #pragma once

-#include <algorithm>
-#include <cstddef>
-#include <cstdint>
-#include <map>
-#include <random>
-#include <string>
-#include <thread>
-#include <vector>
+#include <cassert>

-//
-// General purpose inline functions
-//
-constexpr inline unsigned long long operator ""_MiB(unsigned long long bytes)
-{
-    return bytes*1024*1024;
-}
-
-//
-// CLI argument parsing
-//
-
-struct gpt_params {
-    int32_t seed      = -1; // RNG seed
-    int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
-    int32_t n_predict = 200; // new tokens to predict
-
-    // sampling parameters
-    int32_t top_k = 40;
-    float   top_p = 0.9f;
-    float   temp  = 0.9f;
-
-    int32_t n_batch = 8; // batch size for prompt processing
-
-    std::string model = "models/gpt-2-117M/ggml-model.bin"; // model path
-    std::string prompt;
-};
-
-bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
-
-void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
-
-std::string gpt_random_prompt(std::mt19937 & rng);
-
-//
-// Vocab utils
-//
-
-struct gpt_vocab {
-    using id    = int32_t;
-    using token = std::string;
-
-    std::map<token, id> token_to_id;
-    std::map<id, token> id_to_token;
-    std::vector<std::string> special_tokens;
-
-    void add_special_token(const std::string &token) {
-        special_tokens.push_back(token);
-    }
-};
-
-void replace(std::string & str, const std::string & needle, const std::string & replacement);
-
-// poor-man's JSON parsing
-std::map<std::string, int32_t> json_parse(const std::string & fname);
-
-// split text into tokens
-//
-// ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
-//
-// Regex (Python):
-// r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
-//
-// Regex (C++):
-// R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"
-//
-std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text);
-
-// load the tokens from encoder.json
-bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);
-
-// sample next token given probabilities for each embedding
-//
-//   - consider only the top K tokens
-//   - from them, consider only the top tokens with cumulative probability > P
-//
-// TODO: not sure if this implementation is correct
-//
-gpt_vocab::id gpt_sample_top_k_top_p(
-        const size_t actualVocabSize,
-        const int32_t * last_n_tokens_data,
-        int   last_n_tokens_size,
-        const std::vector<float> logits,
-        int    top_k,
-        double top_p,
-        double temp,
-        float repeat_penalty,
-        std::mt19937 & rng);
+#ifdef NDEBUG
+#   ifdef __has_builtin
+#       if __has_builtin(__builtin_unreachable)
+#           define UNREACHABLE() __builtin_unreachable()
+#       else
+#           define UNREACHABLE() do {} while (0)
+#       endif
+#   else
+#       define UNREACHABLE() do {} while (0)
+#   endif
+#else
+#   define UNREACHABLE() assert(!"Unreachable statement was reached")
+#endif
--- a/gpt4all-bindings/python/CHANGELOG.md
+++ b/gpt4all-bindings/python/CHANGELOG.md
@ -9,6 +9,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
 ### Added
 - Warn on Windows if the Microsoft Visual C++ runtime libraries are not found ([#2920](https://github.com/nomic-ai/gpt4all/pull/2920))

+### Changed
+- Rebase llama.cpp on latest upstream as of September 26th ([#2998](https://github.com/nomic-ai/gpt4all/pull/2998))
+
 ## [2.8.2] - 2024-08-14

 ### Fixed
--- a/gpt4all-chat/CHANGELOG.md
+++ b/gpt4all-chat/CHANGELOG.md
@ -9,6 +9,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
 ### Added
 - Add bm25 hybrid search to localdocs ([#2969](https://github.com/nomic-ai/gpt4all/pull/2969))

+### Changed
+- Rebase llama.cpp on latest upstream as of September 26th ([#2998](https://github.com/nomic-ai/gpt4all/pull/2998))
+
 ### Fixed
 - Fix a crash when attempting to continue a chat loaded from disk ([#2995](https://github.com/nomic-ai/gpt4all/pull/2995))
 - Fix the local server rejecting min\_p/top\_p less than 1 ([#2996](https://github.com/nomic-ai/gpt4all/pull/2996))
--- a/gpt4all-chat/src/chatapi.cpp
+++ b/gpt4all-chat/src/chatapi.cpp
@ -71,19 +71,19 @@ bool ChatAPI::isModelLoaded() const
 // All three of the state virtual functions are handled custom inside of chatllm save/restore
 size_t ChatAPI::stateSize() const
 {
-    return 0;
+    throw std::logic_error("not implemented");
 }

-size_t ChatAPI::saveState(uint8_t *dest) const
+size_t ChatAPI::saveState(std::span<uint8_t> dest) const
 {
    Q_UNUSED(dest);
-    return 0;
+    throw std::logic_error("not implemented");
 }

-size_t ChatAPI::restoreState(const uint8_t *src)
+size_t ChatAPI::restoreState(std::span<const uint8_t> src)
 {
    Q_UNUSED(src);
-    return 0;
+    throw std::logic_error("not implemented");
 }

 void ChatAPI::prompt(const std::string &prompt,
--- a/gpt4all-chat/src/chatapi.h
+++ b/gpt4all-chat/src/chatapi.h
@ -64,8 +64,8 @@ public:
    bool isModelLoaded() const override;
    size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) override;
    size_t stateSize() const override;
-    size_t saveState(uint8_t *dest) const override;
-    size_t restoreState(const uint8_t *src) override;
+    size_t saveState(std::span<uint8_t> dest) const override;
+    size_t restoreState(std::span<const uint8_t> src) override;
    void prompt(const std::string &prompt,
                const std::string &promptTemplate,
                std::function<bool(int32_t)> promptCallback,
@ -118,12 +118,14 @@ protected:
        throw std::logic_error("not implemented");
    }

-    Token sampleToken(PromptContext &ctx) const override
+    void initSampler(PromptContext &ctx) override
    {
        (void)ctx;
        throw std::logic_error("not implemented");
    }

+    Token sampleToken() const override { throw std::logic_error("not implemented"); }
+
    bool evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const override
    {
        (void)ctx;
--- a/gpt4all-chat/src/chatllm.cpp
+++ b/gpt4all-chat/src/chatllm.cpp
@ -1174,7 +1174,13 @@ void ChatLLM::saveState()
 #if defined(DEBUG)
    qDebug() << "saveState" << m_llmThread.objectName() << "size:" << m_state.size();
 #endif
-    m_llModelInfo.model->saveState(static_cast<uint8_t*>(reinterpret_cast<void*>(m_state.data())));
+    bool ok = m_llModelInfo.model->saveState({reinterpret_cast<uint8_t *>(m_state.data()), size_t(m_state.size())});
+    if (!ok) {
+        // FIXME(jared): how badly does this situation break GPT4All?
+        qWarning() << "ChatLLM failed to save LLModel state";
+        m_state.clear();
+        m_state.squeeze();
+    }
 }

 void ChatLLM::restoreState()
@ -1183,7 +1189,7 @@ void ChatLLM::restoreState()
        return;

    if (m_llModelType == LLModelType::API_) {
-        QDataStream stream(&m_state, QIODeviceBase::ReadOnly);
+        QDataStream stream(m_state);
        stream.setVersion(QDataStream::Qt_6_4);
        ChatAPI *chatAPI = static_cast<ChatAPI*>(m_llModelInfo.model.get());
        QList<QString> context;
@ -1201,12 +1207,12 @@ void ChatLLM::restoreState()
    if (m_state.isEmpty())
        return;

-    if (m_llModelInfo.model->stateSize() == m_state.size()) {
-        m_llModelInfo.model->restoreState(static_cast<const uint8_t*>(reinterpret_cast<void*>(m_state.data())));
+    size_t bytesRead = m_llModelInfo.model->restoreState({reinterpret_cast<uint8_t *>(m_state.data()), size_t(m_state.size())});
+    if (bytesRead) {
        m_processedSystemPrompt = true;
        m_pristineLoadedState = true;
    } else {
-        qWarning() << "restoring state from text because" << m_llModelInfo.model->stateSize() << "!=" << m_state.size();
+        qWarning() << "restoring state from text because of error reading state (mismatch or corrupt data)";
        m_restoreStateFromText = true;
    }