Merge branch 'main' into thilotee-addmodel4

Signed-off-by: ThiloteE <73715071+ThiloteE@users.noreply.github.com>
ThiloteE 2024-09-11 01:16:15 +02:00 committed by GitHub
commit 3fda807487
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
128 changed files with 9219 additions and 11571 deletions


@@ -16,4 +16,3 @@ workflows:
 gpt4all-bindings/python/.* run-python-workflow true
 gpt4all-bindings/typescript/.* run-ts-workflow true
 gpt4all-chat/.* run-chat-workflow true
-.* run-default-workflow true

File diff suppressed because it is too large.

.gitignore

@@ -181,6 +181,7 @@ CMakeLists.txt.user
 gpt4all-chat/models/*
 build_*
 build-*
+cmake-build-*
 # IntelliJ
 .idea/

.gitmodules

@@ -1,7 +1,13 @@
 [submodule "llama.cpp-mainline"]
-path = gpt4all-backend/llama.cpp-mainline
+path = gpt4all-backend/deps/llama.cpp-mainline
 url = https://github.com/nomic-ai/llama.cpp.git
 branch = master
 [submodule "gpt4all-chat/usearch"]
-path = gpt4all-chat/usearch
+path = gpt4all-chat/deps/usearch
 url = https://github.com/nomic-ai/usearch.git
+[submodule "gpt4all-chat/deps/SingleApplication"]
+path = gpt4all-chat/deps/SingleApplication
+url = https://github.com/nomic-ai/SingleApplication.git
+[submodule "gpt4all-chat/deps/fmt"]
+path = gpt4all-chat/deps/fmt
+url = https://github.com/fmtlib/fmt.git


@@ -1,43 +1,25 @@
 <h1 align="center">GPT4All</h1>
-<p align="center">GPT4All runs large language models (LLMs) privately on everyday desktops & laptops. <br> <br> No API calls or GPUs required - you can just download the application and <a href="https://docs.gpt4all.io/gpt4all_desktop/quickstart.html#quickstart">get started</a>
-https://github.com/nomic-ai/gpt4all/assets/70534565/513a0f15-4964-4109-89e4-4f9a9011f311
 <p align="center">
-<a href="https://gpt4all.io/installers/gpt4all-installer-win64.exe">
-<img src="gpt4all-bindings/python/docs/assets/windows.png" width="80" height="80"><br>
-Download for Windows
-</a>
+<a href="https://www.nomic.ai/gpt4all">Website</a> &bull; <a href="https://docs.gpt4all.io">Documentation</a> &bull; <a href="https://discord.gg/mGZE39AS3e">Discord</a>
 </p>
 <p align="center">
-<a href="https://gpt4all.io/installers/gpt4all-installer-darwin.dmg">
-<img src="gpt4all-bindings/python/docs/assets/mac.png" width="85" height="100"><br>
-Download for MacOS
-</a>
-</p>
-<p align="center">
-<a href="https://gpt4all.io/installers/gpt4all-installer-linux.run">
-<img src="gpt4all-bindings/python/docs/assets/ubuntu.svg" width="120" height="120"><br>
-Download for Ubuntu
-</a>
-</p>
-<p align="center">
-<a href='https://flathub.org/apps/io.gpt4all.gpt4all'>
-<img width='240' alt='Get it on Flathub' src='https://flathub.org/api/badge?locale=en'><br>
-Get it on Flathub (community maintained)
-</a>
-</p>
-<p align="center">
-<a href="https://gpt4all.io">Website</a> &bull; <a href="https://docs.gpt4all.io">Documentation</a> &bull; <a href="https://discord.gg/mGZE39AS3e">Discord</a>
+GPT4All runs large language models (LLMs) privately on everyday desktops & laptops.
+</p>
+<p align="center">
+No API calls or GPUs required - you can just download the application and <a href="https://docs.gpt4all.io/gpt4all_desktop/quickstart.html#quickstart">get started</a>.
+</p>
+<p align="center">
+Read about what's new in <a href="https://www.nomic.ai/blog/tag/gpt4all">our blog</a>.
 </p>
 <p align="center">
 <a href="https://forms.nomic.ai/gpt4all-release-notes-signup">Subscribe to the newsletter</a>
 </p>
+https://github.com/nomic-ai/gpt4all/assets/70534565/513a0f15-4964-4109-89e4-4f9a9011f311
 <p align="center">
 GPT4All is made possible by our compute partner <a href="https://www.paperspace.com/">Paperspace</a>.
 </p>
@@ -45,6 +27,38 @@ GPT4All is made possible by our compute partner <a href="https://www.paperspace.
 <a href="https://www.phorm.ai/query?projectId=755eecd3-24ad-49cc-abf4-0ab84caacf63"><img src="https://img.shields.io/badge/Phorm-Ask_AI-%23F2777A.svg" alt="phorm.ai"></a>
 </p>
## Download Links
<p>
&mdash; <a href="https://gpt4all.io/installers/gpt4all-installer-win64.exe">
<img src="gpt4all-bindings/python/docs/assets/windows.png" style="height: 1em; width: auto" /> Windows Installer
</a> &mdash;
</p>
<p>
&mdash; <a href="https://gpt4all.io/installers/gpt4all-installer-darwin.dmg">
<img src="gpt4all-bindings/python/docs/assets/mac.png" style="height: 1em; width: auto" /> macOS Installer
</a> &mdash;
</p>
<p>
&mdash; <a href="https://gpt4all.io/installers/gpt4all-installer-linux.run">
<img src="gpt4all-bindings/python/docs/assets/ubuntu.svg" style="height: 1em; width: auto" /> Ubuntu Installer
</a> &mdash;
</p>
<p>
Windows and Linux require Intel Core i3 2nd Gen / AMD Bulldozer, or better. x86-64 only, no ARM.
</p>
<p>
macOS requires Monterey 12.6 or newer. Best results with Apple Silicon M-series processors.
</p>
<br/>
<br/>
<p>
<a href='https://flathub.org/apps/io.gpt4all.gpt4all'>
<img style="height: 2em; width: auto" alt='Get it on Flathub' src='https://flathub.org/api/badge'><br/>
Flathub (community maintained)
</a>
</p>
 ## Install GPT4All Python
 `gpt4all` gives you access to LLMs with our Python client around [`llama.cpp`](https://github.com/ggerganov/llama.cpp) implementations.
@@ -75,7 +89,7 @@ with model.chat_session():
 - Improved user workflow for LocalDocs
 - Expanded access to more model architectures
 - **October 19th, 2023**: GGUF Support Launches with Support for:
-- Mistral 7b base model, an updated model gallery on [gpt4all.io](https://gpt4all.io), several new local code models including Rift Coder v1.5
+- Mistral 7b base model, an updated model gallery on our website, several new local code models including Rift Coder v1.5
 - [Nomic Vulkan](https://blog.nomic.ai/posts/gpt4all-gpu-inference-with-vulkan) support for Q4\_0 and Q4\_1 quantizations in GGUF.
 - Offline build support for running old versions of the GPT4All Local LLM Chat Client.
 - **September 18th, 2023**: [Nomic Vulkan](https://blog.nomic.ai/posts/gpt4all-gpu-inference-with-vulkan) launches supporting local LLM inference on NVIDIA and AMD GPUs.

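The `## Install GPT4All Python` section touched above refers to the `gpt4all` package, a Python client built around llama.cpp. For orientation, a minimal usage sketch in the spirit of the README's own quickstart; the model filename is only an example (it is downloaded on first use) and keyword arguments may differ slightly between releases.

```python
# Minimal sketch of the gpt4all Python client referenced in the README above.
# The model filename is an example; any GGUF model from the model list works
# and is downloaded on first use. Everything runs locally, with no API calls.
from gpt4all import GPT4All

model = GPT4All("Meta-Llama-3-8B-Instruct.Q4_0.gguf")

with model.chat_session():  # keeps multi-turn context, as in the README example
    print(model.generate("How can I run LLMs efficiently on my laptop?", max_tokens=1024))
```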

@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.21) # for PROJECT_IS_TOP_LEVEL
+cmake_minimum_required(VERSION 3.23) # for FILE_SET
 set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@@ -33,7 +33,7 @@ set(LLMODEL_VERSION_PATCH 0)
 set(LLMODEL_VERSION "${LLMODEL_VERSION_MAJOR}.${LLMODEL_VERSION_MINOR}.${LLMODEL_VERSION_PATCH}")
 project(llmodel VERSION ${LLMODEL_VERSION} LANGUAGES CXX C)
-set(CMAKE_CXX_STANDARD 20)
+set(CMAKE_CXX_STANDARD 23)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
 set(BUILD_SHARED_LIBS ON)
@@ -47,7 +47,7 @@ else()
 message(STATUS "Interprocedural optimization support detected")
 endif()
-set(DIRECTORY llama.cpp-mainline)
+set(DIRECTORY deps/llama.cpp-mainline)
 include(llama.cpp.cmake)
 set(BUILD_VARIANTS)
@@ -63,9 +63,23 @@ if (LLMODEL_VULKAN)
 list(APPEND BUILD_VARIANTS vulkan vulkan-avxonly)
 endif()
 if (LLMODEL_CUDA)
-if (DEFINED CMAKE_CUDA_ARCHITECTURES)
-set(GGML_CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES}")
+cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES
+# Defaults must be set before enable_language(CUDA).
+# Keep this in sync with the arch list in ggml/src/CMakeLists.txt.
+if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+# 52 == lowest CUDA 12 standard
+# 60 == f16 CUDA intrinsics
+# 61 == integer CUDA intrinsics
+# 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
+if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
+set(CMAKE_CUDA_ARCHITECTURES "60;61;70;75") # needed for f16 CUDA intrinsics
+else()
+set(CMAKE_CUDA_ARCHITECTURES "52;61;70;75") # lowest CUDA 12 standard + lowest for integer intrinsics
+#set(CMAKE_CUDA_ARCHITECTURES "OFF") # use this to compile much faster, but only F16 models work
+endif()
 endif()
+message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
 include(CheckLanguage)
 check_language(CUDA)
@@ -132,9 +146,12 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
 # Add each individual implementations
 add_library(llamamodel-mainline-${BUILD_VARIANT} SHARED
-llamamodel.cpp llmodel_shared.cpp)
+src/llamamodel.cpp src/llmodel_shared.cpp)
 target_compile_definitions(llamamodel-mainline-${BUILD_VARIANT} PRIVATE
 LLAMA_VERSIONS=>=3 LLAMA_DATE=999999)
+target_include_directories(llamamodel-mainline-${BUILD_VARIANT} PRIVATE
+src include/gpt4all-backend
+)
 prepare_target(llamamodel-mainline llama-mainline)
 if (NOT PROJECT_IS_TOP_LEVEL AND BUILD_VARIANT STREQUAL cuda)
@@ -143,11 +160,19 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
 endforeach()
 add_library(llmodel
-llmodel.h llmodel.cpp llmodel_shared.cpp
-llmodel_c.h llmodel_c.cpp
-dlhandle.cpp
+src/dlhandle.cpp
+src/llmodel.cpp
+src/llmodel_c.cpp
+src/llmodel_shared.cpp
+)
+target_sources(llmodel PUBLIC
+FILE_SET public_headers TYPE HEADERS BASE_DIRS include
+FILES include/gpt4all-backend/llmodel.h
+include/gpt4all-backend/llmodel_c.h
+include/gpt4all-backend/sysinfo.h
 )
 target_compile_definitions(llmodel PRIVATE LIB_FILE_EXT="${CMAKE_SHARED_LIBRARY_SUFFIX}")
+target_include_directories(llmodel PRIVATE src include/gpt4all-backend)
 set_target_properties(llmodel PROPERTIES
 VERSION ${PROJECT_VERSION}

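The CUDA hunk above moves the default compute-capability list out of llama.cpp.cmake and into gpt4all-backend/CMakeLists.txt, choosing between two presets depending on whether f16 intrinsics are needed. As a plain restatement of that selection rule, a small Python illustration (not part of the build):

```python
# Illustration only: the default CUDA architecture choice made in
# gpt4all-backend/CMakeLists.txt above, written out as ordinary logic.
# Per the comments in the hunk: 52 = lowest CUDA 12 standard, 60 = f16
# intrinsics, 61 = integer intrinsics, 70 = faster mul_mat_q loop unrolling.
def default_cuda_architectures(cuda_f16, preset=None):
    if preset is not None:                 # an explicit CMAKE_CUDA_ARCHITECTURES wins
        return preset
    if cuda_f16:                           # GGML_CUDA_F16 or GGML_CUDA_DMMV_F16
        return ["60", "61", "70", "75"]    # needs f16 CUDA intrinsics
    return ["52", "61", "70", "75"]        # lowest CUDA 12 standard + integer intrinsics

print(default_cuda_architectures(cuda_f16=False))  # ['52', '61', '70', '75']
```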
@@ -0,0 +1 @@
Subproject commit ced74fbad4b258507f3ec06e77eec9445583511a


@@ -134,7 +134,7 @@ public:
 int32_t n_batch = 9;
 float repeat_penalty = 1.10f;
 int32_t repeat_last_n = 64; // last n tokens to penalize
-float contextErase = 0.75f; // percent of context to erase if we exceed the context window
+float contextErase = 0.5f; // percent of context to erase if we exceed the context window
 };
 using ProgressCallback = std::function<bool(float progress)>;
@@ -159,10 +159,10 @@ public:
 const std::string &promptTemplate,
 std::function<bool(int32_t)> promptCallback,
 std::function<bool(int32_t, const std::string&)> responseCallback,
-std::function<bool(bool)> recalculateCallback,
+bool allowContextShift,
 PromptContext &ctx,
 bool special = false,
-std::string *fakeReply = nullptr);
+std::optional<std::string_view> fakeReply = {});
 using EmbedCancelCallback = bool(unsigned *batchSizes, unsigned nBatch, const char *backend);
@@ -212,10 +212,12 @@ public:
 protected:
 // These are pure virtual because subclasses need to implement as the default implementation of
 // 'prompt' above calls these functions
-virtual std::vector<Token> tokenize(PromptContext &ctx, const std::string &str, bool special = false) = 0;
+virtual std::vector<Token> tokenize(PromptContext &ctx, std::string_view str, bool special = false) = 0;
+virtual bool isSpecialToken(Token id) const = 0;
 virtual std::string tokenToString(Token id) const = 0;
 virtual Token sampleToken(PromptContext &ctx) const = 0;
 virtual bool evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const = 0;
+virtual void shiftContext(PromptContext &promptCtx) = 0;
 virtual int32_t contextLength() const = 0;
 virtual const std::vector<Token> &endTokens() const = 0;
 virtual bool shouldAddBOS() const = 0;
@@ -232,10 +234,6 @@ protected:
 return -1;
 }
-// This is a helper function called from the default implementation of 'prompt' but it can be
-// shared by all base classes so it isn't virtual
-void recalculateContext(PromptContext &promptCtx, std::function<bool(bool)> recalculate);
 const Implementation *m_implementation = nullptr;
 ProgressCallback m_progressCallback;
@@ -249,11 +247,12 @@ protected:
 bool decodePrompt(std::function<bool(int32_t)> promptCallback,
 std::function<bool(int32_t, const std::string&)> responseCallback,
-std::function<bool(bool)> recalculateCallback,
+bool allowContextShift,
 PromptContext &promptCtx,
-std::vector<Token> embd_inp);
+std::vector<Token> embd_inp,
+bool isResponse = false);
 void generateResponse(std::function<bool(int32_t, const std::string&)> responseCallback,
-std::function<bool(bool)> recalculateCallback,
+bool allowContextShift,
 PromptContext &promptCtx);
 Token m_tokenize_last_token = -1; // not serialized

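Two things change in this header: the recalculate callback becomes a plain `allowContextShift` flag, and the default `contextErase` drops from 0.75 to 0.5. A small sketch (plain Python, not project code) of the discard arithmetic that `shiftContext` applies later in this diff, showing what the new default means in practice:

```python
# Illustrative arithmetic only, mirroring LLamaModel::shiftContext below:
#   n_discard = min(n_past - n_keep, int(n_ctx * contextErase))
def tokens_discarded(n_ctx, n_past, n_keep, context_erase):
    return min(n_past - n_keep, int(n_ctx * context_erase))

# On a full 2048-token window with one kept BOS token, the old default (0.75)
# dropped 1536 tokens per shift; the new default (0.5) drops 1024, so more of
# the recent conversation survives each shift.
print(tokens_discarded(n_ctx=2048, n_past=2048, n_keep=1, context_erase=0.75))  # 1536
print(tokens_discarded(n_ctx=2048, n_past=2048, n_keep=1, context_erase=0.5))   # 1024
```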

@@ -74,13 +74,6 @@ typedef bool (*llmodel_prompt_callback)(int32_t token_id);
 */
 typedef bool (*llmodel_response_callback)(int32_t token_id, const char *response);
-/**
-* Callback type for recalculation of context.
-* @param whether the model is recalculating the context.
-* @return a bool indicating whether the model should keep generating.
-*/
-typedef bool (*llmodel_recalculate_callback)(bool is_recalculating);
 /**
 * Embedding cancellation callback for use with llmodel_embed.
 * @param batch_sizes The number of tokens in each batch that will be embedded.
@@ -175,7 +168,7 @@ uint64_t llmodel_restore_state_data(llmodel_model model, const uint8_t *src);
 * @param prompt_template A string representing the input prompt template.
 * @param prompt_callback A callback function for handling the processing of prompt.
 * @param response_callback A callback function for handling the generated response.
-* @param recalculate_callback A callback function for handling recalculation requests.
+* @param allow_context_shift Whether to allow shifting of context to make room for more input.
 * @param special True if special tokens in the prompt should be processed, false otherwise.
 * @param fake_reply A string to insert into context as the model's reply, or NULL to generate one.
 * @param ctx A pointer to the llmodel_prompt_context structure.
@@ -184,7 +177,7 @@ void llmodel_prompt(llmodel_model model, const char *prompt,
 const char *prompt_template,
 llmodel_prompt_callback prompt_callback,
 llmodel_response_callback response_callback,
-llmodel_recalculate_callback recalculate_callback,
+bool allow_context_shift,
 llmodel_prompt_context *ctx,
 bool special,
 const char *fake_reply);

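The C API above now takes a plain `bool allow_context_shift` where it used to take `llmodel_recalculate_callback`. A hedged sketch of how a ctypes caller (such as the Python binding further down in this commit) would declare the new prototype; the library handle and the prompt-context pointer type are placeholders, not the binding's real names:

```python
# Sketch only: declaring llmodel_prompt via ctypes, following the prototype in
# llmodel_c.h above. `llmodel` and `prompt_context_ptr_type` are placeholders;
# the real binding defines its own structs and library loading.
import ctypes

PromptCallback = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.c_int32)
ResponseCallback = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.c_int32, ctypes.c_char_p)

def declare_prompt(llmodel, prompt_context_ptr_type):
    llmodel.llmodel_prompt.argtypes = [
        ctypes.c_void_p,          # llmodel_model
        ctypes.c_char_p,          # prompt
        ctypes.c_char_p,          # prompt_template
        PromptCallback,           # prompt_callback
        ResponseCallback,         # response_callback
        ctypes.c_bool,            # allow_context_shift (replaces the recalculate callback)
        prompt_context_ptr_type,  # llmodel_prompt_context *
        ctypes.c_bool,            # special
        ctypes.c_char_p,          # fake_reply (None means: generate a reply)
    ]
    llmodel.llmodel_prompt.restype = None
```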
@@ -1 +0,0 @@
Subproject commit add387854ea73d83770a62282089dea666fa266f


@@ -378,19 +378,7 @@ function(include_ggml SUFFIX)
 find_package(CUDAToolkit REQUIRED)
 set(CUDAToolkit_BIN_DIR ${CUDAToolkit_BIN_DIR} PARENT_SCOPE)
-if (NOT DEFINED GGML_CUDA_ARCHITECTURES)
-# 52 == lowest CUDA 12 standard
-# 60 == f16 CUDA intrinsics
-# 61 == integer CUDA intrinsics
-# 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
-if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
-set(GGML_CUDA_ARCHITECTURES "60;61;70;75") # needed for f16 CUDA intrinsics
-else()
-set(GGML_CUDA_ARCHITECTURES "52;61;70;75") # lowest CUDA 12 standard + lowest for integer intrinsics
-#set(GGML_CUDA_ARCHITECTURES "OFF") # use this to compile much faster, but only F16 models work
-endif()
-endif()
-message(STATUS "Using CUDA architectures: ${GGML_CUDA_ARCHITECTURES}")
+# architectures are set in gpt4all-backend/CMakeLists.txt
 set(GGML_HEADERS_CUDA ${DIRECTORY}/ggml/include/ggml-cuda.h)
 file(GLOB GGML_HEADERS_CUDA "${DIRECTORY}/ggml/src/ggml-cuda/*.cuh")
@@ -823,7 +811,8 @@ function(include_ggml SUFFIX)
 list(APPEND XC_FLAGS -std=${GGML_METAL_STD})
 endif()
-set(GGML_METALLIB ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib)
+set(GGML_METALLIB "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib")
+set(GGML_METALLIB "${GGML_METALLIB}" PARENT_SCOPE)
 add_custom_command(
 OUTPUT ${GGML_METALLIB}
 COMMAND xcrun -sdk macosx metal ${XC_FLAGS} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air
@@ -834,7 +823,6 @@ function(include_ggml SUFFIX)
 DEPENDS ${DIRECTORY}/ggml/src/ggml-metal.metal ${DIRECTORY}/ggml/src/ggml-common.h
 COMMENT "Compiling Metal kernels"
 )
-set_source_files_properties(${GGML_METALLIB} DIRECTORY ${CMAKE_SOURCE_DIR} PROPERTIES GENERATED ON)
 add_custom_target(
 ggml-metal ALL
@@ -1018,9 +1006,6 @@ function(include_ggml SUFFIX)
 C_STANDARD 11
 C_STANDARD_REQUIRED true
 )
-if (GGML_CUDA_ARCHITECTURES)
-set_property(TARGET ggml${SUFFIX} llama${SUFFIX} PROPERTY CUDA_ARCHITECTURES "${GGML_CUDA_ARCHITECTURES}")
-endif()
 target_compile_options(ggml${SUFFIX} PRIVATE "${GGML_COMPILE_OPTS}")
 target_compile_options(llama${SUFFIX} PRIVATE "${GGML_COMPILE_OPTS}")


@@ -1,322 +0,0 @@
#include "llmodel.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <iostream>
#include <optional>
#include <regex>
#include <sstream>
#include <stdexcept>
#include <string>
#include <unordered_set>
#include <vector>
// TODO(cebtenzzre): replace this with llama_kv_cache_seq_shift for llamamodel (GPT-J needs this as-is)
// FIXME(jared): if recalculate returns false, we leave n_past<tokens.size() and do not tell the caller to stop
// FIXME(jared): if we get here during chat name or follow-up generation, bad things will happen when we try to restore
// the old prompt context afterwards
void LLModel::recalculateContext(PromptContext &promptCtx, std::function<bool(bool)> recalculate)
{
int n_keep = shouldAddBOS();
const int32_t n_discard = (promptCtx.n_ctx - n_keep) * promptCtx.contextErase;
// Erase the first percentage of context from the tokens
std::cerr << implementation().modelType() << ": reached the end of the context window so resizing\n";
promptCtx.tokens.erase(promptCtx.tokens.begin() + n_keep, promptCtx.tokens.begin() + n_keep + n_discard);
size_t i = n_keep;
promptCtx.n_past = n_keep;
while (i < promptCtx.tokens.size()) {
size_t batch_end = std::min(i + promptCtx.n_batch, promptCtx.tokens.size());
std::vector<int32_t> batch(promptCtx.tokens.begin() + i, promptCtx.tokens.begin() + batch_end);
assert(promptCtx.n_past + int32_t(batch.size()) <= promptCtx.n_ctx);
if (!evalTokens(promptCtx, batch)) {
std::cerr << "LLModel ERROR: Failed to process prompt\n";
goto stop_generating;
}
promptCtx.n_past += batch.size();
if (!recalculate(true))
goto stop_generating;
i = batch_end;
}
assert(promptCtx.n_past == int32_t(promptCtx.tokens.size()));
stop_generating:
recalculate(false);
}
static bool parsePromptTemplate(const std::string &tmpl, std::vector<std::smatch> &placeholders, std::string &err)
{
static const std::regex placeholderRegex(R"(%[1-2](?![0-9]))");
auto it = std::sregex_iterator(tmpl.begin(), tmpl.end(), placeholderRegex);
placeholders.clear();
placeholders.insert(placeholders.end(), it, std::sregex_iterator());
if (placeholders.size() > 2) {
err = "ERROR: expected at most two placeholders, got " + std::to_string(placeholders.size());
return false;
}
if (placeholders.size() >= 1 && placeholders[0].str() != "%1") {
err = "ERROR: first placeholder must be %1, got " + placeholders[0].str();
return false;
}
if (placeholders.size() >= 2 && placeholders[1].str() != "%2") {
err = "ERROR: second placeholder must be %2, got " + placeholders[1].str();
return false;
}
return true;
}
void LLModel::prompt(const std::string &prompt,
const std::string &promptTemplate,
std::function<bool(int32_t)> promptCallback,
std::function<bool(int32_t, const std::string&)> responseCallback,
std::function<bool(bool)> recalculateCallback,
PromptContext &promptCtx,
bool special,
std::string *fakeReply)
{
if (!isModelLoaded()) {
std::cerr << implementation().modelType() << " ERROR: prompt won't work with an unloaded model!\n";
return;
}
if (!supportsCompletion()) {
std::string errorMessage = "ERROR: this model does not support text completion or chat!";
responseCallback(-1, errorMessage);
std::cerr << implementation().modelType() << " " << errorMessage << "\n";
return;
}
// make sure token cache matches decode offset
if (promptCtx.tokens.size() < promptCtx.n_past) {
std::ostringstream ss;
ss << "expected n_past to be at most " << promptCtx.tokens.size() << ", got " << promptCtx.n_past;
throw std::out_of_range(ss.str());
}
if (promptCtx.n_past < promptCtx.tokens.size())
promptCtx.tokens.resize(promptCtx.n_past);
m_tokenize_last_token = promptCtx.tokens.empty() ? -1 : promptCtx.tokens.back(); // not serialized
// parse the prompt template
std::vector<std::smatch> placeholders;
{
std::string err;
if (!parsePromptTemplate(promptTemplate, placeholders, err)) {
responseCallback(-1, err);
std::cerr << err << "\n";
return;
}
}
auto old_n_past = promptCtx.n_past; // prepare to fake n_past for tokenize
// tokenize the user prompt
std::vector<Token> embd_inp;
if (placeholders.empty()) {
// this is unusual, but well-defined
std::cerr << __func__ << ": prompt template has no placeholder\n";
embd_inp = tokenize(promptCtx, promptTemplate, true);
} else {
// template: beginning of user prompt
const auto &phUser = placeholders[0];
std::string userPrefix(phUser.prefix());
if (!userPrefix.empty()) {
embd_inp = tokenize(promptCtx, userPrefix, true);
promptCtx.n_past += embd_inp.size();
}
// user input (shouldn't have special token processing)
auto tokens = tokenize(promptCtx, prompt, special);
embd_inp.insert(embd_inp.end(), tokens.begin(), tokens.end());
promptCtx.n_past += tokens.size();
// template: end of user prompt + start of assistant prompt
size_t start = phUser.position() + phUser.length();
size_t end = placeholders.size() >= 2 ? placeholders[1].position() : promptTemplate.length();
auto userToAsst = promptTemplate.substr(start, end - start);
if (!userToAsst.empty()) {
tokens = tokenize(promptCtx, userToAsst, true);
embd_inp.insert(embd_inp.end(), tokens.begin(), tokens.end());
promptCtx.n_past += tokens.size();
}
}
promptCtx.n_past = old_n_past; // restore n_past so decodePrompt can increment it
// decode the user prompt
if (!decodePrompt(promptCallback, responseCallback, recalculateCallback, promptCtx, embd_inp))
return; // error
// decode the assistant's reply, either generated or spoofed
if (fakeReply == nullptr) {
generateResponse(responseCallback, recalculateCallback, promptCtx);
} else {
embd_inp = tokenize(promptCtx, *fakeReply, false);
if (!decodePrompt(promptCallback, responseCallback, recalculateCallback, promptCtx, embd_inp))
return; // error
}
// decode the rest of the prompt template
// template: end of assistant prompt
std::string asstSuffix;
if (placeholders.size() >= 2) {
size_t start = placeholders[1].position() + placeholders[1].length();
asstSuffix = promptTemplate.substr(start);
} else {
asstSuffix = "\n\n"; // default to a blank link, good for e.g. Alpaca
}
if (!asstSuffix.empty()) {
embd_inp = tokenize(promptCtx, asstSuffix, true);
decodePrompt(promptCallback, responseCallback, recalculateCallback, promptCtx, embd_inp);
}
}
// returns false on error
bool LLModel::decodePrompt(std::function<bool(int32_t)> promptCallback,
std::function<bool(int32_t, const std::string&)> responseCallback,
std::function<bool(bool)> recalculateCallback,
PromptContext &promptCtx,
std::vector<Token> embd_inp) {
// save the context size
promptCtx.n_ctx = contextLength();
if ((int) embd_inp.size() > promptCtx.n_ctx - 4) {
responseCallback(-1, "ERROR: The prompt size exceeds the context window size and cannot be processed.");
std::cerr << implementation().modelType() << " ERROR: The prompt is " << embd_inp.size() <<
" tokens and the context window is " << promptCtx.n_ctx << "!\n";
return false;
}
promptCtx.n_predict = std::min(promptCtx.n_predict, promptCtx.n_ctx - (int) embd_inp.size());
promptCtx.n_past = std::min(promptCtx.n_past, promptCtx.n_ctx);
promptCtx.n_batch = std::min(promptCtx.n_batch, LLMODEL_MAX_PROMPT_BATCH);
// process the prompt in batches
size_t i = 0;
while (i < embd_inp.size()) {
size_t batch_end = std::min(i + promptCtx.n_batch, embd_inp.size());
std::vector<Token> batch(embd_inp.begin() + i, embd_inp.begin() + batch_end);
// Check if the context has run out...
if (promptCtx.n_past + int32_t(batch.size()) > promptCtx.n_ctx) {
recalculateContext(promptCtx, recalculateCallback);
assert(promptCtx.n_past + int32_t(batch.size()) <= promptCtx.n_ctx);
}
if (!evalTokens(promptCtx, batch)) {
std::cerr << implementation().modelType() << " ERROR: Failed to process prompt\n";
return false;
}
size_t tokens = batch_end - i;
for (size_t t = 0; t < tokens; ++t) {
promptCtx.tokens.push_back(batch.at(t));
promptCtx.n_past += 1;
if (!promptCallback(batch.at(t)))
return false;
}
i = batch_end;
}
return true;
}
void LLModel::generateResponse(std::function<bool(int32_t, const std::string&)> responseCallback,
std::function<bool(bool)> recalculateCallback,
PromptContext &promptCtx) {
std::string cachedResponse;
std::vector<Token> cachedTokens;
std::unordered_set<std::string> reversePrompts
= { "### Instruction", "### Prompt", "### Response", "### Human", "### Assistant", "### Context" };
// predict next tokens
for (int i = 0; i < promptCtx.n_predict; i++) {
// sample next token
auto id = sampleToken(promptCtx);
// Check if the context has run out...
if (promptCtx.n_past + 1 > promptCtx.n_ctx) {
recalculateContext(promptCtx, recalculateCallback);
assert(promptCtx.n_past + 1 <= promptCtx.n_ctx);
}
if (!evalTokens(promptCtx, { id })) {
std::cerr << implementation().modelType() << " ERROR: Failed to predict next token\n";
return;
}
// display text
for (const auto token : endTokens()) {
if (id == token) return;
}
const std::string str = tokenToString(id);
// Check if the provided str is part of our reverse prompts
bool foundPartialReversePrompt = false;
const std::string completed = cachedResponse + std::string(str);
if (reversePrompts.find(completed) != reversePrompts.end())
return;
// Check if it partially matches our reverse prompts and if so, cache
for (const auto& s : reversePrompts) {
if (s.compare(0, completed.size(), completed) == 0) {
foundPartialReversePrompt = true;
cachedResponse = completed;
break;
}
}
// Regardless the token gets added to our cache
cachedTokens.push_back(id);
// Continue if we have found a partial match
if (foundPartialReversePrompt)
continue;
// Empty the cache
for (auto t : cachedTokens) {
promptCtx.tokens.push_back(t);
promptCtx.n_past += 1;
//TODO: Conversion to std::string can be avoided here...
if (!responseCallback(t, std::string(tokenToString(t))))
return;
}
cachedTokens.clear();
}
}
void LLModel::embed(
const std::vector<std::string> &texts, float *embeddings, std::optional<std::string> prefix, int dimensionality,
size_t *tokenCount, bool doMean, bool atlas, EmbedCancelCallback *cancelCb
) {
(void)texts;
(void)embeddings;
(void)prefix;
(void)dimensionality;
(void)tokenCount;
(void)doMean;
(void)atlas;
(void)cancelCb;
throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings");
}
void LLModel::embed(
const std::vector<std::string> &texts, float *embeddings, bool isRetrieval, int dimensionality, size_t *tokenCount,
bool doMean, bool atlas
) {
(void)texts;
(void)embeddings;
(void)isRetrieval;
(void)dimensionality;
(void)tokenCount;
(void)doMean;
(void)atlas;
throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings");
}

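The file deleted above implemented the old recovery strategy, `recalculateContext`: erase the oldest slice of the token cache and then re-evaluate every surviving token, batch by batch, to rebuild the model state. A toy Python illustration of that erase-and-reevaluate loop (illustration only), which is what the KV-cache shift introduced later in this diff avoids:

```python
# Illustration of the removed recalculateContext approach: after erasing the
# oldest n_discard tokens, all remaining tokens had to be re-evaluated in
# batches, which made hitting the end of the context window expensive.
def recalculate(tokens, n_keep, n_discard, n_batch, eval_batch):
    tokens = tokens[:n_keep] + tokens[n_keep + n_discard:]  # drop the oldest tokens, keep e.g. BOS
    for i in range(n_keep, len(tokens), n_batch):           # rebuild the model state from what is left
        eval_batch(tokens[i:i + n_batch])
    return tokens

kept = recalculate(list(range(10)), n_keep=1, n_discard=4, n_batch=3, eval_batch=print)
print(kept)  # [0, 5, 6, 7, 8, 9]
```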

@@ -137,7 +137,7 @@ struct gpt_params {
 bool use_mlock = false; // use mlock to keep model in memory
 };
-static int llama_sample_top_p_top_k(
+static llama_token llama_sample_top_p_top_k(
 llama_context *ctx,
 const llama_token *last_n_tokens_data,
 int last_n_tokens_size,
@@ -157,14 +157,22 @@ static int llama_sample_top_p_top_k(
 llama_token_data_array candidates_p = {candidates.data(), candidates.size(), false};
 // Sample repeat penalty
 llama_sample_repetition_penalties(nullptr, &candidates_p, last_n_tokens_data, last_n_tokens_size, repeat_penalty, 0.0f, 0.0f);
+// Temperature sampling
+llama_token id;
+if (temp == 0.0) {
+// greedy sampling, no probs
+id = llama_sample_token_greedy(ctx, &candidates_p);
+} else {
+// temperature sampling
 llama_sample_top_k(ctx, &candidates_p, top_k, 1);
 llama_sample_tail_free(ctx, &candidates_p, 1.0f, 1);
 llama_sample_typical(ctx, &candidates_p, 1.0f, 1);
 llama_sample_top_p(ctx, &candidates_p, top_p, 1);
 llama_sample_min_p(ctx, &candidates_p, min_p, 1);
 llama_sample_temp(ctx, &candidates_p, temp);
-return llama_sample_token(ctx, &candidates_p);
+id = llama_sample_token(ctx, &candidates_p);
+}
+return id;
 }
 const char *get_arch_name(gguf_context *ctx_gguf)
@@ -528,16 +536,13 @@ size_t LLamaModel::restoreState(const uint8_t *src)
 return llama_set_state_data(d_ptr->ctx, const_cast<uint8_t*>(src));
 }
-std::vector<LLModel::Token> LLamaModel::tokenize(PromptContext &ctx, const std::string &str, bool special)
+std::vector<LLModel::Token> LLamaModel::tokenize(PromptContext &ctx, std::string_view str, bool special)
 {
 bool atStart = m_tokenize_last_token == -1;
-bool insertSpace = atStart || (
-llama_token_get_attr(d_ptr->model, m_tokenize_last_token)
-& (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_USER_DEFINED | LLAMA_TOKEN_ATTR_UNKNOWN)
-);
+bool insertSpace = atStart || isSpecialToken(m_tokenize_last_token);
 std::vector<LLModel::Token> fres(str.length() + 4);
 int32_t fres_len = llama_tokenize_gpt4all(
-d_ptr->model, str.c_str(), str.length(), fres.data(), fres.size(), /*add_special*/ atStart,
+d_ptr->model, str.data(), str.length(), fres.data(), fres.size(), /*add_special*/ atStart,
 /*parse_special*/ special, /*insert_space*/ insertSpace
 );
 fres.resize(fres_len);
@@ -546,6 +551,12 @@ std::vector<LLModel::Token> LLamaModel::tokenize(PromptContext &ctx, const std::
 return fres;
 }
+bool LLamaModel::isSpecialToken(Token id) const
+{
+return llama_token_get_attr(d_ptr->model, id)
+& (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_USER_DEFINED | LLAMA_TOKEN_ATTR_UNKNOWN);
+}
 std::string LLamaModel::tokenToString(Token id) const
 {
 std::vector<char> result(8, 0);
@@ -595,6 +606,30 @@ bool LLamaModel::evalTokens(PromptContext &ctx, const std::vector<int32_t> &toke
 return res == 0;
 }
+void LLamaModel::shiftContext(PromptContext &promptCtx)
+{
+// infinite text generation via context shifting
+// erase up to n_ctx*contextErase tokens
+int n_keep = shouldAddBOS();
+int n_past = promptCtx.n_past;
+int n_discard = std::min(n_past - n_keep, int(promptCtx.n_ctx * promptCtx.contextErase));
+assert(n_discard > 0);
+if (n_discard <= 0)
+return;
+std::cerr << "Llama: context full, swapping: n_past = " << n_past << ", n_keep = " << n_keep
+<< ", n_discard = " << n_discard << "\n";
+// erase the first n_discard tokens from the context
+llama_kv_cache_seq_rm (d_ptr->ctx, 0, n_keep, n_keep + n_discard);
+llama_kv_cache_seq_add(d_ptr->ctx, 0, n_keep + n_discard, n_past, -n_discard);
+promptCtx.tokens.erase(promptCtx.tokens.begin() + n_keep, promptCtx.tokens.begin() + n_keep + n_discard);
+promptCtx.n_past = promptCtx.tokens.size();
+}
 int32_t LLamaModel::contextLength() const
 {
 return llama_n_ctx(d_ptr->ctx);

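The sampler change above short-circuits to a greedy pick when `temp == 0.0` instead of running the temperature pipeline with a zero divisor. An illustrative sketch of that branch in plain Python, where a simple softmax stands in for llama.cpp's sampler chain:

```python
import math
import random

# Illustration of the temp == 0 branch added above: zero temperature means a
# deterministic argmax ("greedy") pick; any other temperature scales the
# logits by 1/temp before sampling. The real code uses llama.cpp's samplers.
def sample(logits, temp):
    if temp == 0.0:
        return max(logits, key=logits.get)         # greedy sampling, no probabilities needed
    scaled = {tok: lg / temp for tok, lg in logits.items()}
    z = sum(math.exp(v) for v in scaled.values())  # softmax normalizer
    weights = [math.exp(v) / z for v in scaled.values()]
    return random.choices(list(scaled), weights=weights, k=1)[0]

logits = {11: 2.0, 42: 3.5, 7: 0.5}
print(sample(logits, temp=0.0))  # always 42
print(sample(logits, temp=0.8))  # usually 42, occasionally 11 or 7
```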

@@ -6,9 +6,9 @@
 #include "llmodel.h"
-#include <functional>
 #include <memory>
 #include <string>
+#include <string_view>
 #include <vector>
 struct LLamaPrivate;
@@ -53,10 +53,12 @@ private:
 bool m_supportsCompletion = false;
 protected:
-std::vector<Token> tokenize(PromptContext &ctx, const std::string &str, bool special) override;
+std::vector<Token> tokenize(PromptContext &ctx, std::string_view str, bool special) override;
+bool isSpecialToken(Token id) const override;
 std::string tokenToString(Token id) const override;
 Token sampleToken(PromptContext &ctx) const override;
 bool evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const override;
+void shiftContext(PromptContext &promptCtx) override;
 int32_t contextLength() const override;
 const std::vector<Token> &endTokens() const override;
 bool shouldAddBOS() const override;


@@ -12,6 +12,7 @@
 #include <memory>
 #include <optional>
 #include <string>
+#include <string_view>
 #include <vector>
 struct LLModelWrapper {
@@ -106,7 +107,7 @@ void llmodel_prompt(llmodel_model model, const char *prompt,
 const char *prompt_template,
 llmodel_prompt_callback prompt_callback,
 llmodel_response_callback response_callback,
-llmodel_recalculate_callback recalculate_callback,
+bool allow_context_shift,
 llmodel_prompt_context *ctx,
 bool special,
 const char *fake_reply)
@@ -130,13 +131,10 @@ void llmodel_prompt(llmodel_model model, const char *prompt,
 wrapper->promptContext.repeat_last_n = ctx->repeat_last_n;
 wrapper->promptContext.contextErase = ctx->context_erase;
-std::string fake_reply_str;
-if (fake_reply) { fake_reply_str = fake_reply; }
-auto *fake_reply_p = fake_reply ? &fake_reply_str : nullptr;
 // Call the C++ prompt method
-wrapper->llModel->prompt(prompt, prompt_template, prompt_callback, response_func, recalculate_callback,
-wrapper->promptContext, special, fake_reply_p);
+wrapper->llModel->prompt(prompt, prompt_template, prompt_callback, response_func, allow_context_shift,
+wrapper->promptContext, special,
+fake_reply ? std::make_optional<std::string_view>(fake_reply) : std::nullopt);
 // Update the C context by giving access to the wrappers raw pointers to std::vector data
 // which involves no copies


@@ -0,0 +1,405 @@
#include "llmodel.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <iostream>
#include <optional>
#include <regex>
#include <sstream>
#include <stdexcept>
#include <string>
#include <string_view>
#include <vector>
namespace ranges = std::ranges;
static bool parsePromptTemplate(const std::string &tmpl, std::vector<std::smatch> &placeholders, std::string &err)
{
static const std::regex placeholderRegex(R"(%[1-2](?![0-9]))");
auto it = std::sregex_iterator(tmpl.begin(), tmpl.end(), placeholderRegex);
placeholders.clear();
placeholders.insert(placeholders.end(), it, std::sregex_iterator());
if (placeholders.size() > 2) {
err = "ERROR: expected at most two placeholders, got " + std::to_string(placeholders.size());
return false;
}
if (placeholders.size() >= 1 && placeholders[0].str() != "%1") {
err = "ERROR: first placeholder must be %1, got " + placeholders[0].str();
return false;
}
if (placeholders.size() >= 2 && placeholders[1].str() != "%2") {
err = "ERROR: second placeholder must be %2, got " + placeholders[1].str();
return false;
}
return true;
}
void LLModel::prompt(const std::string &prompt,
const std::string &promptTemplate,
std::function<bool(int32_t)> promptCallback,
std::function<bool(int32_t, const std::string&)> responseCallback,
bool allowContextShift,
PromptContext &promptCtx,
bool special,
std::optional<std::string_view> fakeReply)
{
if (!isModelLoaded()) {
std::cerr << implementation().modelType() << " ERROR: prompt won't work with an unloaded model!\n";
return;
}
if (!supportsCompletion()) {
std::string errorMessage = "ERROR: this model does not support text completion or chat!";
responseCallback(-1, errorMessage);
std::cerr << implementation().modelType() << " " << errorMessage << "\n";
return;
}
// sanity checks
if (promptCtx.n_past > contextLength()) {
std::ostringstream ss;
ss << "n_past=" << promptCtx.n_past << " is past end of context length=" << contextLength();
throw std::out_of_range(ss.str());
}
if (promptCtx.n_past > promptCtx.tokens.size()) {
std::ostringstream ss;
ss << "n_past=" << promptCtx.n_past << " is past end of token cache length=" << promptCtx.tokens.size();
throw std::out_of_range(ss.str());
}
promptCtx.n_ctx = contextLength();
promptCtx.n_batch = std::min(promptCtx.n_batch, LLMODEL_MAX_PROMPT_BATCH);
if (promptCtx.n_past < promptCtx.tokens.size())
promptCtx.tokens.resize(promptCtx.n_past);
m_tokenize_last_token = promptCtx.tokens.empty() ? -1 : promptCtx.tokens.back(); // not serialized
// parse the prompt template
std::vector<std::smatch> placeholders;
{
std::string err;
if (!parsePromptTemplate(promptTemplate, placeholders, err)) {
responseCallback(-1, err);
std::cerr << err << "\n";
return;
}
}
auto old_n_past = promptCtx.n_past; // prepare to fake n_past for tokenize
// tokenize the user prompt
std::vector<Token> embd_inp;
if (placeholders.empty()) {
// this is unusual, but well-defined
std::cerr << __func__ << ": prompt template has no placeholder\n";
embd_inp = tokenize(promptCtx, promptTemplate, true);
} else {
// template: beginning of user prompt
const auto &phUser = placeholders[0];
std::string userPrefix(phUser.prefix());
if (!userPrefix.empty()) {
embd_inp = tokenize(promptCtx, userPrefix, true);
promptCtx.n_past += embd_inp.size();
}
// user input (shouldn't have special token processing)
auto tokens = tokenize(promptCtx, prompt, special);
embd_inp.insert(embd_inp.end(), tokens.begin(), tokens.end());
promptCtx.n_past += tokens.size();
// template: end of user prompt + start of assistant prompt
size_t start = phUser.position() + phUser.length();
size_t end = placeholders.size() >= 2 ? placeholders[1].position() : promptTemplate.length();
auto userToAsst = promptTemplate.substr(start, end - start);
if (!userToAsst.empty()) {
tokens = tokenize(promptCtx, userToAsst, true);
embd_inp.insert(embd_inp.end(), tokens.begin(), tokens.end());
promptCtx.n_past += tokens.size();
}
}
promptCtx.n_past = old_n_past; // restore n_past so decodePrompt can increment it
// decode the user prompt
if (!decodePrompt(promptCallback, responseCallback, allowContextShift, promptCtx, embd_inp))
return; // error
// decode the assistant's reply, either generated or spoofed
if (!fakeReply) {
generateResponse(responseCallback, allowContextShift, promptCtx);
} else {
embd_inp = tokenize(promptCtx, *fakeReply, false);
if (!decodePrompt(promptCallback, responseCallback, allowContextShift, promptCtx, embd_inp, true))
return; // error
}
// decode the rest of the prompt template
// template: end of assistant prompt
std::string asstSuffix;
if (placeholders.size() >= 2) {
size_t start = placeholders[1].position() + placeholders[1].length();
asstSuffix = promptTemplate.substr(start);
} else {
asstSuffix = "\n\n"; // default to a blank link, good for e.g. Alpaca
}
if (!asstSuffix.empty()) {
embd_inp = tokenize(promptCtx, asstSuffix, true);
decodePrompt(promptCallback, responseCallback, allowContextShift, promptCtx, embd_inp);
}
}
// returns false on error
bool LLModel::decodePrompt(std::function<bool(int32_t)> promptCallback,
std::function<bool(int32_t, const std::string&)> responseCallback,
bool allowContextShift,
PromptContext &promptCtx,
std::vector<Token> embd_inp,
bool isResponse) {
if ((int) embd_inp.size() > promptCtx.n_ctx - 4) {
responseCallback(-1, "ERROR: The prompt size exceeds the context window size and cannot be processed.");
std::cerr << implementation().modelType() << " ERROR: The prompt is " << embd_inp.size() <<
" tokens and the context window is " << promptCtx.n_ctx << "!\n";
return false;
}
// FIXME(jared): There are mitigations for this situation, such as making room before
// copying the prompt context, or restoring the KV cache when we restore the prompt
// context.
if (!allowContextShift && promptCtx.n_past + embd_inp.size() > promptCtx.n_ctx) {
std::cerr << "LLModel Warning: Not enough space, n_past=" << promptCtx.n_past << ", n_eval=" << embd_inp.size()
<< ", n_ctx=" << promptCtx.n_ctx << "\n";
return false;
}
// process the prompt in batches
size_t i = 0;
while (i < embd_inp.size()) {
size_t batch_end = std::min(i + promptCtx.n_batch, embd_inp.size());
std::vector<Token> batch(embd_inp.begin() + i, embd_inp.begin() + batch_end);
// Check if the context has run out...
if (promptCtx.n_past + int32_t(batch.size()) > promptCtx.n_ctx) {
assert(allowContextShift);
shiftContext(promptCtx);
assert(promptCtx.n_past + int32_t(batch.size()) <= promptCtx.n_ctx);
}
if (!evalTokens(promptCtx, batch)) {
std::cerr << implementation().modelType() << " ERROR: Failed to process prompt\n";
return false;
}
size_t tokens = batch_end - i;
for (size_t t = 0; t < tokens; ++t) {
promptCtx.tokens.push_back(batch.at(t));
promptCtx.n_past += 1;
Token tok = batch.at(t);
bool res = isResponse ? responseCallback(tok, tokenToString(tok)) : promptCallback(tok);
if (!res)
return false;
}
i = batch_end;
}
return true;
}
/*
* If string s overlaps with the string key such that some prefix of the key is at the end
* of the string, return the position in s where the first match starts. Otherwise, return
* std::string::npos. Examples:
* s = "bfo", key = "foo" -> 1
* s = "fooa", key = "foo" -> npos
*/
static std::string::size_type stringsOverlap(const std::string &s, const std::string &key)
{
if (s.empty() || key.empty())
throw std::invalid_argument("arguments to stringsOverlap must not be empty");
for (int start = std::max(0, int(s.size()) - int(key.size())); start < s.size(); start++) {
if (s.compare(start, s.size(), key, 0, s.size() - start) == 0)
return start;
}
return std::string::npos;
}
void LLModel::generateResponse(std::function<bool(int32_t, const std::string&)> responseCallback,
bool allowContextShift,
PromptContext &promptCtx) {
static const char *stopSequences[] {
"### Instruction", "### Prompt", "### Response", "### Human", "### Assistant", "### Context",
};
// Don't even start if there is no room
if (!promptCtx.n_predict)
return;
if (!allowContextShift && promptCtx.n_past >= promptCtx.n_ctx) {
std::cerr << "LLModel Warning: Not enough space, n_past=" << promptCtx.n_past << ", n_ctx=" << promptCtx.n_ctx
<< "\n";
return;
}
std::string cachedResponse;
std::vector<Token> cachedTokens;
int n_predicted = 0;
// Predict next tokens
for (bool stop = false; !stop;) {
// Sample next token
std::optional<Token> new_tok = sampleToken(promptCtx);
std::string new_piece = tokenToString(new_tok.value());
cachedTokens.push_back(new_tok.value());
cachedResponse += new_piece;
auto accept = [this, &promptCtx, &cachedTokens, &new_tok, allowContextShift]() -> bool {
// Shift context if out of space
if (promptCtx.n_past >= promptCtx.n_ctx) {
(void)allowContextShift;
assert(allowContextShift);
shiftContext(promptCtx);
assert(promptCtx.n_past < promptCtx.n_ctx);
}
// Accept the token
Token tok = std::exchange(new_tok, std::nullopt).value();
if (!evalTokens(promptCtx, { tok })) {
// TODO(jared): raise an exception
std::cerr << implementation().modelType() << " ERROR: Failed to predict next token\n";
return false;
}
promptCtx.tokens.push_back(tok);
promptCtx.n_past += 1;
return true;
};
// Check for EOS
auto lengthLimit = std::string::npos;
for (const auto token : endTokens()) {
if (new_tok == token) {
stop = true;
lengthLimit = cachedResponse.size() - new_piece.size();
}
}
if (lengthLimit != std::string::npos) {
// EOS matched
} else if (!isSpecialToken(new_tok.value())) {
// Check if the response contains a stop sequence
for (const auto &p : stopSequences) {
auto match = cachedResponse.find(p);
if (match != std::string::npos) stop = true;
lengthLimit = std::min(lengthLimit, match);
if (match == 0) break;
}
// Check if the response matches the start of a stop sequence
if (lengthLimit == std::string::npos) {
for (const auto &p : stopSequences) {
auto match = stringsOverlap(cachedResponse, p);
lengthLimit = std::min(lengthLimit, match);
if (match == 0) break;
}
}
} else if (ranges::find(stopSequences, new_piece) < std::end(stopSequences)) {
// Special tokens must exactly match a stop sequence
stop = true;
lengthLimit = cachedResponse.size() - new_piece.size();
}
// Optionally stop if the context will run out
if (!allowContextShift && promptCtx.n_past + cachedTokens.size() >= promptCtx.n_ctx) {
std::cerr << "LLModel Warning: Not enough space, n_past=" << promptCtx.n_past << ", n_ctx="
<< promptCtx.n_ctx << "\n";
stop = true;
}
// Empty the cache, up to the length limit
std::string::size_type responseLength = 0;
while (!cachedTokens.empty()) {
Token tok = cachedTokens.front();
std::string piece = tokenToString(tok);
// Stop if the piece (or part of it) does not fit within the length limit
if (responseLength + (stop ? 1 : piece.size()) > lengthLimit)
break;
// Remove token from cache
assert(cachedResponse.starts_with(piece));
cachedTokens.erase(cachedTokens.begin(), cachedTokens.begin() + 1);
cachedResponse.erase(cachedResponse.begin(), cachedResponse.begin() + piece.size());
// Accept the token, if needed (not cached)
if (cachedTokens.empty() && new_tok && !accept())
return;
// Send the token
if (!responseCallback(tok, piece) || ++n_predicted >= promptCtx.n_predict) {
stop = true;
break;
}
// FIXME(jared): we could avoid printing partial stop sequences if we didn't have to
// output token IDs and could cache a partial token for the next prompt call
responseLength += piece.size();
}
assert(cachedTokens.empty() == cachedResponse.empty());
// Accept the token, if needed (in cache)
if (new_tok) {
assert(!cachedTokens.empty() && cachedTokens.back() == new_tok);
if (stop) {
cachedTokens.pop_back();
} else if (!accept()) {
return;
}
}
}
auto &tokens = promptCtx.tokens;
if (tokens.size() < cachedTokens.size()) {
/* This is theoretically possible if the longest stop sequence is greater than
* n_ctx * contextErase tokens. */
throw std::runtime_error("shifted too much context, can't go back");
}
auto discard_start = tokens.end() - cachedTokens.size();
assert(std::equal(discard_start, tokens.end(), cachedTokens.begin()));
tokens.erase(discard_start, tokens.end());
promptCtx.n_past -= cachedTokens.size();
}
void LLModel::embed(
const std::vector<std::string> &texts, float *embeddings, std::optional<std::string> prefix, int dimensionality,
size_t *tokenCount, bool doMean, bool atlas, EmbedCancelCallback *cancelCb
) {
(void)texts;
(void)embeddings;
(void)prefix;
(void)dimensionality;
(void)tokenCount;
(void)doMean;
(void)atlas;
(void)cancelCb;
throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings");
}
void LLModel::embed(
const std::vector<std::string> &texts, float *embeddings, bool isRetrieval, int dimensionality, size_t *tokenCount,
bool doMean, bool atlas
) {
(void)texts;
(void)embeddings;
(void)isRetrieval;
(void)dimensionality;
(void)tokenCount;
(void)doMean;
(void)atlas;
throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings");
}
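The new `generateResponse` above no longer emits tokens as soon as they are sampled: it caches them until it is sure they are not the start of a stop sequence, using `stringsOverlap` to detect a partial match at the end of the cached text. A direct Python rendering of that helper and of the length-limit rule (illustration only, using the same examples as the C++ comment):

```python
# Python rendering of stringsOverlap from the C++ above: if some prefix of
# `key` sits at the end of `s`, return where that prefix starts, else None.
#   strings_overlap("bfo", "foo")  -> 1
#   strings_overlap("fooa", "foo") -> None
def strings_overlap(s, key):
    if not s or not key:
        raise ValueError("arguments must not be empty")
    for start in range(max(0, len(s) - len(key)), len(s)):
        if key.startswith(s[start:]):
            return start
    return None

def safe_to_emit(cached, stop_sequences):
    """How many characters of `cached` can be streamed to the caller without
    risking that they belong to a stop sequence (mirrors the lengthLimit logic)."""
    limit = len(cached)
    found_full = False
    for stop in stop_sequences:
        pos = cached.find(stop)
        if pos != -1:                   # a stop sequence already occurs: cut before it
            found_full = True
            limit = min(limit, pos)
    if not found_full:
        for stop in stop_sequences:
            pos = strings_overlap(cached, stop)
            if pos is not None:         # the tail could still grow into a stop sequence: hold it back
                limit = min(limit, pos)
    return limit

print(strings_overlap("bfo", "foo"))                            # 1
print(safe_to_emit("The answer is 42\n### Hu", ["### Human"]))  # 17 -> "### Hu" is held back
```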

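Earlier in the same file, `parsePromptTemplate` accepts at most two placeholders, `%1` for the user prompt and `%2` for the assistant reply, and `LLModel::prompt` then tokenizes the template as three slices around them. A sketch of that splitting in plain Python (an illustration of the logic, not the project API):

```python
import re

# Sketch of the %1/%2 template handling in LLModel::prompt above: the template
# is cut into the text before %1, the text between %1 and %2, and the text
# after %2; the user prompt and the (generated or spoofed) reply fill the gaps.
PLACEHOLDER = re.compile(r"%[1-2](?![0-9])")   # same pattern as placeholderRegex

def split_template(template):
    marks = list(PLACEHOLDER.finditer(template))
    if len(marks) > 2:
        raise ValueError("expected at most two placeholders, got %d" % len(marks))
    if len(marks) >= 1 and marks[0].group() != "%1":
        raise ValueError("first placeholder must be %1")
    if len(marks) >= 2 and marks[1].group() != "%2":
        raise ValueError("second placeholder must be %2")
    if not marks:                      # unusual, but well-defined: whole template is a prefix
        return template, "", "\n\n"
    user_prefix = template[:marks[0].start()]
    end = marks[1].start() if len(marks) >= 2 else len(template)
    user_to_asst = template[marks[0].end():end]
    asst_suffix = template[marks[1].end():] if len(marks) >= 2 else "\n\n"
    return user_prefix, user_to_asst, asst_suffix

print(split_template("### Human:\n%1\n### Assistant:\n%2\n"))
# ('### Human:\n', '\n### Assistant:\n', '\n')
```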

@@ -4,6 +4,33 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
+## [Unreleased]
+
+### Added
+- Warn on Windows if the Microsoft Visual C++ runtime libraries are not found ([#2920](https://github.com/nomic-ai/gpt4all/pull/2920))
+
+## [2.8.2] - 2024-08-14
+
+### Fixed
+- Fixed incompatibility with Python 3.8 since v2.7.0 and Python <=3.11 since v2.8.1 ([#2871](https://github.com/nomic-ai/gpt4all/pull/2871))
+
+## [2.8.1] - 2024-08-13
+
+### Added
+- Use greedy sampling when temperature is set to zero ([#2854](https://github.com/nomic-ai/gpt4all/pull/2854))
+
+### Changed
+- Search for pip-installed CUDA 11 as well as CUDA 12 ([#2802](https://github.com/nomic-ai/gpt4all/pull/2802))
+- Stop shipping CUBINs to reduce wheel size ([#2802](https://github.com/nomic-ai/gpt4all/pull/2802))
+- Use llama\_kv\_cache ops to shift context faster ([#2781](https://github.com/nomic-ai/gpt4all/pull/2781))
+- Don't stop generating at end of context ([#2781](https://github.com/nomic-ai/gpt4all/pull/2781))
+
+### Fixed
+- Make reverse prompt detection work more reliably and prevent it from breaking output ([#2781](https://github.com/nomic-ai/gpt4all/pull/2781))
+- Explicitly target macOS 12.6 in CI to fix Metal compatibility on older macOS ([#2849](https://github.com/nomic-ai/gpt4all/pull/2849))
+- Do not initialize Vulkan driver when only using CPU ([#2843](https://github.com/nomic-ai/gpt4all/pull/2843))
+- Fix a segfault on exit when using CPU mode on Linux with NVIDIA and EGL ([#2843](https://github.com/nomic-ai/gpt4all/pull/2843))
+
 ## [2.8.0] - 2024-08-05
 ### Added
@@ -16,6 +43,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
 - Detect use of a Python interpreter under Rosetta for a clearer error message ([#2793](https://github.com/nomic-ai/gpt4all/pull/2793))
 ### Changed
+- Build against CUDA 11.8 instead of CUDA 12 for better compatibility with older drivers ([#2639](https://github.com/nomic-ai/gpt4all/pull/2639))
 - Update llama.cpp to commit 87e397d00 from July 19th ([#2694](https://github.com/nomic-ai/gpt4all/pull/2694))
 ### Removed
@@ -33,4 +61,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
 - Restore leading space removal logic that was incorrectly removed in [#2694](https://github.com/nomic-ai/gpt4all/pull/2694)
 - CUDA: Cherry-pick llama.cpp DMMV cols requirement fix that caused a crash with long conversations since [#2694](https://github.com/nomic-ai/gpt4all/pull/2694)
+[Unreleased]: https://github.com/nomic-ai/gpt4all/compare/python-v2.8.2...HEAD
+[2.8.2]: https://github.com/nomic-ai/gpt4all/compare/python-v2.8.1...python-v2.8.2
+[2.8.1]: https://github.com/nomic-ai/gpt4all/compare/python-v2.8.0...python-v2.8.1
 [2.8.0]: https://github.com/nomic-ai/gpt4all/compare/python-v2.7.0...python-v2.8.0


@ -4,7 +4,7 @@
It is possible you are trying to load a model from HuggingFace whose weights are not compatible with our [backend](https://github.com/nomic-ai/gpt4all/tree/main/gpt4all-bindings). It is possible you are trying to load a model from HuggingFace whose weights are not compatible with our [backend](https://github.com/nomic-ai/gpt4all/tree/main/gpt4all-bindings).
Try downloading one of the officially supported models mentioned our [website](https://gpt4all.io/). If the problem persists, please share your experience on our [Discord](https://discord.com/channels/1076964370942267462). Try downloading one of the officially supported models listed on the main models page in the application. If the problem persists, please share your experience on our [Discord](https://discord.com/channels/1076964370942267462).
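For reference, a minimal sketch of loading one of the officially supported models through the Python bindings (the filename below is only an example; use any name shown on the application's models page, and note the constructor will download the file if it is not already present):

```
from gpt4all import GPT4All

# Example filename only -- any officially supported GGUF model works here.
model = GPT4All("Meta-Llama-3-8B-Instruct.Q4_0.gguf")
with model.chat_session():
    print(model.generate("Hello!", max_tokens=64))
```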
## Bad Responses ## Bad Responses

View File

@ -37,27 +37,49 @@ if platform.system() == "Darwin" and platform.processor() == "i386":
raise RuntimeError(textwrap.dedent("""\ raise RuntimeError(textwrap.dedent("""\
Running GPT4All under Rosetta is not supported due to CPU feature requirements. Running GPT4All under Rosetta is not supported due to CPU feature requirements.
Please install GPT4All in an environment that uses a native ARM64 Python interpreter. Please install GPT4All in an environment that uses a native ARM64 Python interpreter.
""")) """).strip())
# Check for C++ runtime libraries
if platform.system() == "Windows":
try:
ctypes.CDLL("msvcp140.dll")
ctypes.CDLL("vcruntime140.dll")
ctypes.CDLL("vcruntime140_1.dll")
except OSError as e:
print(textwrap.dedent(f"""\
{e!r}
The Microsoft Visual C++ runtime libraries were not found. Please install them from
https://aka.ms/vs/17/release/vc_redist.x64.exe
"""), file=sys.stderr)
def _load_cuda(rtver: str, blasver: str) -> None:
if platform.system() == "Linux":
cudalib = f"lib/libcudart.so.{rtver}"
cublaslib = f"lib/libcublas.so.{blasver}"
else: # Windows
cudalib = fr"bin\cudart64_{rtver.replace('.', '')}.dll"
cublaslib = fr"bin\cublas64_{blasver}.dll"
# preload the CUDA libs so the backend can find them
ctypes.CDLL(os.path.join(cuda_runtime.__path__[0], cudalib), mode=ctypes.RTLD_GLOBAL)
ctypes.CDLL(os.path.join(cublas.__path__[0], cublaslib), mode=ctypes.RTLD_GLOBAL)
# Find CUDA libraries from the official packages # Find CUDA libraries from the official packages
cuda_found = False cuda_found = False
if platform.system() in ('Linux', 'Windows'): if platform.system() in ("Linux", "Windows"):
try: try:
from nvidia import cuda_runtime, cublas from nvidia import cuda_runtime, cublas
except ImportError: except ImportError:
pass # CUDA is optional pass # CUDA is optional
else: else:
if platform.system() == 'Linux': for rtver, blasver in [("12", "12"), ("11.0", "11")]:
cudalib = 'lib/libcudart.so.12' try:
cublaslib = 'lib/libcublas.so.12' _load_cuda(rtver, blasver)
else: # Windows
cudalib = r'bin\cudart64_12.dll'
cublaslib = r'bin\cublas64_12.dll'
# preload the CUDA libs so the backend can find them
ctypes.CDLL(os.path.join(cuda_runtime.__path__[0], cudalib), mode=ctypes.RTLD_GLOBAL)
ctypes.CDLL(os.path.join(cublas.__path__[0], cublaslib), mode=ctypes.RTLD_GLOBAL)
cuda_found = True cuda_found = True
except OSError: # dlopen() does not give specific error codes
pass # try the next one
# TODO: provide a config file to make this more robust # TODO: provide a config file to make this more robust
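The loop above preloads the newest CUDA runtime it can find and silently falls back to older ones. A standalone sketch of the same try-in-order pattern, with example library names (not an exhaustive or authoritative list):

```
import ctypes
from typing import List, Optional

def preload_first_available(candidates: List[str]) -> Optional[str]:
    # Try each shared library in order; dlopen() gives no specific error
    # codes, so any OSError just means "try the next candidate".
    for name in candidates:
        try:
            ctypes.CDLL(name, mode=ctypes.RTLD_GLOBAL)
        except OSError:
            continue
        return name
    return None

loaded = preload_first_available(["libcudart.so.12", "libcudart.so.11.0"])
print("preloaded:", loaded or "none (CPU-only still works)")
```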
@ -128,7 +150,6 @@ llmodel.llmodel_isModelLoaded.restype = ctypes.c_bool
PromptCallback = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.c_int32) PromptCallback = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.c_int32)
ResponseCallback = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.c_int32, ctypes.c_char_p) ResponseCallback = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.c_int32, ctypes.c_char_p)
RecalculateCallback = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.c_bool)
EmbCancelCallback = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.POINTER(ctypes.c_uint), ctypes.c_uint, ctypes.c_char_p) EmbCancelCallback = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.POINTER(ctypes.c_uint), ctypes.c_uint, ctypes.c_char_p)
llmodel.llmodel_prompt.argtypes = [ llmodel.llmodel_prompt.argtypes = [
@ -137,7 +158,7 @@ llmodel.llmodel_prompt.argtypes = [
ctypes.c_char_p, ctypes.c_char_p,
PromptCallback, PromptCallback,
ResponseCallback, ResponseCallback,
RecalculateCallback, ctypes.c_bool,
ctypes.POINTER(LLModelPromptContext), ctypes.POINTER(LLModelPromptContext),
ctypes.c_bool, ctypes.c_bool,
ctypes.c_char_p, ctypes.c_char_p,
@ -513,7 +534,7 @@ class LLModel:
ctypes.c_char_p(prompt_template.encode()), ctypes.c_char_p(prompt_template.encode()),
PromptCallback(self._prompt_callback), PromptCallback(self._prompt_callback),
ResponseCallback(self._callback_decoder(callback)), ResponseCallback(self._callback_decoder(callback)),
RecalculateCallback(self._recalculate_callback), True,
self.context, self.context,
special, special,
ctypes.c_char_p(), ctypes.c_char_p(),
@ -606,8 +627,3 @@ class LLModel:
@staticmethod @staticmethod
def _prompt_callback(token_id: int) -> bool: def _prompt_callback(token_id: int) -> bool:
return True return True
# Empty recalculate callback
@staticmethod
def _recalculate_callback(is_recalculating: bool) -> bool:
return is_recalculating

View File

@ -209,27 +209,27 @@ class GPT4All:
self._current_prompt_template: str = "{0}" self._current_prompt_template: str = "{0}"
device_init = None device_init = None
if sys.platform == 'darwin': if sys.platform == "darwin":
if device is None: if device is None:
backend = 'auto' # 'auto' is effectively 'metal' due to currently non-functional fallback backend = "auto" # "auto" is effectively "metal" due to currently non-functional fallback
elif device == 'cpu': elif device == "cpu":
backend = 'cpu' backend = "cpu"
else: else:
if platform.machine() != 'arm64' or device != 'gpu': if platform.machine() != "arm64" or device != "gpu":
raise ValueError(f'Unknown device for this platform: {device}') raise ValueError(f"Unknown device for this platform: {device}")
backend = 'metal' backend = "metal"
else: else:
backend = 'kompute' backend = "kompute"
if device is None or device == 'cpu': if device is None or device == "cpu":
pass # use kompute with no device pass # use kompute with no device
elif device in ('cuda', 'kompute'): elif device in ("cuda", "kompute"):
backend = device backend = device
device_init = 'gpu' device_init = "gpu"
elif device.startswith('cuda:'): elif device.startswith("cuda:"):
backend = 'cuda' backend = "cuda"
device_init = device.removeprefix('cuda:') device_init = _remove_prefix(device, "cuda:")
else: else:
device_init = device.removeprefix('kompute:') device_init = _remove_prefix(device, "kompute:")
# Retrieve model and download if allowed # Retrieve model and download if allowed
self.config: ConfigType = self.retrieve_model(model_name, model_path=model_path, allow_download=allow_download, verbose=verbose) self.config: ConfigType = self.retrieve_model(model_name, model_path=model_path, allow_download=allow_download, verbose=verbose)
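A usage sketch of the device strings this selection logic accepts (the model filename is a placeholder; the comments summarize the branches above):

```
from gpt4all import GPT4All

# Per the selection logic above:
#   device=None        -> Metal on macOS; elsewhere Kompute with no GPU selected
#   device="cpu"       -> CPU backend
#   device="gpu"       -> best available GPU (Metal on Apple Silicon, Kompute elsewhere)
#   device="cuda"      -> CUDA backend, best available GPU
#   device="cuda:0"    -> CUDA backend, GPU index 0
#   device="kompute:1" -> Kompute (Vulkan) backend, GPU index 1
model = GPT4All("Meta-Llama-3-8B-Instruct.Q4_0.gguf", device="cuda:0")
```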
@ -357,7 +357,7 @@ class GPT4All:
expected_md5: str | None = None, expected_md5: str | None = None,
) -> str | os.PathLike[str]: ) -> str | os.PathLike[str]:
""" """
Download model from https://gpt4all.io. Download model from gpt4all.io.
Args: Args:
model_filename: Filename of model (with .gguf extension). model_filename: Filename of model (with .gguf extension).
@ -706,3 +706,7 @@ def _fsync(fd: int | _HasFileno) -> None:
else: else:
return return
os.fsync(fd) os.fsync(fd)
def _remove_prefix(s: str, prefix: str) -> str:
return s[len(prefix):] if s.startswith(prefix) else s
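`_remove_prefix` mirrors `str.removeprefix()` (added in Python 3.9), presumably kept as a local helper so the bindings run on Python 3.8, consistent with the changelog entry above. A self-contained usage sketch:

```
def _remove_prefix(s: str, prefix: str) -> str:
    return s[len(prefix):] if s.startswith(prefix) else s

assert _remove_prefix("cuda:0", "cuda:") == "0"
assert _remove_prefix("gpu", "cuda:") == "gpu"  # unchanged when the prefix is absent
```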

View File

@ -68,16 +68,17 @@ def get_long_description():
setup( setup(
name=package_name, name=package_name,
version="2.8.0", version="2.8.3.dev0",
description="Python bindings for GPT4All", description="Python bindings for GPT4All",
long_description=get_long_description(), long_description=get_long_description(),
long_description_content_type="text/markdown", long_description_content_type="text/markdown",
author="Nomic and the Open Source Community", author="Nomic and the Open Source Community",
author_email="support@nomic.ai", author_email="support@nomic.ai",
url="https://gpt4all.io/", url="https://www.nomic.ai/gpt4all",
project_urls={ project_urls={
"Documentation": "https://docs.gpt4all.io/gpt4all_python.html", "Documentation": "https://docs.gpt4all.io/gpt4all_python.html",
"Source code": "https://github.com/nomic-ai/gpt4all/tree/main/gpt4all-bindings/python", "Source code": "https://github.com/nomic-ai/gpt4all/tree/main/gpt4all-bindings/python",
"Changelog": "https://github.com/nomic-ai/gpt4all/blob/main/gpt4all-bindings/python/CHANGELOG.md",
}, },
classifiers = [ classifiers = [
"Programming Language :: Python :: 3", "Programming Language :: Python :: 3",
@ -94,8 +95,8 @@ setup(
], ],
extras_require={ extras_require={
'cuda': [ 'cuda': [
'nvidia-cuda-runtime-cu12', 'nvidia-cuda-runtime-cu11',
'nvidia-cublas-cu12', 'nvidia-cublas-cu11',
], ],
'all': [ 'all': [
'gpt4all[cuda]; platform_system == "Windows" or platform_system == "Linux"', 'gpt4all[cuda]; platform_system == "Windows" or platform_system == "Linux"',

124
gpt4all-chat/CHANGELOG.md Normal file
View File

@ -0,0 +1,124 @@
# Changelog
All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
## [Unreleased]
### Added
- Use greedy sampling when temperature is set to zero ([#2854](https://github.com/nomic-ai/gpt4all/pull/2854))
- Use configured system prompt in server mode and ignore system messages ([#2921](https://github.com/nomic-ai/gpt4all/pull/2921), [#2924](https://github.com/nomic-ai/gpt4all/pull/2924))
- Add more system information to anonymous usage stats ([#2939](https://github.com/nomic-ai/gpt4all/pull/2939))
- Check for unsupported Ubuntu and macOS versions at install time ([#2940](https://github.com/nomic-ai/gpt4all/pull/2940))
### Changed
- The offline update button now directs users to the offline installer releases page. (by [@3Simplex](https://github.com/3Simplex) in [#2888](https://github.com/nomic-ai/gpt4all/pull/2888))
- Change the website link on the home page to point to the new URL ([#2915](https://github.com/nomic-ai/gpt4all/pull/2915))
- Smaller default window size, dynamic minimum size, and scaling tweaks ([#2904](https://github.com/nomic-ai/gpt4all/pull/2904))
- Only allow a single instance of the program to run at a time ([#2923](https://github.com/nomic-ai/gpt4all/pull/2923))
### Fixed
- Bring back "Auto" option for Embeddings Device as "Application default," which went missing in v3.1.0 ([#2873](https://github.com/nomic-ai/gpt4all/pull/2873))
- Correct a few strings in the Italian translation (by [@Harvester62](https://github.com/Harvester62) in [#2872](https://github.com/nomic-ai/gpt4all/pull/2872) and [#2909](https://github.com/nomic-ai/gpt4all/pull/2909))
- Correct typos in Traditional Chinese translation (by [@supersonictw](https://github.com/supersonictw) in [#2852](https://github.com/nomic-ai/gpt4all/pull/2852))
- Set the window icon on Linux ([#2880](https://github.com/nomic-ai/gpt4all/pull/2880))
- Corrections to the Romanian translation (by [@SINAPSA-IC](https://github.com/SINAPSA-IC) in [#2890](https://github.com/nomic-ai/gpt4all/pull/2890))
- Fix singular/plural forms of LocalDocs "x Sources" (by [@cosmic-snow](https://github.com/cosmic-snow) in [#2885](https://github.com/nomic-ai/gpt4all/pull/2885))
- Fix a typo in Model Settings (by [@3Simplex](https://github.com/3Simplex) in [#2916](https://github.com/nomic-ai/gpt4all/pull/2916))
- Fix the antenna icon tooltip when using the local server ([#2922](https://github.com/nomic-ai/gpt4all/pull/2922))
- Fix a few issues with locating files and handling errors when loading remote models on startup ([#2875](https://github.com/nomic-ai/gpt4all/pull/2875))
- Significantly improve API server request parsing and response correctness ([#2929](https://github.com/nomic-ai/gpt4all/pull/2929))
- Removed unnecessary dependency on Qt WaylandCompositor module ([#2949](https://github.com/nomic-ai/gpt4all/pull/2949))
## [3.2.1] - 2024-08-13
### Fixed
- Do not initialize Vulkan driver when only using CPU ([#2843](https://github.com/nomic-ai/gpt4all/pull/2843))
- Fix a potential crash on exit when using only CPU on Linux with NVIDIA (does not affect X11) ([#2843](https://github.com/nomic-ai/gpt4all/pull/2843))
- Fix default CUDA architecture list after [#2802](https://github.com/nomic-ai/gpt4all/pull/2802) ([#2855](https://github.com/nomic-ai/gpt4all/pull/2855))
## [3.2.0] - 2024-08-12
### Added
- Add Qwen2-1.5B-Instruct to models3.json (by [@ThiloteE](https://github.com/ThiloteE) in [#2759](https://github.com/nomic-ai/gpt4all/pull/2759))
- Enable translation feature for seven languages: English, Spanish, Italian, Portuguese, Chinese Simplified, Chinese Traditional, Romanian ([#2830](https://github.com/nomic-ai/gpt4all/pull/2830))
### Changed
- Add missing entries to Italian translation (by [@Harvester62](https://github.com/Harvester62) in [#2783](https://github.com/nomic-ai/gpt4all/pull/2783))
- Use llama\_kv\_cache ops to shift context faster ([#2781](https://github.com/nomic-ai/gpt4all/pull/2781))
- Don't stop generating at end of context ([#2781](https://github.com/nomic-ai/gpt4all/pull/2781))
### Fixed
- Case-insensitive LocalDocs source icon detection (by [@cosmic-snow](https://github.com/cosmic-snow) in [#2761](https://github.com/nomic-ai/gpt4all/pull/2761))
- Fix comparison of pre- and post-release versions for update check and models3.json ([#2762](https://github.com/nomic-ai/gpt4all/pull/2762), [#2772](https://github.com/nomic-ai/gpt4all/pull/2772))
- Fix several backend issues ([#2778](https://github.com/nomic-ai/gpt4all/pull/2778))
- Restore leading space removal logic that was incorrectly removed in [#2694](https://github.com/nomic-ai/gpt4all/pull/2694)
- CUDA: Cherry-pick llama.cpp DMMV cols requirement fix that caused a crash with long conversations since [#2694](https://github.com/nomic-ai/gpt4all/pull/2694)
- Make reverse prompt detection work more reliably and prevent it from breaking output ([#2781](https://github.com/nomic-ai/gpt4all/pull/2781))
- Disallow context shift for chat name and follow-up generation to prevent bugs ([#2781](https://github.com/nomic-ai/gpt4all/pull/2781))
- Explicitly target macOS 12.6 in CI to fix Metal compatibility on older macOS ([#2846](https://github.com/nomic-ai/gpt4all/pull/2846))
## [3.1.1] - 2024-07-27
### Added
- Add Llama 3.1 8B Instruct to models3.json (by [@3Simplex](https://github.com/3Simplex) in [#2731](https://github.com/nomic-ai/gpt4all/pull/2731) and [#2732](https://github.com/nomic-ai/gpt4all/pull/2732))
- Portuguese (BR) translation (by [thiagojramos](https://github.com/thiagojramos) in [#2733](https://github.com/nomic-ai/gpt4all/pull/2733))
- Support adding arbitrary OpenAI-compatible models by URL (by [@supersonictw](https://github.com/supersonictw) in [#2683](https://github.com/nomic-ai/gpt4all/pull/2683))
- Support Llama 3.1 RoPE scaling ([#2758](https://github.com/nomic-ai/gpt4all/pull/2758))
### Changed
- Add missing entries to Chinese (Simplified) translation (by [wuodoo](https://github.com/wuodoo) in [#2716](https://github.com/nomic-ai/gpt4all/pull/2716) and [#2749](https://github.com/nomic-ai/gpt4all/pull/2749))
- Update translation files and add missing paths to CMakeLists.txt ([#2735](https://github.com/nomic-ai/gpt4all/pull/2735))
## [3.1.0] - 2024-07-24
### Added
- Generate suggested follow-up questions ([#2634](https://github.com/nomic-ai/gpt4all/pull/2634), [#2723](https://github.com/nomic-ai/gpt4all/pull/2723))
- Also add options for the chat name and follow-up question prompt templates
- Scaffolding for translations ([#2612](https://github.com/nomic-ai/gpt4all/pull/2612))
- Spanish (MX) translation (by [@jstayco](https://github.com/jstayco) in [#2654](https://github.com/nomic-ai/gpt4all/pull/2654))
- Chinese (Simplified) translation by mikage ([#2657](https://github.com/nomic-ai/gpt4all/pull/2657))
- Dynamic changes of language and locale at runtime ([#2659](https://github.com/nomic-ai/gpt4all/pull/2659), [#2677](https://github.com/nomic-ai/gpt4all/pull/2677))
- Romanian translation by [@SINAPSA\_IC](https://github.com/SINAPSA_IC) ([#2662](https://github.com/nomic-ai/gpt4all/pull/2662))
- Chinese (Traditional) translation (by [@supersonictw](https://github.com/supersonictw) in [#2661](https://github.com/nomic-ai/gpt4all/pull/2661))
- Italian translation (by [@Harvester62](https://github.com/Harvester62) in [#2700](https://github.com/nomic-ai/gpt4all/pull/2700))
### Changed
- Customize combo boxes and context menus to fit the new style ([#2535](https://github.com/nomic-ai/gpt4all/pull/2535))
- Improve view bar scaling and Model Settings layout ([#2520](https://github.com/nomic-ai/gpt4all/pull/2520))
- Make the logo spin while the model is generating ([#2557](https://github.com/nomic-ai/gpt4all/pull/2557))
- Server: Reply to wrong GET/POST method with HTTP 405 instead of 404 (by [@cosmic-snow](https://github.com/cosmic-snow) in [#2615](https://github.com/nomic-ai/gpt4all/pull/2615))
- Update theme for menus (by [@3Simplex](https://github.com/3Simplex) in [#2578](https://github.com/nomic-ai/gpt4all/pull/2578))
- Move the "stop" button to the message box ([#2561](https://github.com/nomic-ai/gpt4all/pull/2561))
- Build with CUDA 11.8 for better compatibility ([#2639](https://github.com/nomic-ai/gpt4all/pull/2639))
- Make links in latest news section clickable ([#2643](https://github.com/nomic-ai/gpt4all/pull/2643))
- Support translation of settings choices ([#2667](https://github.com/nomic-ai/gpt4all/pull/2667), [#2690](https://github.com/nomic-ai/gpt4all/pull/2690))
- Improve LocalDocs view's error message (by @cosmic-snow in [#2679](https://github.com/nomic-ai/gpt4all/pull/2679))
- Ignore case of LocalDocs file extensions ([#2642](https://github.com/nomic-ai/gpt4all/pull/2642), [#2684](https://github.com/nomic-ai/gpt4all/pull/2684))
- Update llama.cpp to commit 87e397d00 from July 19th ([#2694](https://github.com/nomic-ai/gpt4all/pull/2694), [#2702](https://github.com/nomic-ai/gpt4all/pull/2702))
- Add support for GPT-NeoX, Gemma 2, OpenELM, ChatGLM, and Jais architectures (all with Vulkan support)
- Add support for DeepSeek-V2 architecture (no Vulkan support)
- Enable Vulkan support for StarCoder2, XVERSE, Command R, and OLMo
- Show scrollbar in chat collections list as needed (by [@cosmic-snow](https://github.com/cosmic-snow) in [#2691](https://github.com/nomic-ai/gpt4all/pull/2691))
### Removed
- Remove support for GPT-J models ([#2676](https://github.com/nomic-ai/gpt4all/pull/2676), [#2693](https://github.com/nomic-ai/gpt4all/pull/2693))
### Fixed
- Fix placement of thumbs-down and datalake opt-in dialogs ([#2540](https://github.com/nomic-ai/gpt4all/pull/2540))
- Select the correct folder with the Linux fallback folder dialog ([#2541](https://github.com/nomic-ai/gpt4all/pull/2541))
- Fix clone button sometimes producing blank model info ([#2545](https://github.com/nomic-ai/gpt4all/pull/2545))
- Fix jerky chat view scrolling ([#2555](https://github.com/nomic-ai/gpt4all/pull/2555))
- Fix "reload" showing for chats with missing models ([#2520](https://github.com/nomic-ai/gpt4all/pull/2520)
- Fix property binding loop warning ([#2601](https://github.com/nomic-ai/gpt4all/pull/2601))
- Fix UI hang with certain chat view content ([#2543](https://github.com/nomic-ai/gpt4all/pull/2543))
- Fix crash when Kompute falls back to CPU ([#2640](https://github.com/nomic-ai/gpt4all/pull/2640))
- Fix several Vulkan resource management issues ([#2694](https://github.com/nomic-ai/gpt4all/pull/2694))
- Fix crash/hang when some models stop generating, by showing special tokens ([#2701](https://github.com/nomic-ai/gpt4all/pull/2701))
[Unreleased]: https://github.com/nomic-ai/gpt4all/compare/v3.2.1...HEAD
[3.2.1]: https://github.com/nomic-ai/gpt4all/compare/v3.2.0...v3.2.1
[3.2.0]: https://github.com/nomic-ai/gpt4all/compare/v3.1.1...v3.2.0
[3.1.1]: https://github.com/nomic-ai/gpt4all/compare/v3.1.0...v3.1.1
[3.1.0]: https://github.com/nomic-ai/gpt4all/compare/v3.0.0...v3.1.0

View File

@ -1,7 +1,7 @@
cmake_minimum_required(VERSION 3.16) cmake_minimum_required(VERSION 3.16)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD 23)
set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_STANDARD_REQUIRED ON)
if(APPLE) if(APPLE)
@ -17,7 +17,7 @@ if(APPLE)
endif() endif()
set(APP_VERSION_MAJOR 3) set(APP_VERSION_MAJOR 3)
set(APP_VERSION_MINOR 1) set(APP_VERSION_MINOR 2)
set(APP_VERSION_PATCH 2) set(APP_VERSION_PATCH 2)
set(APP_VERSION_BASE "${APP_VERSION_MAJOR}.${APP_VERSION_MINOR}.${APP_VERSION_PATCH}") set(APP_VERSION_BASE "${APP_VERSION_MAJOR}.${APP_VERSION_MINOR}.${APP_VERSION_PATCH}")
set(APP_VERSION "${APP_VERSION_BASE}-dev0") set(APP_VERSION "${APP_VERSION_BASE}-dev0")
@ -31,10 +31,8 @@ project(gpt4all VERSION ${APP_VERSION_BASE} LANGUAGES CXX C)
set(CMAKE_AUTOMOC ON) set(CMAKE_AUTOMOC ON)
set(CMAKE_AUTORCC ON) set(CMAKE_AUTORCC ON)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
option(GPT4ALL_TRANSLATIONS OFF "Build with translations") option(GPT4ALL_LOCALHOST "Build installer for localhost repo" OFF)
option(GPT4ALL_LOCALHOST OFF "Build installer for localhost repo")
option(GPT4ALL_OFFLINE_INSTALLER "Build an offline installer" OFF) option(GPT4ALL_OFFLINE_INSTALLER "Build an offline installer" OFF)
option(GPT4ALL_SIGN_INSTALL "Sign installed binaries and installers (requires signing identities)" OFF) option(GPT4ALL_SIGN_INSTALL "Sign installed binaries and installers (requires signing identities)" OFF)
@ -44,11 +42,7 @@ configure_file(
"${CMAKE_CURRENT_BINARY_DIR}/config.h" "${CMAKE_CURRENT_BINARY_DIR}/config.h"
) )
if(LINUX) find_package(Qt6 6.4 COMPONENTS Core HttpServer LinguistTools Pdf Quick QuickDialogs2 Sql Svg REQUIRED)
find_package(Qt6 6.4 COMPONENTS Core Quick WaylandCompositor QuickDialogs2 Svg HttpServer Sql Pdf LinguistTools REQUIRED)
else()
find_package(Qt6 6.4 COMPONENTS Core Quick QuickDialogs2 Svg HttpServer Sql Pdf LinguistTools REQUIRED)
endif()
# Get the Qt6Core target properties # Get the Qt6Core target properties
get_target_property(Qt6Core_INCLUDE_DIRS Qt6::Core INTERFACE_INCLUDE_DIRECTORIES) get_target_property(Qt6Core_INCLUDE_DIRS Qt6::Core INTERFACE_INCLUDE_DIRECTORIES)
@ -66,13 +60,19 @@ message(STATUS "Qt 6 root directory: ${Qt6_ROOT_DIR}")
set (CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) set (CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
set(FMT_INSTALL OFF)
set(BUILD_SHARED_LIBS_SAVED "${BUILD_SHARED_LIBS}")
set(BUILD_SHARED_LIBS OFF)
add_subdirectory(deps/fmt)
set(BUILD_SHARED_LIBS "${BUILD_SHARED_LIBS_SAVED}")
add_subdirectory(../gpt4all-backend llmodel) add_subdirectory(../gpt4all-backend llmodel)
set(CHAT_EXE_RESOURCES) set(CHAT_EXE_RESOURCES)
# Metal shader library # Metal shader library
if (APPLE) if (APPLE)
list(APPEND CHAT_EXE_RESOURCES "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib") list(APPEND CHAT_EXE_RESOURCES "${GGML_METALLIB}")
endif() endif()
# App icon # App icon
@ -107,75 +107,13 @@ if (APPLE)
list(APPEND CHAT_EXE_RESOURCES "${LOCAL_EMBEDDING_MODEL_PATH}") list(APPEND CHAT_EXE_RESOURCES "${LOCAL_EMBEDDING_MODEL_PATH}")
endif() endif()
qt_add_executable(chat set(QAPPLICATION_CLASS QGuiApplication)
main.cpp add_subdirectory(deps/SingleApplication)
chat.h chat.cpp add_subdirectory(src)
chatllm.h chatllm.cpp
chatmodel.h chatlistmodel.h chatlistmodel.cpp
chatapi.h chatapi.cpp
chatviewtextprocessor.h chatviewtextprocessor.cpp
database.h database.cpp
download.h download.cpp
embllm.cpp embllm.h
localdocs.h localdocs.cpp localdocsmodel.h localdocsmodel.cpp
llm.h llm.cpp
modellist.h modellist.cpp
mysettings.h mysettings.cpp
network.h network.cpp
server.h server.cpp
logger.h logger.cpp
${APP_ICON_RESOURCE}
${CHAT_EXE_RESOURCES}
)
qt_add_qml_module(chat target_sources(chat PRIVATE ${APP_ICON_RESOURCE} ${CHAT_EXE_RESOURCES})
URI gpt4all
VERSION 1.0 qt_target_qml_sources(chat
NO_CACHEGEN
QML_FILES
main.qml
qml/AddCollectionView.qml
qml/AddModelView.qml
qml/ApplicationSettings.qml
qml/ChatDrawer.qml
qml/ChatView.qml
qml/CollectionsDrawer.qml
qml/HomeView.qml
qml/LocalDocsSettings.qml
qml/LocalDocsView.qml
qml/ModelSettings.qml
qml/ModelsView.qml
qml/NetworkDialog.qml
qml/NewVersionDialog.qml
qml/PopupDialog.qml
qml/SettingsView.qml
qml/StartupDialog.qml
qml/SwitchModelDialog.qml
qml/Theme.qml
qml/ThumbsDownDialog.qml
qml/Toast.qml
qml/ToastManager.qml
qml/MyBusyIndicator.qml
qml/MyButton.qml
qml/MyCheckBox.qml
qml/MyComboBox.qml
qml/MyDialog.qml
qml/MyDirectoryField.qml
qml/MyFancyLink.qml
qml/MyMenu.qml
qml/MyMenuItem.qml
qml/MyMiniButton.qml
qml/MySettingsButton.qml
qml/MySettingsDestructiveButton.qml
qml/MySettingsLabel.qml
qml/MySettingsStack.qml
qml/MySettingsTab.qml
qml/MySlug.qml
qml/MyTextArea.qml
qml/MyTextButton.qml
qml/MyTextField.qml
qml/MyToolButton.qml
qml/MyWelcomeButton.qml
RESOURCES RESOURCES
icons/antenna_1.svg icons/antenna_1.svg
icons/antenna_2.svg icons/antenna_2.svg
@ -229,10 +167,9 @@ qt_add_qml_module(chat
icons/you.svg icons/you.svg
) )
if (GPT4ALL_TRANSLATIONS)
qt_add_translations(chat qt_add_translations(chat
TS_FILES TS_FILES
${CMAKE_SOURCE_DIR}/translations/gpt4all_en.ts ${CMAKE_SOURCE_DIR}/translations/gpt4all_en_US.ts
${CMAKE_SOURCE_DIR}/translations/gpt4all_es_MX.ts ${CMAKE_SOURCE_DIR}/translations/gpt4all_es_MX.ts
${CMAKE_SOURCE_DIR}/translations/gpt4all_zh_CN.ts ${CMAKE_SOURCE_DIR}/translations/gpt4all_zh_CN.ts
${CMAKE_SOURCE_DIR}/translations/gpt4all_zh_TW.ts ${CMAKE_SOURCE_DIR}/translations/gpt4all_zh_TW.ts
@ -240,7 +177,6 @@ if (GPT4ALL_TRANSLATIONS)
${CMAKE_SOURCE_DIR}/translations/gpt4all_it_IT.ts ${CMAKE_SOURCE_DIR}/translations/gpt4all_it_IT.ts
${CMAKE_SOURCE_DIR}/translations/gpt4all_pt_BR.ts ${CMAKE_SOURCE_DIR}/translations/gpt4all_pt_BR.ts
) )
endif()
set_target_properties(chat PROPERTIES set_target_properties(chat PROPERTIES
WIN32_EXECUTABLE TRUE WIN32_EXECUTABLE TRUE
@ -290,21 +226,18 @@ endif()
target_compile_definitions(chat target_compile_definitions(chat
PRIVATE $<$<OR:$<CONFIG:Debug>,$<CONFIG:RelWithDebInfo>>:QT_QML_DEBUG>) PRIVATE $<$<OR:$<CONFIG:Debug>,$<CONFIG:RelWithDebInfo>>:QT_QML_DEBUG>)
target_include_directories(chat PRIVATE src)
# usearch uses the identifier 'slots' which conflicts with Qt's 'slots' keyword # usearch uses the identifier 'slots' which conflicts with Qt's 'slots' keyword
target_compile_definitions(chat PRIVATE QT_NO_SIGNALS_SLOTS_KEYWORDS) target_compile_definitions(chat PRIVATE QT_NO_SIGNALS_SLOTS_KEYWORDS)
target_include_directories(chat PRIVATE usearch/include target_include_directories(chat PRIVATE deps/usearch/include
usearch/fp16/include) deps/usearch/fp16/include)
if(LINUX)
target_link_libraries(chat target_link_libraries(chat
PRIVATE Qt6::Quick Qt6::Svg Qt6::HttpServer Qt6::Sql Qt6::Pdf Qt6::WaylandCompositor) PRIVATE Qt6::Core Qt6::HttpServer Qt6::Pdf Qt6::Quick Qt6::Sql Qt6::Svg)
else()
target_link_libraries(chat target_link_libraries(chat
PRIVATE Qt6::Quick Qt6::Svg Qt6::HttpServer Qt6::Sql Qt6::Pdf) PRIVATE llmodel SingleApplication fmt::fmt)
endif()
target_link_libraries(chat
PRIVATE llmodel)
# -- install -- # -- install --
@ -388,7 +321,7 @@ if (LLMODEL_CUDA)
endif() endif()
if (NOT APPLE) if (NOT APPLE)
install(FILES "${CMAKE_BINARY_DIR}/resources/${LOCAL_EMBEDDING_MODEL}" install(FILES "${LOCAL_EMBEDDING_MODEL_PATH}"
DESTINATION resources DESTINATION resources
COMPONENT ${COMPONENT_NAME_MAIN}) COMPONENT ${COMPONENT_NAME_MAIN})
endif() endif()
@ -431,7 +364,7 @@ set(CPACK_PACKAGE_INSTALL_DIRECTORY ${COMPONENT_NAME_MAIN})
set(CPACK_PACKAGE_VERSION_MAJOR ${PROJECT_VERSION_MAJOR}) set(CPACK_PACKAGE_VERSION_MAJOR ${PROJECT_VERSION_MAJOR})
set(CPACK_PACKAGE_VERSION_MINOR ${PROJECT_VERSION_MINOR}) set(CPACK_PACKAGE_VERSION_MINOR ${PROJECT_VERSION_MINOR})
SET(CPACK_PACKAGE_VERSION_PATCH ${PROJECT_VERSION_PATCH}) SET(CPACK_PACKAGE_VERSION_PATCH ${PROJECT_VERSION_PATCH})
set(CPACK_PACKAGE_HOMEPAGE_URL "https://gpt4all.io") set(CPACK_PACKAGE_HOMEPAGE_URL "https://www.nomic.ai/gpt4all")
set(CPACK_PACKAGE_ICON "${CMAKE_CURRENT_SOURCE_DIR}/icons/gpt4all-48.png") set(CPACK_PACKAGE_ICON "${CMAKE_CURRENT_SOURCE_DIR}/icons/gpt4all-48.png")
set(CPACK_RESOURCE_FILE_LICENSE ${CMAKE_CURRENT_SOURCE_DIR}/LICENSE) set(CPACK_RESOURCE_FILE_LICENSE ${CMAKE_CURRENT_SOURCE_DIR}/LICENSE)
set(CPACK_RESOURCE_FILE_README ${CMAKE_CURRENT_SOURCE_DIR}/README.md) set(CPACK_RESOURCE_FILE_README ${CMAKE_CURRENT_SOURCE_DIR}/README.md)
@ -440,11 +373,12 @@ set(CPACK_CREATE_DESKTOP_LINKS "GPT4All")
set(CPACK_IFW_PACKAGE_NAME "GPT4All") set(CPACK_IFW_PACKAGE_NAME "GPT4All")
set(CPACK_IFW_PACKAGE_TITLE "GPT4All Installer") set(CPACK_IFW_PACKAGE_TITLE "GPT4All Installer")
set(CPACK_IFW_PACKAGE_PUBLISHER "Nomic, Inc.") set(CPACK_IFW_PACKAGE_PUBLISHER "Nomic, Inc.")
set(CPACK_IFW_PRODUCT_URL "https://gpt4all.io") set(CPACK_IFW_PRODUCT_URL "https://www.nomic.ai/gpt4all")
set(CPACK_IFW_PACKAGE_WIZARD_STYLE "Aero") set(CPACK_IFW_PACKAGE_WIZARD_STYLE "Aero")
set(CPACK_IFW_PACKAGE_LOGO "${CMAKE_CURRENT_SOURCE_DIR}/icons/gpt4all-48.png") set(CPACK_IFW_PACKAGE_LOGO "${CMAKE_CURRENT_SOURCE_DIR}/icons/gpt4all-48.png")
set(CPACK_IFW_PACKAGE_WINDOW_ICON "${CMAKE_CURRENT_SOURCE_DIR}/icons/gpt4all-32.png") set(CPACK_IFW_PACKAGE_WINDOW_ICON "${CMAKE_CURRENT_SOURCE_DIR}/icons/gpt4all-32.png")
set(CPACK_IFW_PACKAGE_WIZARD_SHOW_PAGE_LIST OFF) set(CPACK_IFW_PACKAGE_WIZARD_SHOW_PAGE_LIST OFF)
set(CPACK_IFW_PACKAGE_CONTROL_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/cmake/installer_control.qs")
include(InstallRequiredSystemLibraries) include(InstallRequiredSystemLibraries)
include(CPack) include(CPack)
@ -457,7 +391,7 @@ endif()
cpack_ifw_configure_component(${COMPONENT_NAME_MAIN} ESSENTIAL FORCED_INSTALLATION) cpack_ifw_configure_component(${COMPONENT_NAME_MAIN} ESSENTIAL FORCED_INSTALLATION)
cpack_ifw_configure_component(${COMPONENT_NAME_MAIN} VERSION ${APP_VERSION}) cpack_ifw_configure_component(${COMPONENT_NAME_MAIN} VERSION ${APP_VERSION})
cpack_ifw_configure_component(${COMPONENT_NAME_MAIN} LICENSES "MIT LICENSE" ${CPACK_RESOURCE_FILE_LICENSE}) cpack_ifw_configure_component(${COMPONENT_NAME_MAIN} LICENSES "MIT LICENSE" ${CPACK_RESOURCE_FILE_LICENSE})
cpack_ifw_configure_component(${COMPONENT_NAME_MAIN} SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/cmake/installerscript.qs") cpack_ifw_configure_component(${COMPONENT_NAME_MAIN} SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/cmake/installer_component.qs")
cpack_ifw_configure_component(${COMPONENT_NAME_MAIN} REPLACES "gpt4all-chat") #Was used in very earliest prototypes cpack_ifw_configure_component(${COMPONENT_NAME_MAIN} REPLACES "gpt4all-chat") #Was used in very earliest prototypes
if (GPT4ALL_LOCALHOST) if (GPT4ALL_LOCALHOST)

View File

@ -11,7 +11,7 @@ GPT-J model by following build instructions below.
## Install ## Install
One click installers for macOS, Linux, and Windows at https://gpt4all.io One click installers for macOS, Linux, and Windows at https://www.nomic.ai/gpt4all
## Features ## Features

View File

@ -12,21 +12,21 @@ On Windows and Linux, building GPT4All with full GPU support requires the [Vulka
## Note for Linux users ## Note for Linux users
Linux users may install Qt via their distro's official packages instead of using the Qt installer. You need at least Qt 6.5, with support for QPdf and the Qt HTTP Server. It should be straightforward to build with just cmake and make, but you may continue to follow these instructions to build with Qt Creator. Linux users may install Qt via their distro's official packages instead of using the Qt installer. You need at least Qt 6.5, with support for QPdf and the Qt HTTP Server. You may build from the CLI using CMake and Ninja, or with Qt Creator as described later in this document.
On Arch Linux, this looks like: On Arch Linux, this looks like:
``` ```
sudo pacman -S --needed base-devel qt6-base qt6-declarative qt6-wayland qt6-svg qt6-httpserver qt6-webengine qt6-5compat qt6-shadertools qtcreator cmake ninja sudo pacman -S --needed cmake gcc ninja qt6-5compat qt6-base qt6-declarative qt6-httpserver qt6-svg qtcreator
``` ```
On Ubuntu 23.04, this looks like: On Ubuntu 23.04, this looks like:
``` ```
sudo apt install build-essential qt6-base-dev qt6-declarative-dev qt6-wayland-dev qt6-svg-dev qt6-httpserver-dev qt6-webengine-dev libqt6core5compat6 qml6-module-qt5compat-graphicaleffects libqt6shadertools6 qtcreator cmake ninja-build sudo apt install cmake g++ libgl-dev libqt6core5compat6 ninja-build qml6-module-qt5compat-graphicaleffects qt6-base-dev qt6-declarative-dev qt6-httpserver-dev qt6-svg-dev qtcreator
``` ```
On Fedora 39, this looks like: On Fedora 39, this looks like:
``` ```
sudo dnf install make gcc gcc-c++ qt6-qtbase-devel qt6-qtdeclarative-devel qt6-qtwayland-devel qt6-qtsvg-devel qt6-qthttpserver-devel qt6-qtwebengine-devel qt6-qt5compat qt5-qtgraphicaleffects qt6-qtshadertools qt-creator cmake ninja-build sudo dnf install cmake gcc-c++ ninja-build qt-creator qt5-qtgraphicaleffects qt6-qt5compat qt6-qtbase-devel qt6-qtdeclarative-devel qt6-qthttpserver-devel qt6-qtsvg-devel
``` ```
## Download Qt ## Download Qt
@ -49,10 +49,7 @@ Under this release (e.g. Qt 6.5.0), select the target platform:
- On Windows, it is called "MSVC 2019 64-bit" (for 64-bit x86 CPUs). MinGW has not been tested. - On Windows, it is called "MSVC 2019 64-bit" (for 64-bit x86 CPUs). MinGW has not been tested.
Under this release, select the following additional components: Under this release, select the following additional components:
- Qt Quick 3D
- Qt Wayland Compositor (for Linux only)
- Qt 5 Compatibility Module - Qt 5 Compatibility Module
- Qt Shader Tools
- Additional Libraries: - Additional Libraries:
- Qt HTTP Server - Qt HTTP Server
- Qt PDF - Qt PDF

View File

@ -3,7 +3,7 @@ function(sign_target_windows tgt)
add_custom_command(TARGET ${tgt} add_custom_command(TARGET ${tgt}
POST_BUILD POST_BUILD
COMMAND AzureSignTool.exe sign COMMAND AzureSignTool.exe sign
-du "https://gpt4all.io/index.html" -du "https://www.nomic.ai/gpt4all"
-kvu https://gpt4all.vault.azure.net -kvu https://gpt4all.vault.azure.net
-kvi "$Env{AZSignGUID}" -kvi "$Env{AZSignGUID}"
-kvs "$Env{AZSignPWD}" -kvs "$Env{AZSignPWD}"

View File

@ -6,8 +6,7 @@ Component.prototype.beginInstallation = function() {
targetDirectory = installer.value("TargetDir"); targetDirectory = installer.value("TargetDir");
}; };
Component.prototype.createOperations = function() Component.prototype.createOperations = function() {
{
try { try {
// call the base create operations function // call the base create operations function
component.createOperations(); component.createOperations();
@ -30,7 +29,7 @@ Component.prototype.createOperations = function()
"workingDirectory=" + targetDirectory + "/bin", "workingDirectory=" + targetDirectory + "/bin",
"iconPath=" + targetDirectory + "/gpt4all.ico", "iconPath=" + targetDirectory + "/gpt4all.ico",
"iconId=0", "description=Open GPT4All"); "iconId=0", "description=Open GPT4All");
} else if (systemInfo.productType === "macos" || systemInfo.productType === "osx") { } else if (systemInfo.productType === "macos") {
var gpt4allAppPath = targetDirectory + "/bin/gpt4all.app"; var gpt4allAppPath = targetDirectory + "/bin/gpt4all.app";
var symlinkPath = targetDirectory + "/../GPT4All.app"; var symlinkPath = targetDirectory + "/../GPT4All.app";
// Remove the symlink if it already exists // Remove the symlink if it already exists
@ -56,7 +55,7 @@ Component.prototype.createOperationsForArchive = function(archive)
{ {
component.createOperationsForArchive(archive); component.createOperationsForArchive(archive);
if (systemInfo.productType === "macos" || systemInfo.productType === "osx") { if (systemInfo.productType === "macos") {
var uninstallTargetDirectory = installer.value("TargetDir"); var uninstallTargetDirectory = installer.value("TargetDir");
var symlinkPath = uninstallTargetDirectory + "/../GPT4All.app"; var symlinkPath = uninstallTargetDirectory + "/../GPT4All.app";

View File

@ -0,0 +1,44 @@
var finishedText = null;
function cancelInstaller(message) {
installer.setDefaultPageVisible(QInstaller.Introduction, false);
installer.setDefaultPageVisible(QInstaller.TargetDirectory, false);
installer.setDefaultPageVisible(QInstaller.ComponentSelection, false);
installer.setDefaultPageVisible(QInstaller.ReadyForInstallation, false);
installer.setDefaultPageVisible(QInstaller.StartMenuSelection, false);
installer.setDefaultPageVisible(QInstaller.PerformInstallation, false);
installer.setDefaultPageVisible(QInstaller.LicenseCheck, false);
finishedText = message;
installer.setCanceled();
}
function vercmp(a, b) {
return a.localeCompare(b, undefined, { numeric: true, sensitivity: "base" });
}
function Controller() {
}
Controller.prototype.TargetDirectoryPageCallback = function() {
var failedReq = null;
if (systemInfo.productType === "ubuntu" && vercmp(systemInfo.productVersion, "22.04") < 0) {
failedReq = "Ubuntu 22.04 LTS";
} else if (systemInfo.productType === "macos" && vercmp(systemInfo.productVersion, "12.6") < 0) {
failedReq = "macOS Monterey 12.6";
}
if (failedReq !== null) {
cancelInstaller(
"Installation cannot continue because GPT4All does not support your operating system: " +
`${systemInfo.prettyProductName}<br/><br/>` +
`GPT4All requires ${failedReq} or newer.`
);
}
}
Controller.prototype.FinishedPageCallback = function() {
const widget = gui.currentPageWidget();
if (widget != null && finishedText != null) {
widget.MessageLabel.setText(finishedText);
}
}
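For illustration, a rough Python analogue of the `vercmp()` helper above, assuming purely numeric dotted version components (the real script relies on `localeCompare` with numeric collation, which also tolerates non-numeric parts):

```
def vercmp(a: str, b: str) -> int:
    # Compare dotted versions numerically rather than lexically, so "12.6" < "12.10".
    pa = [int(x) for x in a.split(".")]
    pb = [int(x) for x in b.split(".")]
    return (pa > pb) - (pa < pb)

assert vercmp("12.6", "12.10") < 0   # macOS Monterey check
assert vercmp("22.04", "20.04") > 0  # Ubuntu LTS check
```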

@ -0,0 +1 @@
Subproject commit 21bdef01eddcbd78044eea1d50b9dee08d218ff2

1
gpt4all-chat/deps/fmt Submodule

@ -0,0 +1 @@
Subproject commit 0c9fce2ffefecfdce794e1859584e25877b7b592

View File

@ -32,7 +32,7 @@
<image>https://raw.githubusercontent.com/nomic-ai/gpt4all/main/gpt4all-chat/flatpak-manifest/screenshots/model.png</image> <image>https://raw.githubusercontent.com/nomic-ai/gpt4all/main/gpt4all-chat/flatpak-manifest/screenshots/model.png</image>
</screenshot> </screenshot>
</screenshots> </screenshots>
<url type="homepage">https://gpt4all.io</url> <url type="homepage">https://www.nomic.ai/gpt4all</url>
<url type="bugtracker">https://github.com/nomic-ai/gpt4all/issues</url> <url type="bugtracker">https://github.com/nomic-ai/gpt4all/issues</url>
<url type="vcs-browser">https://github.com/nomic-ai/gpt4all</url> <url type="vcs-browser">https://github.com/nomic-ai/gpt4all</url>
<releases> <releases>

View File

@ -1,6 +1,10 @@
## Latest News ## Latest News
* **New Model Support**: LLaMa 3.1 8b, Gemma, Mixtral, GPT-NeoX, Gemma 2, OpenELM, ChatGLM, Jais architectures, StarCoder2, XVERSE, Command R, and OLMo (all with Vulkan support) Version 3.2.1 has now been released which fixes an issue with poor quality responses on NVIDIA GPUs in 3.2.0. The new 3.2 minor version brings:
* **Suggested Follow Up Questions**: Get follow up questions on your LocalDocs or chats automatically suggested
Roadmap: we're planning support for tools in GPT4All that models like LLaMa 3.1 can use. Share suggestions on Discord! * **Official Language Translations**: Translations for Simplified Chinese, Traditional Chinese, Italian, Portuguese, Romanian, and Spanish.<br/>
Go to Settings > Language and Locale to change the application language.
* **Context Window Improvements**: Significantly faster context recalculation when context runs out
* **Bugfixes**: Models no longer stop generating when they run out of context
Also, Qwen2-1.5B-Instruct, which has good Chinese support, was recently added to the model list.

View File

@ -953,6 +953,53 @@
* Jared Van Bortel (Nomic AI) * Jared Van Bortel (Nomic AI)
* Shiranui (@supersonictw) * Shiranui (@supersonictw)
* Community (beta testers, bug reporters, bindings authors) * Community (beta testers, bug reporters, bindings authors)
"
},
{
"version": "3.2.0",
"notes":
"
<b>&mdash; What's New &mdash;</b>
* Translations for Simplified Chinese, Traditional Chinese, Italian, Portuguese, Romanian, and Spanish
* Significantly faster context recalculation when context runs out
* Models no longer stop generating when they run out of context
* Add Qwen2-1.5B-Instruct to the model list
<b>&mdash; Fixes &mdash;</b>
* Fix a CUDA crash with long conversations since v3.1.0
* Fix \"file(s)\" and \"word(s)\" appearing in UI instead of proper plurals
* Show the correct icons for LocalDocs sources with uppercase extensions
* More reliable reverse prompt detection
* Fix a minor prompting issue introduced in v3.1.0
* Disallow context shift for chat name and follow-up generation
* Fix potential incompatibility with macOS 12 and 13
",
"contributors":
"
* Jared Van Bortel (Nomic AI)
* Adam Treat (Nomic AI)
* Riccardo Giovanetti (`@Harvester62`)
* Victor Emanuel (`@SINAPSA-IC`)
* Jeremy Tayco (`@jstayco`)
* Shiranui (`@supersonictw`)
* Thiago Ramos (`@thiagojramos`)
* ThiloteE (`@ThiloteE`)
* Dominik (`@cosmic-snow`)
* Jack (`@wuodoo`)
* Community (beta testers, bug reporters, bindings authors)
"
},
{
"version": "3.2.1",
"notes":
"
<b>&mdash; Fixes &mdash;</b>
* Fix a potential Vulkan crash on application exit on some Linux systems
* Fix a bad CUDA build option that led to gibberish on newer NVIDIA GPUs
",
"contributors":
"
* Jared Van Bortel (Nomic AI)
" "
} }
] ]

View File

@ -1,468 +0,0 @@
#include "server.h"
#include "chat.h"
#include "modellist.h"
#include "mysettings.h"
#include <QByteArray>
#include <QDateTime>
#include <QDebug>
#include <QHostAddress>
#include <QHttpServer>
#include <QHttpServerResponder>
#include <QJsonArray>
#include <QJsonDocument>
#include <QJsonObject>
#include <QJsonValue>
#include <QPair>
#include <Qt>
#include <QtLogging>
#include <iostream>
#include <string>
#include <type_traits>
#include <utility>
using namespace Qt::Literals::StringLiterals;
//#define DEBUG
static inline QJsonObject modelToJson(const ModelInfo &info)
{
QJsonObject model;
model.insert("id", info.name());
model.insert("object", "model");
model.insert("created", 0);
model.insert("owned_by", "humanity");
model.insert("root", info.name());
model.insert("parent", QJsonValue::Null);
QJsonArray permissions;
QJsonObject permissionObj;
permissionObj.insert("id", "foobarbaz");
permissionObj.insert("object", "model_permission");
permissionObj.insert("created", 0);
permissionObj.insert("allow_create_engine", false);
permissionObj.insert("allow_sampling", false);
permissionObj.insert("allow_logprobs", false);
permissionObj.insert("allow_search_indices", false);
permissionObj.insert("allow_view", true);
permissionObj.insert("allow_fine_tuning", false);
permissionObj.insert("organization", "*");
permissionObj.insert("group", QJsonValue::Null);
permissionObj.insert("is_blocking", false);
permissions.append(permissionObj);
model.insert("permissions", permissions);
return model;
}
static inline QJsonObject resultToJson(const ResultInfo &info)
{
QJsonObject result;
result.insert("file", info.file);
result.insert("title", info.title);
result.insert("author", info.author);
result.insert("date", info.date);
result.insert("text", info.text);
result.insert("page", info.page);
result.insert("from", info.from);
result.insert("to", info.to);
return result;
}
Server::Server(Chat *chat)
: ChatLLM(chat, true /*isServer*/)
, m_chat(chat)
, m_server(nullptr)
{
connect(this, &Server::threadStarted, this, &Server::start);
connect(this, &Server::databaseResultsChanged, this, &Server::handleDatabaseResultsChanged);
connect(chat, &Chat::collectionListChanged, this, &Server::handleCollectionListChanged, Qt::QueuedConnection);
}
Server::~Server()
{
}
void Server::start()
{
m_server = new QHttpServer(this);
if (!m_server->listen(QHostAddress::LocalHost, MySettings::globalInstance()->networkPort())) {
qWarning() << "ERROR: Unable to start the server";
return;
}
m_server->route("/v1/models", QHttpServerRequest::Method::Get,
[](const QHttpServerRequest &request) {
if (!MySettings::globalInstance()->serverChat())
return QHttpServerResponse(QHttpServerResponder::StatusCode::Unauthorized);
const QList<ModelInfo> modelList = ModelList::globalInstance()->selectableModelList();
QJsonObject root;
root.insert("object", "list");
QJsonArray data;
for (const ModelInfo &info : modelList) {
Q_ASSERT(info.installed);
if (!info.installed)
continue;
data.append(modelToJson(info));
}
root.insert("data", data);
return QHttpServerResponse(root);
}
);
m_server->route("/v1/models/<arg>", QHttpServerRequest::Method::Get,
[](const QString &model, const QHttpServerRequest &request) {
if (!MySettings::globalInstance()->serverChat())
return QHttpServerResponse(QHttpServerResponder::StatusCode::Unauthorized);
const QList<ModelInfo> modelList = ModelList::globalInstance()->selectableModelList();
QJsonObject object;
for (const ModelInfo &info : modelList) {
Q_ASSERT(info.installed);
if (!info.installed)
continue;
if (model == info.name()) {
object = modelToJson(info);
break;
}
}
return QHttpServerResponse(object);
}
);
m_server->route("/v1/completions", QHttpServerRequest::Method::Post,
[this](const QHttpServerRequest &request) {
if (!MySettings::globalInstance()->serverChat())
return QHttpServerResponse(QHttpServerResponder::StatusCode::Unauthorized);
return handleCompletionRequest(request, false);
}
);
m_server->route("/v1/chat/completions", QHttpServerRequest::Method::Post,
[this](const QHttpServerRequest &request) {
if (!MySettings::globalInstance()->serverChat())
return QHttpServerResponse(QHttpServerResponder::StatusCode::Unauthorized);
return handleCompletionRequest(request, true);
}
);
// Respond with code 405 to wrong HTTP methods:
m_server->route("/v1/models", QHttpServerRequest::Method::Post,
[](const QHttpServerRequest &request) {
if (!MySettings::globalInstance()->serverChat())
return QHttpServerResponse(QHttpServerResponder::StatusCode::Unauthorized);
return QHttpServerResponse(
QJsonDocument::fromJson("{\"error\": {\"message\": \"Not allowed to POST on /v1/models."
" (HINT: Perhaps you meant to use a different HTTP method?)\","
" \"type\": \"invalid_request_error\", \"param\": null, \"code\": null}}").object(),
QHttpServerResponder::StatusCode::MethodNotAllowed);
}
);
m_server->route("/v1/models/<arg>", QHttpServerRequest::Method::Post,
[](const QString &model, const QHttpServerRequest &request) {
if (!MySettings::globalInstance()->serverChat())
return QHttpServerResponse(QHttpServerResponder::StatusCode::Unauthorized);
return QHttpServerResponse(
QJsonDocument::fromJson("{\"error\": {\"message\": \"Not allowed to POST on /v1/models/*."
" (HINT: Perhaps you meant to use a different HTTP method?)\","
" \"type\": \"invalid_request_error\", \"param\": null, \"code\": null}}").object(),
QHttpServerResponder::StatusCode::MethodNotAllowed);
}
);
m_server->route("/v1/completions", QHttpServerRequest::Method::Get,
[](const QHttpServerRequest &request) {
if (!MySettings::globalInstance()->serverChat())
return QHttpServerResponse(QHttpServerResponder::StatusCode::Unauthorized);
return QHttpServerResponse(
QJsonDocument::fromJson("{\"error\": {\"message\": \"Only POST requests are accepted.\","
" \"type\": \"invalid_request_error\", \"param\": null, \"code\": \"method_not_supported\"}}").object(),
QHttpServerResponder::StatusCode::MethodNotAllowed);
}
);
m_server->route("/v1/chat/completions", QHttpServerRequest::Method::Get,
[](const QHttpServerRequest &request) {
if (!MySettings::globalInstance()->serverChat())
return QHttpServerResponse(QHttpServerResponder::StatusCode::Unauthorized);
return QHttpServerResponse(
QJsonDocument::fromJson("{\"error\": {\"message\": \"Only POST requests are accepted.\","
" \"type\": \"invalid_request_error\", \"param\": null, \"code\": \"method_not_supported\"}}").object(),
QHttpServerResponder::StatusCode::MethodNotAllowed);
}
);
m_server->afterRequest([] (QHttpServerResponse &&resp) {
resp.addHeader("Access-Control-Allow-Origin", "*");
return std::move(resp);
});
connect(this, &Server::requestServerNewPromptResponsePair, m_chat,
&Chat::serverNewPromptResponsePair, Qt::BlockingQueuedConnection);
}
QHttpServerResponse Server::handleCompletionRequest(const QHttpServerRequest &request, bool isChat)
{
// We've been asked to do a completion...
QJsonParseError err;
const QJsonDocument document = QJsonDocument::fromJson(request.body(), &err);
if (err.error || !document.isObject()) {
std::cerr << "ERROR: invalid json in completions body" << std::endl;
return QHttpServerResponse(QHttpServerResponder::StatusCode::NoContent);
}
#if defined(DEBUG)
printf("/v1/completions %s\n", qPrintable(document.toJson(QJsonDocument::Indented)));
fflush(stdout);
#endif
const QJsonObject body = document.object();
if (!body.contains("model")) { // required
std::cerr << "ERROR: completions contains no model" << std::endl;
return QHttpServerResponse(QHttpServerResponder::StatusCode::NoContent);
}
QJsonArray messages;
if (isChat) {
if (!body.contains("messages")) {
std::cerr << "ERROR: chat completions contains no messages" << std::endl;
return QHttpServerResponse(QHttpServerResponder::StatusCode::NoContent);
}
messages = body["messages"].toArray();
}
const QString modelRequested = body["model"].toString();
ModelInfo modelInfo = ModelList::globalInstance()->defaultModelInfo();
const QList<ModelInfo> modelList = ModelList::globalInstance()->selectableModelList();
for (const ModelInfo &info : modelList) {
Q_ASSERT(info.installed);
if (!info.installed)
continue;
if (modelRequested == info.name() || modelRequested == info.filename()) {
modelInfo = info;
break;
}
}
// We only support one prompt for now
QList<QString> prompts;
if (body.contains("prompt")) {
QJsonValue promptValue = body["prompt"];
if (promptValue.isString())
prompts.append(promptValue.toString());
else {
QJsonArray array = promptValue.toArray();
for (const QJsonValue &v : array)
prompts.append(v.toString());
}
} else
prompts.append(" ");
int max_tokens = 16;
if (body.contains("max_tokens"))
max_tokens = body["max_tokens"].toInt();
float temperature = 1.f;
if (body.contains("temperature"))
temperature = body["temperature"].toDouble();
float top_p = 1.f;
if (body.contains("top_p"))
top_p = body["top_p"].toDouble();
float min_p = 0.f;
if (body.contains("min_p"))
min_p = body["min_p"].toDouble();
int n = 1;
if (body.contains("n"))
n = body["n"].toInt();
int logprobs = -1; // supposed to be null by default??
if (body.contains("logprobs"))
logprobs = body["logprobs"].toInt();
bool echo = false;
if (body.contains("echo"))
echo = body["echo"].toBool();
// We currently don't support any of the following...
#if 0
// FIXME: Need configurable reverse prompts
QList<QString> stop;
if (body.contains("stop")) {
QJsonValue stopValue = body["stop"];
if (stopValue.isString())
stop.append(stopValue.toString());
else {
QJsonArray array = stopValue.toArray();
for (QJsonValue v : array)
stop.append(v.toString());
}
}
// FIXME: QHttpServer doesn't support server-sent events
bool stream = false;
if (body.contains("stream"))
stream = body["stream"].toBool();
// FIXME: What does this do?
QString suffix;
if (body.contains("suffix"))
suffix = body["suffix"].toString();
// FIXME: We don't support
float presence_penalty = 0.f;
if (body.contains("presence_penalty"))
top_p = body["presence_penalty"].toDouble();
// FIXME: We don't support
float frequency_penalty = 0.f;
if (body.contains("frequency_penalty"))
top_p = body["frequency_penalty"].toDouble();
// FIXME: We don't support
int best_of = 1;
if (body.contains("best_of"))
logprobs = body["best_of"].toInt();
// FIXME: We don't need
QString user;
if (body.contains("user"))
suffix = body["user"].toString();
#endif
QString actualPrompt = prompts.first();
// if we're a chat completion we have messages which means we need to prepend these to the prompt
if (!messages.isEmpty()) {
QList<QString> chats;
for (int i = 0; i < messages.count(); ++i) {
QJsonValue v = messages.at(i);
QString content = v.toObject()["content"].toString();
if (!content.endsWith("\n") && i < messages.count() - 1)
content += "\n";
chats.append(content);
}
actualPrompt.prepend(chats.join("\n"));
}
// adds prompt/response items to GUI
emit requestServerNewPromptResponsePair(actualPrompt); // blocks
// load the new model if necessary
setShouldBeLoaded(true);
if (modelInfo.filename().isEmpty()) {
std::cerr << "ERROR: couldn't load default model " << modelRequested.toStdString() << std::endl;
return QHttpServerResponse(QHttpServerResponder::StatusCode::BadRequest);
} else if (!loadModel(modelInfo)) {
std::cerr << "ERROR: couldn't load model " << modelInfo.name().toStdString() << std::endl;
return QHttpServerResponse(QHttpServerResponder::StatusCode::InternalServerError);
}
// don't remember any context
resetContext();
const QString promptTemplate = modelInfo.promptTemplate();
const float top_k = modelInfo.topK();
const int n_batch = modelInfo.promptBatchSize();
const float repeat_penalty = modelInfo.repeatPenalty();
const int repeat_last_n = modelInfo.repeatPenaltyTokens();
int promptTokens = 0;
int responseTokens = 0;
QList<QPair<QString, QList<ResultInfo>>> responses;
for (int i = 0; i < n; ++i) {
if (!promptInternal(
m_collections,
actualPrompt,
promptTemplate,
max_tokens /*n_predict*/,
top_k,
top_p,
min_p,
temperature,
n_batch,
repeat_penalty,
repeat_last_n)) {
std::cerr << "ERROR: couldn't prompt model " << modelInfo.name().toStdString() << std::endl;
return QHttpServerResponse(QHttpServerResponder::StatusCode::InternalServerError);
}
QString echoedPrompt = actualPrompt;
if (!echoedPrompt.endsWith("\n"))
echoedPrompt += "\n";
responses.append(qMakePair((echo ? u"%1\n"_s.arg(actualPrompt) : QString()) + response(), m_databaseResults));
if (!promptTokens)
promptTokens += m_promptTokens;
responseTokens += m_promptResponseTokens - m_promptTokens;
if (i != n - 1)
resetResponse();
}
QJsonObject responseObject;
responseObject.insert("id", "foobarbaz");
responseObject.insert("object", "text_completion");
responseObject.insert("created", QDateTime::currentSecsSinceEpoch());
responseObject.insert("model", modelInfo.name());
QJsonArray choices;
if (isChat) {
int index = 0;
for (const auto &r : responses) {
QString result = r.first;
QList<ResultInfo> infos = r.second;
QJsonObject choice;
choice.insert("index", index++);
choice.insert("finish_reason", responseTokens == max_tokens ? "length" : "stop");
QJsonObject message;
message.insert("role", "assistant");
message.insert("content", result);
choice.insert("message", message);
if (MySettings::globalInstance()->localDocsShowReferences()) {
QJsonArray references;
for (const auto &ref : infos)
references.append(resultToJson(ref));
choice.insert("references", references);
}
choices.append(choice);
}
} else {
int index = 0;
for (const auto &r : responses) {
QString result = r.first;
QList<ResultInfo> infos = r.second;
QJsonObject choice;
choice.insert("text", result);
choice.insert("index", index++);
choice.insert("logprobs", QJsonValue::Null); // We don't support
choice.insert("finish_reason", responseTokens == max_tokens ? "length" : "stop");
if (MySettings::globalInstance()->localDocsShowReferences()) {
QJsonArray references;
for (const auto &ref : infos)
references.append(resultToJson(ref));
choice.insert("references", references);
}
choices.append(choice);
}
}
responseObject.insert("choices", choices);
QJsonObject usage;
usage.insert("prompt_tokens", int(promptTokens));
usage.insert("completion_tokens", int(responseTokens));
usage.insert("total_tokens", int(promptTokens + responseTokens));
responseObject.insert("usage", usage);
#if defined(DEBUG)
QJsonDocument newDoc(responseObject);
printf("/v1/completions %s\n", qPrintable(newDoc.toJson(QJsonDocument::Indented)));
fflush(stdout);
#endif
return QHttpServerResponse(responseObject);
}
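The server above (relocated into `src/` by this change) exposes OpenAI-compatible endpoints on localhost. A minimal client sketch using only the Python standard library; the port (4891) and model name are assumptions — use the port configured in Settings and any installed model:

```
import json
from urllib.request import Request, urlopen

BASE = "http://localhost:4891/v1"  # assumed default API server port; adjust to your settings

payload = {
    "model": "Llama 3.1 8B Instruct",  # any installed model's name or filename
    "messages": [{"role": "user", "content": "Say hello in one sentence."}],
    "max_tokens": 64,
}
req = Request(BASE + "/chat/completions",
              data=json.dumps(payload).encode(),
              headers={"Content-Type": "application/json"})
with urlopen(req) as resp:
    print(json.load(resp)["choices"][0]["message"]["content"])
```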

View File

@ -0,0 +1,72 @@
set_source_files_properties("${GGML_METALLIB}" PROPERTIES GENERATED ON)
qt_add_executable(chat
main.cpp
chat.cpp chat.h
chatapi.cpp chatapi.h
chatlistmodel.cpp chatlistmodel.h
chatllm.cpp chatllm.h
chatmodel.h
chatviewtextprocessor.cpp chatviewtextprocessor.h
database.cpp database.h
download.cpp download.h
embllm.cpp embllm.h
llm.cpp llm.h
localdocs.cpp localdocs.h
localdocsmodel.cpp localdocsmodel.h
logger.cpp logger.h
modellist.cpp modellist.h
mysettings.cpp mysettings.h
network.cpp network.h
server.cpp server.h
)
qt_add_qml_module(chat
URI gpt4all
VERSION 1.0
NO_CACHEGEN
QML_FILES
main.qml
qml/AddCollectionView.qml
qml/AddModelView.qml
qml/ApplicationSettings.qml
qml/ChatDrawer.qml
qml/ChatView.qml
qml/CollectionsDrawer.qml
qml/HomeView.qml
qml/LocalDocsSettings.qml
qml/LocalDocsView.qml
qml/ModelSettings.qml
qml/ModelsView.qml
qml/NetworkDialog.qml
qml/NewVersionDialog.qml
qml/PopupDialog.qml
qml/SettingsView.qml
qml/StartupDialog.qml
qml/SwitchModelDialog.qml
qml/Theme.qml
qml/ThumbsDownDialog.qml
qml/Toast.qml
qml/ToastManager.qml
qml/MyBusyIndicator.qml
qml/MyButton.qml
qml/MyCheckBox.qml
qml/MyComboBox.qml
qml/MyDialog.qml
qml/MyDirectoryField.qml
qml/MyFancyLink.qml
qml/MyMenu.qml
qml/MyMenuItem.qml
qml/MyMiniButton.qml
qml/MySettingsButton.qml
qml/MySettingsDestructiveButton.qml
qml/MySettingsLabel.qml
qml/MySettingsStack.qml
qml/MySettingsTab.qml
qml/MySlug.qml
qml/MyTextArea.qml
qml/MyTextButton.qml
qml/MyTextField.qml
qml/MyToolButton.qml
qml/MyWelcomeButton.qml
)

View File

@ -62,7 +62,7 @@ void Chat::connectLLM()
connect(m_llmodel, &ChatLLM::responseStopped, this, &Chat::responseStopped, Qt::QueuedConnection); connect(m_llmodel, &ChatLLM::responseStopped, this, &Chat::responseStopped, Qt::QueuedConnection);
connect(m_llmodel, &ChatLLM::modelLoadingError, this, &Chat::handleModelLoadingError, Qt::QueuedConnection); connect(m_llmodel, &ChatLLM::modelLoadingError, this, &Chat::handleModelLoadingError, Qt::QueuedConnection);
connect(m_llmodel, &ChatLLM::modelLoadingWarning, this, &Chat::modelLoadingWarning, Qt::QueuedConnection); connect(m_llmodel, &ChatLLM::modelLoadingWarning, this, &Chat::modelLoadingWarning, Qt::QueuedConnection);
connect(m_llmodel, &ChatLLM::recalcChanged, this, &Chat::handleRecalculating, Qt::QueuedConnection); connect(m_llmodel, &ChatLLM::restoringFromTextChanged, this, &Chat::handleRestoringFromText, Qt::QueuedConnection);
connect(m_llmodel, &ChatLLM::generatedNameChanged, this, &Chat::generatedNameChanged, Qt::QueuedConnection); connect(m_llmodel, &ChatLLM::generatedNameChanged, this, &Chat::generatedNameChanged, Qt::QueuedConnection);
connect(m_llmodel, &ChatLLM::generatedQuestionFinished, this, &Chat::generatedQuestionFinished, Qt::QueuedConnection); connect(m_llmodel, &ChatLLM::generatedQuestionFinished, this, &Chat::generatedQuestionFinished, Qt::QueuedConnection);
connect(m_llmodel, &ChatLLM::reportSpeed, this, &Chat::handleTokenSpeedChanged, Qt::QueuedConnection); connect(m_llmodel, &ChatLLM::reportSpeed, this, &Chat::handleTokenSpeedChanged, Qt::QueuedConnection);
@ -74,7 +74,6 @@ void Chat::connectLLM()
connect(this, &Chat::promptRequested, m_llmodel, &ChatLLM::prompt, Qt::QueuedConnection); connect(this, &Chat::promptRequested, m_llmodel, &ChatLLM::prompt, Qt::QueuedConnection);
connect(this, &Chat::modelChangeRequested, m_llmodel, &ChatLLM::modelChangeRequested, Qt::QueuedConnection); connect(this, &Chat::modelChangeRequested, m_llmodel, &ChatLLM::modelChangeRequested, Qt::QueuedConnection);
connect(this, &Chat::loadDefaultModelRequested, m_llmodel, &ChatLLM::loadDefaultModel, Qt::QueuedConnection); connect(this, &Chat::loadDefaultModelRequested, m_llmodel, &ChatLLM::loadDefaultModel, Qt::QueuedConnection);
connect(this, &Chat::loadModelRequested, m_llmodel, &ChatLLM::loadModel, Qt::QueuedConnection);
connect(this, &Chat::generateNameRequested, m_llmodel, &ChatLLM::generateName, Qt::QueuedConnection); connect(this, &Chat::generateNameRequested, m_llmodel, &ChatLLM::generateName, Qt::QueuedConnection);
connect(this, &Chat::regenerateResponseRequested, m_llmodel, &ChatLLM::regenerateResponse, Qt::QueuedConnection); connect(this, &Chat::regenerateResponseRequested, m_llmodel, &ChatLLM::regenerateResponse, Qt::QueuedConnection);
connect(this, &Chat::resetResponseRequested, m_llmodel, &ChatLLM::resetResponse, Qt::QueuedConnection); connect(this, &Chat::resetResponseRequested, m_llmodel, &ChatLLM::resetResponse, Qt::QueuedConnection);
@ -240,21 +239,22 @@ void Chat::newPromptResponsePair(const QString &prompt)
resetResponseState(); resetResponseState();
m_chatModel->updateCurrentResponse(m_chatModel->count() - 1, false); m_chatModel->updateCurrentResponse(m_chatModel->count() - 1, false);
m_chatModel->appendPrompt("Prompt: ", prompt); m_chatModel->appendPrompt("Prompt: ", prompt);
m_chatModel->appendResponse("Response: ", prompt); m_chatModel->appendResponse("Response: ", QString());
emit resetResponseRequested(); emit resetResponseRequested();
} }
// the server needs to block until response is reset, so it calls resetResponse on its own m_llmThread
void Chat::serverNewPromptResponsePair(const QString &prompt) void Chat::serverNewPromptResponsePair(const QString &prompt)
{ {
resetResponseState(); resetResponseState();
m_chatModel->updateCurrentResponse(m_chatModel->count() - 1, false); m_chatModel->updateCurrentResponse(m_chatModel->count() - 1, false);
m_chatModel->appendPrompt("Prompt: ", prompt); m_chatModel->appendPrompt("Prompt: ", prompt);
m_chatModel->appendResponse("Response: ", prompt); m_chatModel->appendResponse("Response: ", QString());
} }
bool Chat::isRecalc() const bool Chat::restoringFromText() const
{ {
return m_llmodel->isRecalc(); return m_llmodel->restoringFromText();
} }
void Chat::unloadAndDeleteLater() void Chat::unloadAndDeleteLater()
@ -320,10 +320,10 @@ void Chat::generatedQuestionFinished(const QString &question)
emit generatedQuestionsChanged(); emit generatedQuestionsChanged();
} }
void Chat::handleRecalculating() void Chat::handleRestoringFromText()
{ {
Network::globalInstance()->trackChatEvent("recalc_context", { {"length", m_chatModel->count()} }); Network::globalInstance()->trackChatEvent("recalc_context", { {"length", m_chatModel->count()} });
emit recalcChanged(); emit restoringFromTextChanged();
} }
void Chat::handleModelLoadingError(const QString &error) void Chat::handleModelLoadingError(const QString &error)
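
For illustration only (not from this commit): the isRecalc/recalcChanged pair is renamed to restoringFromText/restoringFromTextChanged above. Observing the renamed property from C++ would look roughly like this; the watch helper is hypothetical and assumes a live Chat instance.

    #include "chat.h"

    #include <QDebug>
    #include <QObject>

    // Hypothetical helper: log whenever the chat starts or stops restoring its
    // state from text, using the renamed signal and getter shown above.
    static void watchRestoringFromText(Chat *chat)
    {
        QObject::connect(chat, &Chat::restoringFromTextChanged, chat, [chat] {
            qDebug() << "restoringFromText:" << chat->restoringFromText();
        });
    }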

View File

@ -27,7 +27,7 @@ class Chat : public QObject
Q_PROPERTY(QString response READ response NOTIFY responseChanged) Q_PROPERTY(QString response READ response NOTIFY responseChanged)
Q_PROPERTY(ModelInfo modelInfo READ modelInfo WRITE setModelInfo NOTIFY modelInfoChanged) Q_PROPERTY(ModelInfo modelInfo READ modelInfo WRITE setModelInfo NOTIFY modelInfoChanged)
Q_PROPERTY(bool responseInProgress READ responseInProgress NOTIFY responseInProgressChanged) Q_PROPERTY(bool responseInProgress READ responseInProgress NOTIFY responseInProgressChanged)
Q_PROPERTY(bool isRecalc READ isRecalc NOTIFY recalcChanged) Q_PROPERTY(bool restoringFromText READ restoringFromText NOTIFY restoringFromTextChanged)
Q_PROPERTY(bool isServer READ isServer NOTIFY isServerChanged) Q_PROPERTY(bool isServer READ isServer NOTIFY isServerChanged)
Q_PROPERTY(ResponseState responseState READ responseState NOTIFY responseStateChanged) Q_PROPERTY(ResponseState responseState READ responseState NOTIFY responseStateChanged)
Q_PROPERTY(QList<QString> collectionList READ collectionList NOTIFY collectionListChanged) Q_PROPERTY(QList<QString> collectionList READ collectionList NOTIFY collectionListChanged)
@ -88,7 +88,7 @@ public:
ResponseState responseState() const; ResponseState responseState() const;
ModelInfo modelInfo() const; ModelInfo modelInfo() const;
void setModelInfo(const ModelInfo &modelInfo); void setModelInfo(const ModelInfo &modelInfo);
bool isRecalc() const; bool restoringFromText() const;
Q_INVOKABLE void unloadModel(); Q_INVOKABLE void unloadModel();
Q_INVOKABLE void reloadModel(); Q_INVOKABLE void reloadModel();
@ -144,9 +144,8 @@ Q_SIGNALS:
void processSystemPromptRequested(); void processSystemPromptRequested();
void modelChangeRequested(const ModelInfo &modelInfo); void modelChangeRequested(const ModelInfo &modelInfo);
void modelInfoChanged(); void modelInfoChanged();
void recalcChanged(); void restoringFromTextChanged();
void loadDefaultModelRequested(); void loadDefaultModelRequested();
void loadModelRequested(const ModelInfo &modelInfo);
void generateNameRequested(); void generateNameRequested();
void modelLoadingErrorChanged(); void modelLoadingErrorChanged();
void isServerChanged(); void isServerChanged();
@ -167,7 +166,7 @@ private Q_SLOTS:
void responseStopped(qint64 promptResponseMs); void responseStopped(qint64 promptResponseMs);
void generatedNameChanged(const QString &name); void generatedNameChanged(const QString &name);
void generatedQuestionFinished(const QString &question); void generatedQuestionFinished(const QString &question);
void handleRecalculating(); void handleRestoringFromText();
void handleModelLoadingError(const QString &error); void handleModelLoadingError(const QString &error);
void handleTokenSpeedChanged(const QString &tokenSpeed); void handleTokenSpeedChanged(const QString &tokenSpeed);
void handleDatabaseResultsChanged(const QList<ResultInfo> &results); void handleDatabaseResultsChanged(const QList<ResultInfo> &results);

View File

@ -1,6 +1,6 @@
#include "chatapi.h" #include "chatapi.h"
#include "../gpt4all-backend/llmodel.h" #include <gpt4all-backend/llmodel.h>
#include <QCoreApplication> #include <QCoreApplication>
#include <QGuiApplication> #include <QGuiApplication>
@ -90,13 +90,13 @@ void ChatAPI::prompt(const std::string &prompt,
const std::string &promptTemplate, const std::string &promptTemplate,
std::function<bool(int32_t)> promptCallback, std::function<bool(int32_t)> promptCallback,
std::function<bool(int32_t, const std::string&)> responseCallback, std::function<bool(int32_t, const std::string&)> responseCallback,
std::function<bool(bool)> recalculateCallback, bool allowContextShift,
PromptContext &promptCtx, PromptContext &promptCtx,
bool special, bool special,
std::string *fakeReply) { std::optional<std::string_view> fakeReply) {
Q_UNUSED(promptCallback); Q_UNUSED(promptCallback);
Q_UNUSED(recalculateCallback); Q_UNUSED(allowContextShift);
Q_UNUSED(special); Q_UNUSED(special);
if (!isModelLoaded()) { if (!isModelLoaded()) {
@ -121,7 +121,7 @@ void ChatAPI::prompt(const std::string &prompt,
if (fakeReply) { if (fakeReply) {
promptCtx.n_past += 1; promptCtx.n_past += 1;
m_context.append(formattedPrompt); m_context.append(formattedPrompt);
m_context.append(QString::fromStdString(*fakeReply)); m_context.append(QString::fromUtf8(fakeReply->data(), fakeReply->size()));
return; return;
} }
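
A small sketch, not part of this commit: fakeReply changes from std::string* to std::optional<std::string_view>, and ChatAPI converts it with QString::fromUtf8 as shown above. A standalone equivalent of that conversion, assuming the view stays valid for the duration of the call, might look like:

    #include <QString>

    #include <optional>
    #include <string_view>

    // Converts the optional UTF-8 view into a QString; an empty optional yields a
    // null QString.
    static QString fakeReplyToQString(std::optional<std::string_view> fakeReply)
    {
        if (!fakeReply)
            return QString();
        return QString::fromUtf8(fakeReply->data(), qsizetype(fakeReply->size()));
    }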

View File

@ -1,7 +1,7 @@
#ifndef CHATAPI_H #ifndef CHATAPI_H
#define CHATAPI_H #define CHATAPI_H
#include "../gpt4all-backend/llmodel.h" #include <gpt4all-backend/llmodel.h>
#include <QByteArray> #include <QByteArray>
#include <QNetworkReply> #include <QNetworkReply>
@ -12,9 +12,10 @@
#include <cstddef> #include <cstddef>
#include <cstdint> #include <cstdint>
#include <stdexcept>
#include <functional> #include <functional>
#include <stdexcept>
#include <string> #include <string>
#include <string_view>
#include <vector> #include <vector>
class QNetworkAccessManager; class QNetworkAccessManager;
@ -69,10 +70,10 @@ public:
const std::string &promptTemplate, const std::string &promptTemplate,
std::function<bool(int32_t)> promptCallback, std::function<bool(int32_t)> promptCallback,
std::function<bool(int32_t, const std::string&)> responseCallback, std::function<bool(int32_t, const std::string&)> responseCallback,
std::function<bool(bool)> recalculateCallback, bool allowContextShift,
PromptContext &ctx, PromptContext &ctx,
bool special, bool special,
std::string *fakeReply) override; std::optional<std::string_view> fakeReply) override;
void setThreadCount(int32_t n_threads) override; void setThreadCount(int32_t n_threads) override;
int32_t threadCount() const override; int32_t threadCount() const override;
@ -97,38 +98,57 @@ protected:
// them as they are only called from the default implementation of 'prompt' which we override and // them as they are only called from the default implementation of 'prompt' which we override and
// completely replace // completely replace
std::vector<Token> tokenize(PromptContext &ctx, const std::string &str, bool special) override { std::vector<Token> tokenize(PromptContext &ctx, std::string_view str, bool special) override
{
(void)ctx; (void)ctx;
(void)str; (void)str;
(void)special; (void)special;
throw std::logic_error("not implemented"); throw std::logic_error("not implemented");
} }
std::string tokenToString(Token id) const override { bool isSpecialToken(Token id) const override
{
(void)id; (void)id;
throw std::logic_error("not implemented"); throw std::logic_error("not implemented");
} }
Token sampleToken(PromptContext &ctx) const override { std::string tokenToString(Token id) const override
{
(void)id;
throw std::logic_error("not implemented");
}
Token sampleToken(PromptContext &ctx) const override
{
(void)ctx; (void)ctx;
throw std::logic_error("not implemented"); throw std::logic_error("not implemented");
} }
bool evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const override { bool evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const override
{
(void)ctx; (void)ctx;
(void)tokens; (void)tokens;
throw std::logic_error("not implemented"); throw std::logic_error("not implemented");
} }
int32_t contextLength() const override { void shiftContext(PromptContext &promptCtx) override
{
(void)promptCtx;
throw std::logic_error("not implemented"); throw std::logic_error("not implemented");
} }
const std::vector<Token> &endTokens() const override { int32_t contextLength() const override
{
throw std::logic_error("not implemented"); throw std::logic_error("not implemented");
} }
bool shouldAddBOS() const override { const std::vector<Token> &endTokens() const override
{
throw std::logic_error("not implemented");
}
bool shouldAddBOS() const override
{
throw std::logic_error("not implemented"); throw std::logic_error("not implemented");
} }

View File

@ -102,7 +102,7 @@ ChatLLM::ChatLLM(Chat *parent, bool isServer)
: QObject{nullptr} : QObject{nullptr}
, m_promptResponseTokens(0) , m_promptResponseTokens(0)
, m_promptTokens(0) , m_promptTokens(0)
, m_isRecalc(false) , m_restoringFromText(false)
, m_shouldBeLoaded(false) , m_shouldBeLoaded(false)
, m_forceUnloadModel(false) , m_forceUnloadModel(false)
, m_markedForDeletion(false) , m_markedForDeletion(false)
@ -249,9 +249,11 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
// and what the type and name of that model is. I've tried to comment extensively in this method // and what the type and name of that model is. I've tried to comment extensively in this method
// to provide an overview of what we're doing here. // to provide an overview of what we're doing here.
// We're already loaded with this model if (isModelLoaded() && this->modelInfo() == modelInfo) {
if (isModelLoaded() && this->modelInfo() == modelInfo) // already acquired -> keep it and reset
return true; resetContext();
return true; // already loaded
}
// reset status // reset status
emit modelLoadingPercentageChanged(std::numeric_limits<float>::min()); // small non-zero positive value emit modelLoadingPercentageChanged(std::numeric_limits<float>::min()); // small non-zero positive value
@ -624,16 +626,16 @@ void ChatLLM::regenerateResponse()
m_ctx.tokens.erase(m_ctx.tokens.end() - m_promptResponseTokens, m_ctx.tokens.end()); m_ctx.tokens.erase(m_ctx.tokens.end() - m_promptResponseTokens, m_ctx.tokens.end());
m_promptResponseTokens = 0; m_promptResponseTokens = 0;
m_promptTokens = 0; m_promptTokens = 0;
m_response = std::string(); m_response = m_trimmedResponse = std::string();
emit responseChanged(QString::fromStdString(m_response)); emit responseChanged(QString::fromStdString(m_trimmedResponse));
} }
void ChatLLM::resetResponse() void ChatLLM::resetResponse()
{ {
m_promptTokens = 0; m_promptTokens = 0;
m_promptResponseTokens = 0; m_promptResponseTokens = 0;
m_response = std::string(); m_response = m_trimmedResponse = std::string();
emit responseChanged(QString::fromStdString(m_response)); emit responseChanged(QString::fromStdString(m_trimmedResponse));
} }
void ChatLLM::resetContext() void ChatLLM::resetContext()
@ -643,9 +645,12 @@ void ChatLLM::resetContext()
m_ctx = LLModel::PromptContext(); m_ctx = LLModel::PromptContext();
} }
QString ChatLLM::response() const QString ChatLLM::response(bool trim) const
{ {
return QString::fromStdString(remove_leading_whitespace(m_response)); std::string resp = m_response;
if (trim)
resp = remove_leading_whitespace(resp);
return QString::fromStdString(resp);
} }
ModelInfo ChatLLM::modelInfo() const ModelInfo ChatLLM::modelInfo() const
@ -659,21 +664,26 @@ void ChatLLM::setModelInfo(const ModelInfo &modelInfo)
emit modelInfoChanged(modelInfo); emit modelInfoChanged(modelInfo);
} }
void ChatLLM::acquireModel() { void ChatLLM::acquireModel()
{
m_llModelInfo = LLModelStore::globalInstance()->acquireModel(); m_llModelInfo = LLModelStore::globalInstance()->acquireModel();
emit loadedModelInfoChanged(); emit loadedModelInfoChanged();
} }
void ChatLLM::resetModel() { void ChatLLM::resetModel()
{
m_llModelInfo = {}; m_llModelInfo = {};
emit loadedModelInfoChanged(); emit loadedModelInfoChanged();
} }
void ChatLLM::modelChangeRequested(const ModelInfo &modelInfo) void ChatLLM::modelChangeRequested(const ModelInfo &modelInfo)
{ {
// ignore attempts to switch to the same model twice
if (!isModelLoaded() || this->modelInfo() != modelInfo) {
m_shouldBeLoaded = true; m_shouldBeLoaded = true;
loadModel(modelInfo); loadModel(modelInfo);
} }
}
bool ChatLLM::handlePrompt(int32_t token) bool ChatLLM::handlePrompt(int32_t token)
{ {
@ -698,7 +708,8 @@ bool ChatLLM::handleResponse(int32_t token, const std::string &response)
// check for error // check for error
if (token < 0) { if (token < 0) {
m_response.append(response); m_response.append(response);
emit responseChanged(QString::fromStdString(remove_leading_whitespace(m_response))); m_trimmedResponse = remove_leading_whitespace(m_response);
emit responseChanged(QString::fromStdString(m_trimmedResponse));
return false; return false;
} }
@ -708,21 +719,11 @@ bool ChatLLM::handleResponse(int32_t token, const std::string &response)
m_timer->inc(); m_timer->inc();
Q_ASSERT(!response.empty()); Q_ASSERT(!response.empty());
m_response.append(response); m_response.append(response);
emit responseChanged(QString::fromStdString(remove_leading_whitespace(m_response))); m_trimmedResponse = remove_leading_whitespace(m_response);
emit responseChanged(QString::fromStdString(m_trimmedResponse));
return !m_stopGenerating; return !m_stopGenerating;
} }
bool ChatLLM::handleRecalculate(bool isRecalc)
{
#if defined(DEBUG)
qDebug() << "recalculate" << m_llmThread.objectName() << isRecalc;
#endif
if (m_isRecalc != isRecalc) {
m_isRecalc = isRecalc;
emit recalcChanged();
}
return !m_stopGenerating;
}
bool ChatLLM::prompt(const QList<QString> &collectionList, const QString &prompt) bool ChatLLM::prompt(const QList<QString> &collectionList, const QString &prompt)
{ {
if (m_restoreStateFromText) { if (m_restoreStateFromText) {
@ -730,8 +731,6 @@ bool ChatLLM::prompt(const QList<QString> &collectionList, const QString &prompt
processRestoreStateFromText(); processRestoreStateFromText();
} }
if (!m_processedSystemPrompt)
processSystemPrompt();
const QString promptTemplate = MySettings::globalInstance()->modelPromptTemplate(m_modelInfo); const QString promptTemplate = MySettings::globalInstance()->modelPromptTemplate(m_modelInfo);
const int32_t n_predict = MySettings::globalInstance()->modelMaxLength(m_modelInfo); const int32_t n_predict = MySettings::globalInstance()->modelMaxLength(m_modelInfo);
const int32_t top_k = MySettings::globalInstance()->modelTopK(m_modelInfo); const int32_t top_k = MySettings::globalInstance()->modelTopK(m_modelInfo);
@ -747,14 +746,17 @@ bool ChatLLM::prompt(const QList<QString> &collectionList, const QString &prompt
bool ChatLLM::promptInternal(const QList<QString> &collectionList, const QString &prompt, const QString &promptTemplate, bool ChatLLM::promptInternal(const QList<QString> &collectionList, const QString &prompt, const QString &promptTemplate,
int32_t n_predict, int32_t top_k, float top_p, float min_p, float temp, int32_t n_batch, float repeat_penalty, int32_t n_predict, int32_t top_k, float top_p, float min_p, float temp, int32_t n_batch, float repeat_penalty,
int32_t repeat_penalty_tokens) int32_t repeat_penalty_tokens, std::optional<QString> fakeReply)
{ {
if (!isModelLoaded()) if (!isModelLoaded())
return false; return false;
if (!m_processedSystemPrompt)
processSystemPrompt();
QList<ResultInfo> databaseResults; QList<ResultInfo> databaseResults;
const int retrievalSize = MySettings::globalInstance()->localDocsRetrievalSize(); const int retrievalSize = MySettings::globalInstance()->localDocsRetrievalSize();
if (!collectionList.isEmpty()) { if (!fakeReply && !collectionList.isEmpty()) {
emit requestRetrieveFromDB(collectionList, prompt, retrievalSize, &databaseResults); // blocks emit requestRetrieveFromDB(collectionList, prompt, retrievalSize, &databaseResults); // blocks
emit databaseResultsChanged(databaseResults); emit databaseResultsChanged(databaseResults);
} }
@ -776,7 +778,6 @@ bool ChatLLM::promptInternal(const QList<QString> &collectionList, const QString
auto promptFunc = std::bind(&ChatLLM::handlePrompt, this, std::placeholders::_1); auto promptFunc = std::bind(&ChatLLM::handlePrompt, this, std::placeholders::_1);
auto responseFunc = std::bind(&ChatLLM::handleResponse, this, std::placeholders::_1, auto responseFunc = std::bind(&ChatLLM::handleResponse, this, std::placeholders::_1,
std::placeholders::_2); std::placeholders::_2);
auto recalcFunc = std::bind(&ChatLLM::handleRecalculate, this, std::placeholders::_1);
emit promptProcessing(); emit promptProcessing();
m_ctx.n_predict = n_predict; m_ctx.n_predict = n_predict;
m_ctx.top_k = top_k; m_ctx.top_k = top_k;
@ -796,10 +797,13 @@ bool ChatLLM::promptInternal(const QList<QString> &collectionList, const QString
m_timer->start(); m_timer->start();
if (!docsContext.isEmpty()) { if (!docsContext.isEmpty()) {
auto old_n_predict = std::exchange(m_ctx.n_predict, 0); // decode localdocs context without a response auto old_n_predict = std::exchange(m_ctx.n_predict, 0); // decode localdocs context without a response
m_llModelInfo.model->prompt(docsContext.toStdString(), "%1", promptFunc, responseFunc, recalcFunc, m_ctx); m_llModelInfo.model->prompt(docsContext.toStdString(), "%1", promptFunc, responseFunc,
/*allowContextShift*/ true, m_ctx);
m_ctx.n_predict = old_n_predict; // now we are ready for a response m_ctx.n_predict = old_n_predict; // now we are ready for a response
} }
m_llModelInfo.model->prompt(prompt.toStdString(), promptTemplate.toStdString(), promptFunc, responseFunc, recalcFunc, m_ctx); m_llModelInfo.model->prompt(prompt.toStdString(), promptTemplate.toStdString(), promptFunc, responseFunc,
/*allowContextShift*/ true, m_ctx, false,
fakeReply.transform(std::mem_fn(&QString::toStdString)));
#if defined(DEBUG) #if defined(DEBUG)
printf("\n"); printf("\n");
fflush(stdout); fflush(stdout);
@ -807,9 +811,9 @@ bool ChatLLM::promptInternal(const QList<QString> &collectionList, const QString
m_timer->stop(); m_timer->stop();
qint64 elapsed = totalTime.elapsed(); qint64 elapsed = totalTime.elapsed();
std::string trimmed = trim_whitespace(m_response); std::string trimmed = trim_whitespace(m_response);
if (trimmed != m_response) { if (trimmed != m_trimmedResponse) {
m_response = trimmed; m_trimmedResponse = trimmed;
emit responseChanged(QString::fromStdString(m_response)); emit responseChanged(QString::fromStdString(m_trimmedResponse));
} }
SuggestionMode mode = MySettings::globalInstance()->suggestionMode(); SuggestionMode mode = MySettings::globalInstance()->suggestionMode();
@ -904,10 +908,9 @@ void ChatLLM::generateName()
auto promptTemplate = MySettings::globalInstance()->modelPromptTemplate(m_modelInfo); auto promptTemplate = MySettings::globalInstance()->modelPromptTemplate(m_modelInfo);
auto promptFunc = std::bind(&ChatLLM::handleNamePrompt, this, std::placeholders::_1); auto promptFunc = std::bind(&ChatLLM::handleNamePrompt, this, std::placeholders::_1);
auto responseFunc = std::bind(&ChatLLM::handleNameResponse, this, std::placeholders::_1, std::placeholders::_2); auto responseFunc = std::bind(&ChatLLM::handleNameResponse, this, std::placeholders::_1, std::placeholders::_2);
auto recalcFunc = std::bind(&ChatLLM::handleNameRecalculate, this, std::placeholders::_1);
LLModel::PromptContext ctx = m_ctx; LLModel::PromptContext ctx = m_ctx;
m_llModelInfo.model->prompt(chatNamePrompt.toStdString(), promptTemplate.toStdString(), m_llModelInfo.model->prompt(chatNamePrompt.toStdString(), promptTemplate.toStdString(),
promptFunc, responseFunc, recalcFunc, ctx); promptFunc, responseFunc, /*allowContextShift*/ false, ctx);
std::string trimmed = trim_whitespace(m_nameResponse); std::string trimmed = trim_whitespace(m_nameResponse);
if (trimmed != m_nameResponse) { if (trimmed != m_nameResponse) {
m_nameResponse = trimmed; m_nameResponse = trimmed;
@ -944,15 +947,6 @@ bool ChatLLM::handleNameResponse(int32_t token, const std::string &response)
return words.size() <= 3; return words.size() <= 3;
} }
bool ChatLLM::handleNameRecalculate(bool isRecalc)
{
#if defined(DEBUG)
qDebug() << "name recalc" << m_llmThread.objectName() << isRecalc;
#endif
Q_UNUSED(isRecalc);
return true;
}
bool ChatLLM::handleQuestionPrompt(int32_t token) bool ChatLLM::handleQuestionPrompt(int32_t token)
{ {
#if defined(DEBUG) #if defined(DEBUG)
@ -991,15 +985,6 @@ bool ChatLLM::handleQuestionResponse(int32_t token, const std::string &response)
return true; return true;
} }
bool ChatLLM::handleQuestionRecalculate(bool isRecalc)
{
#if defined(DEBUG)
qDebug() << "name recalc" << m_llmThread.objectName() << isRecalc;
#endif
Q_UNUSED(isRecalc);
return true;
}
void ChatLLM::generateQuestions(qint64 elapsed) void ChatLLM::generateQuestions(qint64 elapsed)
{ {
Q_ASSERT(isModelLoaded()); Q_ASSERT(isModelLoaded());
@ -1019,12 +1004,11 @@ void ChatLLM::generateQuestions(qint64 elapsed)
auto promptTemplate = MySettings::globalInstance()->modelPromptTemplate(m_modelInfo); auto promptTemplate = MySettings::globalInstance()->modelPromptTemplate(m_modelInfo);
auto promptFunc = std::bind(&ChatLLM::handleQuestionPrompt, this, std::placeholders::_1); auto promptFunc = std::bind(&ChatLLM::handleQuestionPrompt, this, std::placeholders::_1);
auto responseFunc = std::bind(&ChatLLM::handleQuestionResponse, this, std::placeholders::_1, std::placeholders::_2); auto responseFunc = std::bind(&ChatLLM::handleQuestionResponse, this, std::placeholders::_1, std::placeholders::_2);
auto recalcFunc = std::bind(&ChatLLM::handleQuestionRecalculate, this, std::placeholders::_1);
LLModel::PromptContext ctx = m_ctx; LLModel::PromptContext ctx = m_ctx;
QElapsedTimer totalTime; QElapsedTimer totalTime;
totalTime.start(); totalTime.start();
m_llModelInfo.model->prompt(suggestedFollowUpPrompt, m_llModelInfo.model->prompt(suggestedFollowUpPrompt, promptTemplate.toStdString(), promptFunc, responseFunc,
promptTemplate.toStdString(), promptFunc, responseFunc, recalcFunc, ctx); /*allowContextShift*/ false, ctx);
elapsed += totalTime.elapsed(); elapsed += totalTime.elapsed();
emit responseStopped(elapsed); emit responseStopped(elapsed);
} }
@ -1039,15 +1023,6 @@ bool ChatLLM::handleSystemPrompt(int32_t token)
return !m_stopGenerating; return !m_stopGenerating;
} }
bool ChatLLM::handleSystemRecalculate(bool isRecalc)
{
#if defined(DEBUG)
qDebug() << "system recalc" << m_llmThread.objectName() << isRecalc;
#endif
Q_UNUSED(isRecalc);
return false;
}
bool ChatLLM::handleRestoreStateFromTextPrompt(int32_t token) bool ChatLLM::handleRestoreStateFromTextPrompt(int32_t token)
{ {
#if defined(DEBUG) #if defined(DEBUG)
@ -1057,15 +1032,6 @@ bool ChatLLM::handleRestoreStateFromTextPrompt(int32_t token)
return !m_stopGenerating; return !m_stopGenerating;
} }
bool ChatLLM::handleRestoreStateFromTextRecalculate(bool isRecalc)
{
#if defined(DEBUG)
qDebug() << "restore state from text recalc" << m_llmThread.objectName() << isRecalc;
#endif
Q_UNUSED(isRecalc);
return false;
}
// this function serialized the cached model state to disk. // this function serialized the cached model state to disk.
// we want to also serialize n_ctx, and read it at load time. // we want to also serialize n_ctx, and read it at load time.
bool ChatLLM::serialize(QDataStream &stream, int version, bool serializeKV) bool ChatLLM::serialize(QDataStream &stream, int version, bool serializeKV)
@ -1118,6 +1084,7 @@ bool ChatLLM::deserialize(QDataStream &stream, int version, bool deserializeKV,
QString response; QString response;
stream >> response; stream >> response;
m_response = response.toStdString(); m_response = response.toStdString();
m_trimmedResponse = trim_whitespace(m_response);
QString nameResponse; QString nameResponse;
stream >> nameResponse; stream >> nameResponse;
m_nameResponse = nameResponse.toStdString(); m_nameResponse = nameResponse.toStdString();
@ -1254,7 +1221,7 @@ void ChatLLM::restoreState()
void ChatLLM::processSystemPrompt() void ChatLLM::processSystemPrompt()
{ {
Q_ASSERT(isModelLoaded()); Q_ASSERT(isModelLoaded());
if (!isModelLoaded() || m_processedSystemPrompt || m_restoreStateFromText || m_isServer) if (!isModelLoaded() || m_processedSystemPrompt || m_restoreStateFromText)
return; return;
const std::string systemPrompt = MySettings::globalInstance()->modelSystemPrompt(m_modelInfo).toStdString(); const std::string systemPrompt = MySettings::globalInstance()->modelSystemPrompt(m_modelInfo).toStdString();
@ -1268,7 +1235,6 @@ void ChatLLM::processSystemPrompt()
m_ctx = LLModel::PromptContext(); m_ctx = LLModel::PromptContext();
auto promptFunc = std::bind(&ChatLLM::handleSystemPrompt, this, std::placeholders::_1); auto promptFunc = std::bind(&ChatLLM::handleSystemPrompt, this, std::placeholders::_1);
auto recalcFunc = std::bind(&ChatLLM::handleSystemRecalculate, this, std::placeholders::_1);
const int32_t n_predict = MySettings::globalInstance()->modelMaxLength(m_modelInfo); const int32_t n_predict = MySettings::globalInstance()->modelMaxLength(m_modelInfo);
const int32_t top_k = MySettings::globalInstance()->modelTopK(m_modelInfo); const int32_t top_k = MySettings::globalInstance()->modelTopK(m_modelInfo);
@ -1294,7 +1260,7 @@ void ChatLLM::processSystemPrompt()
#endif #endif
auto old_n_predict = std::exchange(m_ctx.n_predict, 0); // decode system prompt without a response auto old_n_predict = std::exchange(m_ctx.n_predict, 0); // decode system prompt without a response
// use "%1%2" and not "%1" to avoid implicit whitespace // use "%1%2" and not "%1" to avoid implicit whitespace
m_llModelInfo.model->prompt(systemPrompt, "%1%2", promptFunc, nullptr, recalcFunc, m_ctx, true); m_llModelInfo.model->prompt(systemPrompt, "%1%2", promptFunc, nullptr, /*allowContextShift*/ true, m_ctx, true);
m_ctx.n_predict = old_n_predict; m_ctx.n_predict = old_n_predict;
#if defined(DEBUG) #if defined(DEBUG)
printf("\n"); printf("\n");
@ -1311,14 +1277,13 @@ void ChatLLM::processRestoreStateFromText()
if (!isModelLoaded() || !m_restoreStateFromText || m_isServer) if (!isModelLoaded() || !m_restoreStateFromText || m_isServer)
return; return;
m_isRecalc = true; m_restoringFromText = true;
emit recalcChanged(); emit restoringFromTextChanged();
m_stopGenerating = false; m_stopGenerating = false;
m_ctx = LLModel::PromptContext(); m_ctx = LLModel::PromptContext();
auto promptFunc = std::bind(&ChatLLM::handleRestoreStateFromTextPrompt, this, std::placeholders::_1); auto promptFunc = std::bind(&ChatLLM::handleRestoreStateFromTextPrompt, this, std::placeholders::_1);
auto recalcFunc = std::bind(&ChatLLM::handleRestoreStateFromTextRecalculate, this, std::placeholders::_1);
const QString promptTemplate = MySettings::globalInstance()->modelPromptTemplate(m_modelInfo); const QString promptTemplate = MySettings::globalInstance()->modelPromptTemplate(m_modelInfo);
const int32_t n_predict = MySettings::globalInstance()->modelMaxLength(m_modelInfo); const int32_t n_predict = MySettings::globalInstance()->modelMaxLength(m_modelInfo);
@ -1348,10 +1313,9 @@ void ChatLLM::processRestoreStateFromText()
auto &response = *it++; auto &response = *it++;
Q_ASSERT(response.first != "Prompt: "); Q_ASSERT(response.first != "Prompt: ");
auto responseText = response.second.toStdString();
m_llModelInfo.model->prompt(prompt.second.toStdString(), promptTemplate.toStdString(), promptFunc, nullptr, m_llModelInfo.model->prompt(prompt.second.toStdString(), promptTemplate.toStdString(), promptFunc, nullptr,
recalcFunc, m_ctx, false, &responseText); /*allowContextShift*/ true, m_ctx, false, response.second.toUtf8().constData());
} }
if (!m_stopGenerating) { if (!m_stopGenerating) {
@ -1359,8 +1323,8 @@ void ChatLLM::processRestoreStateFromText()
m_stateFromText.clear(); m_stateFromText.clear();
} }
m_isRecalc = false; m_restoringFromText = false;
emit recalcChanged(); emit restoringFromTextChanged();
m_pristineLoadedState = false; m_pristineLoadedState = false;
} }
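
Illustrative sketch, not from this commit: every prompt() call above now passes a plain allowContextShift flag where a recalculate callback used to go. Assuming model points to a loaded LLModel and that the trailing special/fakeReply parameters keep their defaults (as the calls above do), a minimal invocation looks roughly like this; the names are illustrative.

    #include <gpt4all-backend/llmodel.h>

    #include <cstdint>
    #include <iostream>
    #include <string>

    static void runPrompt(LLModel *model)
    {
        LLModel::PromptContext ctx;
        auto onPrompt   = [](int32_t) { return true; };  // keep consuming prompt tokens
        auto onResponse = [](int32_t, const std::string &piece) {
            std::cout << piece;                          // stream response pieces as they arrive
            return true;                                 // returning false stops generation
        };
        model->prompt("Hello", "%1", onPrompt, onResponse, /*allowContextShift*/ true, ctx);
    }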

View File

@ -4,7 +4,7 @@
#include "database.h" // IWYU pragma: keep #include "database.h" // IWYU pragma: keep
#include "modellist.h" #include "modellist.h"
#include "../gpt4all-backend/llmodel.h" #include <gpt4all-backend/llmodel.h>
#include <QByteArray> #include <QByteArray>
#include <QElapsedTimer> #include <QElapsedTimer>
@ -93,7 +93,7 @@ class Chat;
class ChatLLM : public QObject class ChatLLM : public QObject
{ {
Q_OBJECT Q_OBJECT
Q_PROPERTY(bool isRecalc READ isRecalc NOTIFY recalcChanged) Q_PROPERTY(bool restoringFromText READ restoringFromText NOTIFY restoringFromTextChanged)
Q_PROPERTY(QString deviceBackend READ deviceBackend NOTIFY loadedModelInfoChanged) Q_PROPERTY(QString deviceBackend READ deviceBackend NOTIFY loadedModelInfoChanged)
Q_PROPERTY(QString device READ device NOTIFY loadedModelInfoChanged) Q_PROPERTY(QString device READ device NOTIFY loadedModelInfoChanged)
Q_PROPERTY(QString fallbackReason READ fallbackReason NOTIFY loadedModelInfoChanged) Q_PROPERTY(QString fallbackReason READ fallbackReason NOTIFY loadedModelInfoChanged)
@ -116,12 +116,12 @@ public:
void setForceUnloadModel(bool b) { m_forceUnloadModel = b; } void setForceUnloadModel(bool b) { m_forceUnloadModel = b; }
void setMarkedForDeletion(bool b) { m_markedForDeletion = b; } void setMarkedForDeletion(bool b) { m_markedForDeletion = b; }
QString response() const; QString response(bool trim = true) const;
ModelInfo modelInfo() const; ModelInfo modelInfo() const;
void setModelInfo(const ModelInfo &info); void setModelInfo(const ModelInfo &info);
bool isRecalc() const { return m_isRecalc; } bool restoringFromText() const { return m_restoringFromText; }
void acquireModel(); void acquireModel();
void resetModel(); void resetModel();
@ -172,7 +172,7 @@ public Q_SLOTS:
void processRestoreStateFromText(); void processRestoreStateFromText();
Q_SIGNALS: Q_SIGNALS:
void recalcChanged(); void restoringFromTextChanged();
void loadedModelInfoChanged(); void loadedModelInfoChanged();
void modelLoadingPercentageChanged(float); void modelLoadingPercentageChanged(float);
void modelLoadingError(const QString &error); void modelLoadingError(const QString &error);
@ -198,22 +198,17 @@ Q_SIGNALS:
protected: protected:
bool promptInternal(const QList<QString> &collectionList, const QString &prompt, const QString &promptTemplate, bool promptInternal(const QList<QString> &collectionList, const QString &prompt, const QString &promptTemplate,
int32_t n_predict, int32_t top_k, float top_p, float min_p, float temp, int32_t n_batch, float repeat_penalty, int32_t n_predict, int32_t top_k, float top_p, float min_p, float temp, int32_t n_batch, float repeat_penalty,
int32_t repeat_penalty_tokens); int32_t repeat_penalty_tokens, std::optional<QString> fakeReply = {});
bool handlePrompt(int32_t token); bool handlePrompt(int32_t token);
bool handleResponse(int32_t token, const std::string &response); bool handleResponse(int32_t token, const std::string &response);
bool handleRecalculate(bool isRecalc);
bool handleNamePrompt(int32_t token); bool handleNamePrompt(int32_t token);
bool handleNameResponse(int32_t token, const std::string &response); bool handleNameResponse(int32_t token, const std::string &response);
bool handleNameRecalculate(bool isRecalc);
bool handleSystemPrompt(int32_t token); bool handleSystemPrompt(int32_t token);
bool handleSystemResponse(int32_t token, const std::string &response); bool handleSystemResponse(int32_t token, const std::string &response);
bool handleSystemRecalculate(bool isRecalc);
bool handleRestoreStateFromTextPrompt(int32_t token); bool handleRestoreStateFromTextPrompt(int32_t token);
bool handleRestoreStateFromTextResponse(int32_t token, const std::string &response); bool handleRestoreStateFromTextResponse(int32_t token, const std::string &response);
bool handleRestoreStateFromTextRecalculate(bool isRecalc);
bool handleQuestionPrompt(int32_t token); bool handleQuestionPrompt(int32_t token);
bool handleQuestionResponse(int32_t token, const std::string &response); bool handleQuestionResponse(int32_t token, const std::string &response);
bool handleQuestionRecalculate(bool isRecalc);
void saveState(); void saveState();
void restoreState(); void restoreState();
@ -226,6 +221,7 @@ private:
bool loadNewModel(const ModelInfo &modelInfo, QVariantMap &modelLoadProps); bool loadNewModel(const ModelInfo &modelInfo, QVariantMap &modelLoadProps);
std::string m_response; std::string m_response;
std::string m_trimmedResponse;
std::string m_nameResponse; std::string m_nameResponse;
QString m_questionResponse; QString m_questionResponse;
LLModelInfo m_llModelInfo; LLModelInfo m_llModelInfo;
@ -236,7 +232,7 @@ private:
QThread m_llmThread; QThread m_llmThread;
std::atomic<bool> m_stopGenerating; std::atomic<bool> m_stopGenerating;
std::atomic<bool> m_shouldBeLoaded; std::atomic<bool> m_shouldBeLoaded;
std::atomic<bool> m_isRecalc; std::atomic<bool> m_restoringFromText; // status indication
std::atomic<bool> m_forceUnloadModel; std::atomic<bool> m_forceUnloadModel;
std::atomic<bool> m_markedForDeletion; std::atomic<bool> m_markedForDeletion;
bool m_isServer; bool m_isServer;
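
Reader's aid, not part of this commit: ChatLLM now caches m_trimmedResponse alongside m_response so the UI string is not re-trimmed on every token. The whitespace helpers it relies on are defined elsewhere in the chat sources; a plausible sketch of what they do is:

    #include <algorithm>
    #include <cctype>
    #include <string>

    // Plausible implementations of the helpers referenced above (the real ones live
    // elsewhere in the chat sources).
    static std::string remove_leading_whitespace(const std::string &s)
    {
        auto first = std::find_if(s.begin(), s.end(),
                                  [](unsigned char c) { return !std::isspace(c); });
        return std::string(first, s.end());
    }

    static std::string trim_whitespace(const std::string &s)
    {
        auto first = std::find_if(s.begin(), s.end(),
                                  [](unsigned char c) { return !std::isspace(c); });
        auto last = std::find_if(s.rbegin(), s.rend(),
                                 [](unsigned char c) { return !std::isspace(c); }).base();
        return first < last ? std::string(first, last) : std::string();
    }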

View File

@ -3,7 +3,7 @@
#include "modellist.h" #include "modellist.h"
#include "mysettings.h" #include "mysettings.h"
#include "../gpt4all-backend/llmodel.h" #include <gpt4all-backend/llmodel.h>
#include <QCoreApplication> #include <QCoreApplication>
#include <QDebug> #include <QDebug>

View File

@ -1,7 +1,7 @@
#include "llm.h" #include "llm.h"
#include "../gpt4all-backend/llmodel.h" #include <gpt4all-backend/llmodel.h>
#include "../gpt4all-backend/sysinfo.h" #include <gpt4all-backend/sysinfo.h>
#include <QCoreApplication> #include <QCoreApplication>
#include <QDebug> #include <QDebug>
@ -51,7 +51,7 @@ bool LLM::checkForUpdates() const
{ {
#ifdef GPT4ALL_OFFLINE_INSTALLER #ifdef GPT4ALL_OFFLINE_INSTALLER
# pragma message(__FILE__ ": WARNING: offline installer build will not check for updates!") # pragma message(__FILE__ ": WARNING: offline installer build will not check for updates!")
return QDesktopServices::openUrl(QUrl("https://gpt4all.io/")); return QDesktopServices::openUrl(QUrl("https://github.com/nomic-ai/gpt4all/releases"));
#else #else
Network::globalInstance()->trackEvent("check_for_updates"); Network::globalInstance()->trackEvent("check_for_updates");

View File

@ -20,24 +20,25 @@ class LocalDocsCollectionsModel : public QSortFilterProxyModel
Q_OBJECT Q_OBJECT
Q_PROPERTY(int count READ count NOTIFY countChanged) Q_PROPERTY(int count READ count NOTIFY countChanged)
Q_PROPERTY(int updatingCount READ updatingCount NOTIFY updatingCountChanged) Q_PROPERTY(int updatingCount READ updatingCount NOTIFY updatingCountChanged)
public: public:
explicit LocalDocsCollectionsModel(QObject *parent); explicit LocalDocsCollectionsModel(QObject *parent);
int count() const { return rowCount(); }
int updatingCount() const;
public Q_SLOTS: public Q_SLOTS:
int count() const { return rowCount(); }
void setCollections(const QList<QString> &collections); void setCollections(const QList<QString> &collections);
int updatingCount() const;
Q_SIGNALS: Q_SIGNALS:
void countChanged(); void countChanged();
void updatingCountChanged(); void updatingCountChanged();
private Q_SLOT:
void maybeTriggerUpdatingCountChanged();
protected: protected:
bool filterAcceptsRow(int sourceRow, const QModelIndex &sourceParent) const override; bool filterAcceptsRow(int sourceRow, const QModelIndex &sourceParent) const override;
private Q_SLOTS:
void maybeTriggerUpdatingCountChanged();
private: private:
QList<QString> m_collections; QList<QString> m_collections;
int m_updatingCount = 0; int m_updatingCount = 0;

View File

@ -8,19 +8,46 @@
#include "mysettings.h" #include "mysettings.h"
#include "network.h" #include "network.h"
#include "../gpt4all-backend/llmodel.h" #include <gpt4all-backend/llmodel.h>
#include <singleapplication.h>
#include <QCoreApplication> #include <QCoreApplication>
#include <QGuiApplication>
#include <QObject> #include <QObject>
#include <QQmlApplicationEngine> #include <QQmlApplicationEngine>
#include <QQmlEngine> #include <QQuickWindow>
#include <QSettings> #include <QSettings>
#include <QString> #include <QString>
#include <QTranslator>
#include <QUrl> #include <QUrl>
#include <Qt> #include <Qt>
#ifdef Q_OS_LINUX
# include <QIcon>
#endif
#ifdef Q_OS_WINDOWS
# include <windows.h>
#endif
using namespace Qt::Literals::StringLiterals;
static void raiseWindow(QWindow *window)
{
#ifdef Q_OS_WINDOWS
HWND hwnd = HWND(window->winId());
// check if window is minimized to Windows task bar
if (IsIconic(hwnd))
ShowWindow(hwnd, SW_RESTORE);
SetForegroundWindow(hwnd);
#else
window->show();
window->raise();
window->requestActivate();
#endif
}
int main(int argc, char *argv[]) int main(int argc, char *argv[])
{ {
QCoreApplication::setOrganizationName("nomic.ai"); QCoreApplication::setOrganizationName("nomic.ai");
@ -31,7 +58,18 @@ int main(int argc, char *argv[])
Logger::globalInstance(); Logger::globalInstance();
QGuiApplication app(argc, argv); SingleApplication app(argc, argv, true /*allowSecondary*/);
if (app.isSecondary()) {
#ifdef Q_OS_WINDOWS
AllowSetForegroundWindow(DWORD(app.primaryPid()));
#endif
app.sendMessage("RAISE_WINDOW");
return 0;
}
#ifdef Q_OS_LINUX
app.setWindowIcon(QIcon(":/gpt4all/icons/gpt4all.svg"));
#endif
// set search path before constructing the MySettings instance, which relies on this // set search path before constructing the MySettings instance, which relies on this
QString llmodelSearchPaths = QCoreApplication::applicationDirPath(); QString llmodelSearchPaths = QCoreApplication::applicationDirPath();
@ -69,7 +107,7 @@ int main(int argc, char *argv[])
qmlRegisterSingletonInstance("localdocs", 1, 0, "LocalDocs", LocalDocs::globalInstance()); qmlRegisterSingletonInstance("localdocs", 1, 0, "LocalDocs", LocalDocs::globalInstance());
qmlRegisterUncreatableMetaObject(MySettingsEnums::staticMetaObject, "mysettingsenums", 1, 0, "MySettingsEnums", "Error: only enums"); qmlRegisterUncreatableMetaObject(MySettingsEnums::staticMetaObject, "mysettingsenums", 1, 0, "MySettingsEnums", "Error: only enums");
const QUrl url(u"qrc:/gpt4all/main.qml"_qs); const QUrl url(u"qrc:/gpt4all/main.qml"_s);
QObject::connect(&engine, &QQmlApplicationEngine::objectCreated, QObject::connect(&engine, &QQmlApplicationEngine::objectCreated,
&app, [url](QObject *obj, const QUrl &objUrl) { &app, [url](QObject *obj, const QUrl &objUrl) {
@ -78,6 +116,13 @@ int main(int argc, char *argv[])
}, Qt::QueuedConnection); }, Qt::QueuedConnection);
engine.load(url); engine.load(url);
QObject *rootObject = engine.rootObjects().first();
QQuickWindow *windowObject = qobject_cast<QQuickWindow *>(rootObject);
Q_ASSERT(windowObject);
if (windowObject)
QObject::connect(&app, &SingleApplication::receivedMessage,
windowObject, [windowObject] () { raiseWindow(windowObject); } );
#if 0 #if 0
QDirIterator it("qrc:", QDirIterator::Subdirectories); QDirIterator it("qrc:", QDirIterator::Subdirectories);
while (it.hasNext()) { while (it.hasNext()) {

View File

@ -15,10 +15,10 @@ import mysettings
Window { Window {
id: window id: window
width: 1920 width: 1440
height: 1080 height: 810
minimumWidth: 1280 minimumWidth: 658 + 470 * theme.fontScale
minimumHeight: 720 minimumHeight: 384 + 160 * theme.fontScale
visible: true visible: true
title: qsTr("GPT4All v%1").arg(Qt.application.version) title: qsTr("GPT4All v%1").arg(Qt.application.version)
@ -422,7 +422,7 @@ Window {
return qsTr("The datalake is enabled") return qsTr("The datalake is enabled")
else if (currentChat.modelInfo.isOnline) else if (currentChat.modelInfo.isOnline)
return qsTr("Using a network model") return qsTr("Using a network model")
else if (currentChat.modelInfo.isOnline) else if (currentChat.isServer)
return qsTr("Server mode is enabled") return qsTr("Server mode is enabled")
return "" return ""
} }

View File

@ -4,7 +4,7 @@
#include "mysettings.h" #include "mysettings.h"
#include "network.h" #include "network.h"
#include "../gpt4all-backend/llmodel.h" #include <gpt4all-backend/llmodel.h>
#include <QChar> #include <QChar>
#include <QCoreApplication> #include <QCoreApplication>
@ -1208,51 +1208,55 @@ bool ModelList::modelExists(const QString &modelFilename) const
return false; return false;
} }
void ModelList::updateModelsFromDirectory() void ModelList::updateOldRemoteModels(const QString &path)
{ {
const QString exePath = QCoreApplication::applicationDirPath() + QDir::separator(); QDirIterator it(path, QDir::Files, QDirIterator::Subdirectories);
const QString localPath = MySettings::globalInstance()->modelPath();
auto updateOldRemoteModels = [&](const QString& path) {
QDirIterator it(path, QDirIterator::Subdirectories);
while (it.hasNext()) { while (it.hasNext()) {
it.next(); QFileInfo info = it.nextFileInfo();
if (!it.fileInfo().isDir()) {
QString filename = it.fileName(); QString filename = it.fileName();
if (filename.startsWith("chatgpt-") && filename.endsWith(".txt")) { if (!filename.startsWith("chatgpt-") || !filename.endsWith(".txt"))
continue;
QString apikey; QString apikey;
QString modelname(filename); QString modelname(filename);
modelname.chop(4); // strip ".txt" extension modelname.chop(4); // strip ".txt" extension
modelname.remove(0, 8); // strip "chatgpt-" prefix modelname.remove(0, 8); // strip "chatgpt-" prefix
QFile file(path + filename); QFile file(info.filePath());
if (file.open(QIODevice::ReadWrite)) { if (!file.open(QIODevice::ReadOnly)) {
qWarning().noquote() << tr("cannot open \"%1\": %2").arg(file.fileName(), file.errorString());
continue;
}
{
QTextStream in(&file); QTextStream in(&file);
apikey = in.readAll(); apikey = in.readAll();
file.close(); file.close();
} }
QJsonObject obj; QFile newfile(u"%1/gpt4all-%2.rmodel"_s.arg(info.dir().path(), modelname));
obj.insert("apiKey", apikey); if (!newfile.open(QIODevice::ReadWrite)) {
obj.insert("modelName", modelname); qWarning().noquote() << tr("cannot create \"%1\": %2").arg(newfile.fileName(), file.errorString());
QJsonDocument doc(obj); continue;
auto newfilename = u"gpt4all-%1.rmodel"_s.arg(modelname);
QFile newfile(path + newfilename);
if (newfile.open(QIODevice::ReadWrite)) {
QTextStream out(&newfile);
out << doc.toJson();
newfile.close();
} }
QJsonObject obj {
{ "apiKey", apikey },
{ "modelName", modelname },
};
QTextStream out(&newfile);
out << QJsonDocument(obj).toJson();
newfile.close();
file.remove(); file.remove();
} }
} }
}
};
auto processDirectory = [&](const QString& path) { void ModelList::processModelDirectory(const QString &path)
{
QDirIterator it(path, QDir::Files, QDirIterator::Subdirectories); QDirIterator it(path, QDir::Files, QDirIterator::Subdirectories);
while (it.hasNext()) { while (it.hasNext()) {
it.next(); QFileInfo info = it.nextFileInfo();
QString filename = it.fileName(); QString filename = it.fileName();
if (filename.startsWith("incomplete") || FILENAME_BLACKLIST.contains(filename)) if (filename.startsWith("incomplete") || FILENAME_BLACKLIST.contains(filename))
@ -1260,22 +1264,6 @@ void ModelList::updateModelsFromDirectory()
if (!filename.endsWith(".gguf") && !filename.endsWith(".rmodel")) if (!filename.endsWith(".gguf") && !filename.endsWith(".rmodel"))
continue; continue;
QVector<QString> modelsById;
{
QMutexLocker locker(&m_mutex);
for (ModelInfo *info : m_models)
if (info->filename() == filename)
modelsById.append(info->id());
}
if (modelsById.isEmpty()) {
if (!contains(filename))
addModel(filename);
modelsById.append(filename);
}
QFileInfo info = it.fileInfo();
bool isOnline(filename.endsWith(".rmodel")); bool isOnline(filename.endsWith(".rmodel"));
bool isCompatibleApi(filename.endsWith("-capi.rmodel")); bool isCompatibleApi(filename.endsWith("-capi.rmodel"));
@ -1284,10 +1272,11 @@ void ModelList::updateModelsFromDirectory()
if (isCompatibleApi) { if (isCompatibleApi) {
QJsonObject obj; QJsonObject obj;
{ {
QFile file(path + filename); QFile file(info.filePath());
bool success = file.open(QIODeviceBase::ReadOnly); if (!file.open(QIODeviceBase::ReadOnly)) {
(void)success; qWarning().noquote() << tr("cannot open \"%1\": %2").arg(file.fileName(), file.errorString());
Q_ASSERT(success); continue;
}
QJsonDocument doc = QJsonDocument::fromJson(file.readAll()); QJsonDocument doc = QJsonDocument::fromJson(file.readAll());
obj = doc.object(); obj = doc.object();
} }
@ -1305,6 +1294,20 @@ void ModelList::updateModelsFromDirectory()
} }
} }
QVector<QString> modelsById;
{
QMutexLocker locker(&m_mutex);
for (ModelInfo *info : m_models)
if (info->filename() == filename)
modelsById.append(info->id());
}
if (modelsById.isEmpty()) {
if (!contains(filename))
addModel(filename);
modelsById.append(filename);
}
for (const QString &id : modelsById) { for (const QString &id : modelsById) {
QVector<QPair<int, QVariant>> data { QVector<QPair<int, QVariant>> data {
{ InstalledRole, true }, { InstalledRole, true },
@ -1326,14 +1329,18 @@ void ModelList::updateModelsFromDirectory()
updateData(id, data); updateData(id, data);
} }
} }
}; }
void ModelList::updateModelsFromDirectory()
{
const QString exePath = QCoreApplication::applicationDirPath() + QDir::separator();
const QString localPath = MySettings::globalInstance()->modelPath();
updateOldRemoteModels(exePath); updateOldRemoteModels(exePath);
processDirectory(exePath); processModelDirectory(exePath);
if (localPath != exePath) { if (localPath != exePath) {
updateOldRemoteModels(localPath); updateOldRemoteModels(localPath);
processDirectory(localPath); processModelDirectory(localPath);
} }
} }
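
A minimal sketch, not from this commit: updateOldRemoteModels() above migrates a legacy chatgpt-<name>.txt API-key file into a small gpt4all-<name>.rmodel JSON document carrying apiKey and modelName. Reading such a file back, in the same spirit as the .rmodel handling above, looks roughly like this; the helper name is illustrative.

    #include <QFile>
    #include <QJsonDocument>
    #include <QJsonObject>
    #include <QString>

    // Reads a migrated .rmodel file; per the migration code above it carries the
    // keys "apiKey" and "modelName".
    static QJsonObject readRModel(const QString &path)
    {
        QFile file(path);
        if (!file.open(QIODevice::ReadOnly))
            return {};
        return QJsonDocument::fromJson(file.readAll()).object();
    }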

View File

@ -18,10 +18,12 @@
#include <QVector> #include <QVector>
#include <Qt> #include <Qt>
#include <QtGlobal> #include <QtGlobal>
#include <QtQml>
#include <utility>
using namespace Qt::Literals::StringLiterals; using namespace Qt::Literals::StringLiterals;
struct ModelInfo { struct ModelInfo {
Q_GADGET Q_GADGET
Q_PROPERTY(QString id READ id WRITE setId) Q_PROPERTY(QString id READ id WRITE setId)
@ -502,6 +504,8 @@ private:
void parseModelsJsonFile(const QByteArray &jsonData, bool save); void parseModelsJsonFile(const QByteArray &jsonData, bool save);
void parseDiscoveryJsonFile(const QByteArray &jsonData); void parseDiscoveryJsonFile(const QByteArray &jsonData);
QString uniqueModelName(const ModelInfo &model) const; QString uniqueModelName(const ModelInfo &model) const;
void updateOldRemoteModels(const QString &path);
void processModelDirectory(const QString &path);
private: private:
mutable QMutex m_mutex; mutable QMutex m_mutex;
@ -521,7 +525,7 @@ private:
protected: protected:
explicit ModelList(); explicit ModelList();
~ModelList() { for (auto *model: m_models) { delete model; } } ~ModelList() override { for (auto *model: std::as_const(m_models)) { delete model; } }
friend class MyModelList; friend class MyModelList;
}; };
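
Side note, not part of this commit: the destructor above now iterates std::as_const(m_models), which avoids the copy-on-write detach that a non-const range-for over an implicitly shared Qt container can trigger. A tiny standalone illustration of the same pattern, using a hypothetical container:

    #include <QList>
    #include <QString>

    #include <utility>

    static void deleteAll(QList<QString *> items)
    {
        // Const iteration avoids detaching the implicitly shared QList, mirroring
        // the std::as_const(m_models) loop above.
        for (QString *p : std::as_const(items))
            delete p;
    }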

View File

@ -1,6 +1,6 @@
#include "mysettings.h" #include "mysettings.h"
#include "../gpt4all-backend/llmodel.h" #include <gpt4all-backend/llmodel.h>
#include <QDebug> #include <QDebug>
#include <QDir> #include <QDir>
@ -122,6 +122,7 @@ static QString getUiLanguage(const QString directory, const QString fileName)
static QStringList getUiLanguages(const QString &modelPath) static QStringList getUiLanguages(const QString &modelPath)
{ {
QStringList languageList; QStringList languageList;
static const QStringList releasedLanguages = { "en_US", "it_IT", "zh_CN", "zh_TW", "es_MX", "pt_BR", "ro_RO" };
// Add the language translations from model path files first which is used by translation developers // Add the language translations from model path files first which is used by translation developers
// to load translations in progress without having to rebuild all of GPT4All from source // to load translations in progress without having to rebuild all of GPT4All from source
@ -138,7 +139,7 @@ static QStringList getUiLanguages(const QString &modelPath)
const QStringList qmFiles = dir.entryList({"*.qm"}, QDir::Files); const QStringList qmFiles = dir.entryList({"*.qm"}, QDir::Files);
for (const QString &fileName : qmFiles) { for (const QString &fileName : qmFiles) {
const QString lang = getUiLanguage(":/i18n", fileName); const QString lang = getUiLanguage(":/i18n", fileName);
if (!languageList.contains(lang)) if (!languageList.contains(lang) && releasedLanguages.contains(lang))
languageList.append(lang); languageList.append(lang);
} }
} }
@ -593,7 +594,7 @@ QString MySettings::languageAndLocale() const
QString MySettings::filePathForLocale(const QLocale &locale) QString MySettings::filePathForLocale(const QLocale &locale)
{ {
// Check and see if we have a translation for the chosen locale and set it if possible otherwise // Check and see if we have a translation for the chosen locale and set it if possible otherwise
// we return the filepath for the 'en' translation // we return the filepath for the 'en_US' translation
QStringList uiLanguages = locale.uiLanguages(); QStringList uiLanguages = locale.uiLanguages();
for (int i = 0; i < uiLanguages.size(); ++i) for (int i = 0; i < uiLanguages.size(); ++i)
uiLanguages[i].replace('-', '_'); uiLanguages[i].replace('-', '_');
@ -604,18 +605,18 @@ QString MySettings::filePathForLocale(const QLocale &locale)
// rather than having to recompile all of GPT4All // rather than having to recompile all of GPT4All
QString directory = modelPath(); QString directory = modelPath();
for (const QString &bcp47Name : uiLanguages) { for (const QString &bcp47Name : uiLanguages) {
QString filePath = QString("%1/gpt4all_%2.qm").arg(directory).arg(bcp47Name); QString filePath = u"%1/gpt4all_%2.qm"_s.arg(directory, bcp47Name);
QFileInfo filePathInfo(filePath); QFileInfo filePathInfo(filePath);
if (filePathInfo.exists()) return filePath; if (filePathInfo.exists()) return filePath;
} }
// Now scan the internal built-in translations // Now scan the internal built-in translations
for (QString bcp47Name : uiLanguages) { for (QString bcp47Name : uiLanguages) {
QString filePath = QString(":/i18n/gpt4all_%1.qm").arg(bcp47Name); QString filePath = u":/i18n/gpt4all_%1.qm"_s.arg(bcp47Name);
QFileInfo filePathInfo(filePath); QFileInfo filePathInfo(filePath);
if (filePathInfo.exists()) return filePath; if (filePathInfo.exists()) return filePath;
} }
return QString(":/i18n/gpt4all_en.qm"); return u":/i18n/gpt4all_en_US.qm"_s;
} }
void MySettings::setLanguageAndLocale(const QString &bcp47Name) void MySettings::setLanguageAndLocale(const QString &bcp47Name)
@ -634,11 +635,10 @@ void MySettings::setLanguageAndLocale(const QString &bcp47Name)
// If we previously installed a translator, then remove it // If we previously installed a translator, then remove it
if (m_translator) { if (m_translator) {
if (!qGuiApp->removeTranslator(m_translator)) { if (!qGuiApp->removeTranslator(m_translator.get())) {
qDebug() << "ERROR: Failed to remove the previous translator"; qDebug() << "ERROR: Failed to remove the previous translator";
} else { } else {
delete m_translator; m_translator.reset();
m_translator = nullptr;
} }
} }
@ -646,24 +646,20 @@ void MySettings::setLanguageAndLocale(const QString &bcp47Name)
Q_ASSERT(!m_translator); Q_ASSERT(!m_translator);
const QString filePath = filePathForLocale(locale); const QString filePath = filePathForLocale(locale);
// Installing the default gpt4all_en.qm fails presumably because it has no strings that are if (!m_translator) {
// different from the ones stored in the binary
if (!m_translator && !filePath.endsWith("en.qm")) {
// Create a new translator object on the heap // Create a new translator object on the heap
m_translator = new QTranslator(this); m_translator = std::make_unique<QTranslator>(this);
bool success = m_translator->load(filePath); bool success = m_translator->load(filePath);
Q_ASSERT(success); Q_ASSERT(success);
if (!success) { if (!success) {
qDebug() << "ERROR: Failed to load translation file:" << filePath; qDebug() << "ERROR: Failed to load translation file:" << filePath;
delete m_translator; m_translator.reset();
m_translator = nullptr;
} }
// If we've successfully loaded it, then try and install it // If we've successfully loaded it, then try and install it
if (!qGuiApp->installTranslator(m_translator)) { if (!qGuiApp->installTranslator(m_translator.get())) {
qDebug() << "ERROR: Failed to install the translator:" << filePath; qDebug() << "ERROR: Failed to install the translator:" << filePath;
delete m_translator; m_translator.reset();
m_translator = nullptr;
} }
} }
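
Illustrative sketch, not from this commit: filePathForLocale() above now tries model-path .qm files first, then the catalogs built into the binary, and finally falls back to gpt4all_en_US.qm. The lookup boils down to "first existing candidate wins"; a generic version of that loop (helper name illustrative) is:

    #include <QFileInfo>
    #include <QString>
    #include <QStringList>

    // Returns the first candidate path that exists on disk, or the fallback otherwise.
    static QString firstExistingPath(const QStringList &candidates, const QString &fallback)
    {
        for (const QString &path : candidates)
            if (QFileInfo::exists(path))
                return path;
        return fallback;
    }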

View File

@ -8,9 +8,11 @@
#include <QSettings> #include <QSettings>
#include <QString> #include <QString>
#include <QStringList> #include <QStringList>
#include <QTranslator>
#include <QVector> #include <QVector>
#include <cstdint> #include <cstdint>
#include <memory>
#include <optional> #include <optional>
namespace MySettingsEnums { namespace MySettingsEnums {
@ -245,7 +247,7 @@ private:
const QStringList m_deviceList; const QStringList m_deviceList;
const QStringList m_embeddingsDeviceList; const QStringList m_embeddingsDeviceList;
const QStringList m_uiLanguages; const QStringList m_uiLanguages;
QTranslator *m_translator = nullptr; std::unique_ptr<QTranslator> m_translator;
private: private:
explicit MySettings(); explicit MySettings();

View File

@ -9,7 +9,7 @@
#include "modellist.h" #include "modellist.h"
#include "mysettings.h" #include "mysettings.h"
#include "../gpt4all-backend/llmodel.h" #include <gpt4all-backend/llmodel.h>
#include <QCoreApplication> #include <QCoreApplication>
#include <QDateTime> #include <QDateTime>
@ -19,6 +19,7 @@
#include <QJsonArray> #include <QJsonArray>
#include <QJsonDocument> #include <QJsonDocument>
#include <QJsonObject> #include <QJsonObject>
#include <QLibraryInfo>
#include <QNetworkRequest> #include <QNetworkRequest>
#include <QScreen> #include <QScreen>
#include <QSettings> #include <QSettings>
@ -36,22 +37,51 @@
#include <cstring> #include <cstring>
#include <utility> #include <utility>
#ifdef __GLIBC__
# include <gnu/libc-version.h>
#endif
using namespace Qt::Literals::StringLiterals; using namespace Qt::Literals::StringLiterals;
//#define DEBUG //#define DEBUG
#define STR_(x) #x
#define STR(x) STR_(x)
static const char MIXPANEL_TOKEN[] = "ce362e568ddaee16ed243eaffb5860a2"; static const char MIXPANEL_TOKEN[] = "ce362e568ddaee16ed243eaffb5860a2";
#ifdef __clang__
#ifdef __apple_build_version__
static const char COMPILER_NAME[] = "Apple Clang";
#else
static const char COMPILER_NAME[] = "LLVM Clang";
#endif
static const char COMPILER_VER[] = STR(__clang_major__) "." STR(__clang_minor__) "." STR(__clang_patchlevel__);
#elifdef _MSC_VER
static const char COMPILER_NAME[] = "MSVC";
static const char COMPILER_VER[] = STR(_MSC_VER) " (" STR(_MSC_FULL_VER) ")";
#elifdef __GNUC__
static const char COMPILER_NAME[] = "GCC";
static const char COMPILER_VER[] = STR(__GNUC__) "." STR(__GNUC_MINOR__) "." STR(__GNUC_PATCHLEVEL__);
#endif
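The STR_/STR pair is the standard two-level stringification idiom: # alone would stringify the macro name itself, so the extra indirection forces __clang_major__ and friends to expand before being turned into a string literal. A self-contained illustration with stand-in version macros:

#include <cstdio>

#define STR_(x) #x
#define STR(x)  STR_(x)

#define DEMO_MAJOR 4   // stand-ins for __GNUC__, __clang_major__, _MSC_VER, ...
#define DEMO_MINOR 2
#define NAIVE(x) #x    // single level, for comparison

int main()
{
    std::puts(STR(DEMO_MAJOR) "." STR(DEMO_MINOR)); // prints "4.2"
    std::puts(NAIVE(DEMO_MAJOR));                   // prints "DEMO_MAJOR" - macro not expanded
    return 0;
}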
#if defined(Q_OS_MAC) #if defined(Q_OS_MAC)
#include <sys/sysctl.h> #include <sys/sysctl.h>
static QString getCPUModel() static std::optional<QString> getSysctl(const char *name)
{ {
char buffer[256]; char buffer[256] = "";
size_t bufferlen = sizeof(buffer); size_t bufferlen = sizeof(buffer);
sysctlbyname("machdep.cpu.brand_string", &buffer, &bufferlen, NULL, 0); if (sysctlbyname(name, &buffer, &bufferlen, NULL, 0) < 0) {
return buffer; int err = errno;
qWarning().nospace() << "sysctlbyname(\"" << name << "\") failed: " << strerror(err);
return std::nullopt;
} }
return std::make_optional<QString>(buffer);
}
static QString getCPUModel() { return getSysctl("machdep.cpu.brand_string").value_or(u"(unknown)"_s); }
#elif defined(__x86_64__) || defined(__i386__) || defined(_M_X64) || defined(_M_IX86) #elif defined(__x86_64__) || defined(__i386__) || defined(_M_X64) || defined(_M_IX86)
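getSysctl wraps sysctlbyname(3): a fixed buffer, an in/out length parameter, errno on failure, and std::optional so callers can fall back with value_or. An equivalent Qt-free sketch for comparison (readSysctlString is an illustrative name; macOS/BSD only):

#include <sys/sysctl.h>

#include <cerrno>
#include <cstdio>
#include <cstring>
#include <optional>
#include <string>

static std::optional<std::string> readSysctlString(const char *name)
{
    char buffer[256] = "";
    size_t len = sizeof(buffer);                       // in: capacity, out: bytes written
    if (sysctlbyname(name, buffer, &len, nullptr, 0) < 0) {
        std::fprintf(stderr, "sysctlbyname(\"%s\") failed: %s\n", name, std::strerror(errno));
        return std::nullopt;
    }
    return std::string(buffer);
}

// e.g. readSysctlString("machdep.cpu.brand_string").value_or("(unknown)")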
@ -286,11 +316,35 @@ void Network::sendStartup()
const auto *display = QGuiApplication::primaryScreen(); const auto *display = QGuiApplication::primaryScreen();
trackEvent("startup", { trackEvent("startup", {
// Build info
{ "build_compiler", COMPILER_NAME },
{ "build_compiler_ver", COMPILER_VER },
{ "build_abi", QSysInfo::buildAbi() },
{ "build_cpu_arch", QSysInfo::buildCpuArchitecture() },
#ifdef __GLIBC__
{ "build_glibc_ver", QStringLiteral(STR(__GLIBC__) "." STR(__GLIBC_MINOR__)) },
#endif
{ "qt_version", QLibraryInfo::version().toString() },
{ "qt_debug" , QLibraryInfo::isDebugBuild() },
{ "qt_shared", QLibraryInfo::isSharedBuild() },
// System info
{ "runtime_cpu_arch", QSysInfo::currentCpuArchitecture() },
#ifdef __GLIBC__
{ "runtime_glibc_ver", gnu_get_libc_version() },
#endif
{ "sys_kernel_type", QSysInfo::kernelType() },
{ "sys_kernel_ver", QSysInfo::kernelVersion() },
{ "sys_product_type", QSysInfo::productType() },
{ "sys_product_ver", QSysInfo::productVersion() },
#ifdef Q_OS_MAC
{ "sys_hw_model", getSysctl("hw.model").value_or(u"(unknown)"_s) },
#endif
{ "$screen_dpi", std::round(display->physicalDotsPerInch()) }, { "$screen_dpi", std::round(display->physicalDotsPerInch()) },
{ "display", u"%1x%2"_s.arg(display->size().width()).arg(display->size().height()) }, { "display", u"%1x%2"_s.arg(display->size().width()).arg(display->size().height()) },
{ "ram", LLM::globalInstance()->systemTotalRAMInGB() }, { "ram", LLM::globalInstance()->systemTotalRAMInGB() },
{ "cpu", getCPUModel() }, { "cpu", getCPUModel() },
{ "cpu_supports_avx2", LLModel::Implementation::cpuSupportsAVX2() }, { "cpu_supports_avx2", LLModel::Implementation::cpuSupportsAVX2() },
// Datalake status
{ "datalake_active", mySettings->networkIsActive() }, { "datalake_active", mySettings->networkIsActive() },
}); });
sendIpify(); sendIpify();
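Almost all of the new startup fields come from Qt's own introspection rather than anything model-specific. A sketch of collecting the same kind of facts into a QVariantMap (collectEnvironmentFacts is an illustrative name; the key names mirror the event above):

#include <QLibraryInfo>
#include <QSysInfo>
#include <QVariant>

static QVariantMap collectEnvironmentFacts()
{
    QVariantMap props;
    props.insert("build_abi",        QSysInfo::buildAbi());             // e.g. "x86_64-little_endian-lp64"
    props.insert("build_cpu_arch",   QSysInfo::buildCpuArchitecture());
    props.insert("runtime_cpu_arch", QSysInfo::currentCpuArchitecture());
    props.insert("qt_version",       QLibraryInfo::version().toString());
    props.insert("qt_debug",         QLibraryInfo::isDebugBuild());
    props.insert("qt_shared",        QLibraryInfo::isSharedBuild());    // needs Qt >= 6.5
    props.insert("sys_kernel_type",  QSysInfo::kernelType());           // "linux", "darwin", "winnt", ...
    props.insert("sys_kernel_ver",   QSysInfo::kernelVersion());
    props.insert("sys_product_type", QSysInfo::productType());          // "ubuntu", "macos", "windows", ...
    props.insert("sys_product_ver",  QSysInfo::productVersion());
    return props;
}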
@ -321,7 +375,6 @@ void Network::trackEvent(const QString &ev, const QVariantMap &props)
if (!m_sendUsageStats) if (!m_sendUsageStats)
return; return;
Q_ASSERT(ChatListModel::globalInstance()->currentChat());
QJsonObject properties; QJsonObject properties;
properties.insert("token", MIXPANEL_TOKEN); properties.insert("token", MIXPANEL_TOKEN);

View File

@ -187,7 +187,12 @@ Rectangle {
visible: false visible: false
MyComboBox { MyComboBox {
id: comboSort id: comboSort
model: [qsTr("Default"), qsTr("Likes"), qsTr("Downloads"), qsTr("Recent")] model: ListModel {
ListElement { name: qsTr("Default") }
ListElement { name: qsTr("Likes") }
ListElement { name: qsTr("Downloads") }
ListElement { name: qsTr("Recent") }
}
currentIndex: ModelList.discoverSort currentIndex: ModelList.discoverSort
contentItem: Text { contentItem: Text {
anchors.horizontalCenter: parent.horizontalCenter anchors.horizontalCenter: parent.horizontalCenter
@ -207,7 +212,10 @@ Rectangle {
} }
MyComboBox { MyComboBox {
id: comboSortDirection id: comboSortDirection
model: [qsTr("Asc"), qsTr("Desc")] model: ListModel {
ListElement { name: qsTr("Asc") }
ListElement { name: qsTr("Desc") }
}
currentIndex: { currentIndex: {
if (ModelList.discoverSortDirection === 1) if (ModelList.discoverSortDirection === 1)
return 0 return 0
@ -235,7 +243,15 @@ Rectangle {
} }
MyComboBox { MyComboBox {
id: comboLimit id: comboLimit
model: ["5", "10", "20", "50", "100", qsTr("None")] model: ListModel {
ListElement { name: "5" }
ListElement { name: "10" }
ListElement { name: "20" }
ListElement { name: "50" }
ListElement { name: "100" }
ListElement { name: qsTr("None") }
}
currentIndex: { currentIndex: {
if (ModelList.discoverLimit === 5) if (ModelList.discoverLimit === 5)
return 0; return 0;

View File

@ -108,7 +108,11 @@ MySettingsTab {
Layout.fillWidth: false Layout.fillWidth: false
Layout.alignment: Qt.AlignRight Layout.alignment: Qt.AlignRight
// NOTE: indices match values of ChatTheme enum, keep them in sync // NOTE: indices match values of ChatTheme enum, keep them in sync
model: [qsTr("Light"), qsTr("Dark"), qsTr("LegacyDark")] model: ListModel {
ListElement { name: qsTr("Light") }
ListElement { name: qsTr("Dark") }
ListElement { name: qsTr("LegacyDark") }
}
Accessible.name: themeLabel.text Accessible.name: themeLabel.text
Accessible.description: themeLabel.helpText Accessible.description: themeLabel.helpText
function updateModel() { function updateModel() {
@ -143,7 +147,11 @@ MySettingsTab {
Layout.fillWidth: false Layout.fillWidth: false
Layout.alignment: Qt.AlignRight Layout.alignment: Qt.AlignRight
// NOTE: indices match values of FontSize enum, keep them in sync // NOTE: indices match values of FontSize enum, keep them in sync
model: [qsTr("Small"), qsTr("Medium"), qsTr("Large")] model: ListModel {
ListElement { name: qsTr("Small") }
ListElement { name: qsTr("Medium") }
ListElement { name: qsTr("Large") }
}
Accessible.name: fontLabel.text Accessible.name: fontLabel.text
Accessible.description: fontLabel.helpText Accessible.description: fontLabel.helpText
function updateModel() { function updateModel() {
@ -313,6 +321,12 @@ MySettingsTab {
defaultModelBox.updateModel() defaultModelBox.updateModel()
} }
} }
Connections {
target: MySettings
function onLanguageAndLocaleChanged() {
defaultModelBox.rebuildModel()
}
}
Connections { Connections {
target: ModelList target: ModelList
function onSelectableModelListChanged() { function onSelectableModelListChanged() {
@ -335,7 +349,11 @@ MySettingsTab {
Layout.maximumWidth: 400 Layout.maximumWidth: 400
Layout.alignment: Qt.AlignRight Layout.alignment: Qt.AlignRight
// NOTE: indices match values of SuggestionMode enum, keep them in sync // NOTE: indices match values of SuggestionMode enum, keep them in sync
model: [ qsTr("When chatting with LocalDocs"), qsTr("Whenever possible"), qsTr("Never") ] model: ListModel {
ListElement { name: qsTr("When chatting with LocalDocs") }
ListElement { name: qsTr("Whenever possible") }
ListElement { name: qsTr("Never") }
}
Accessible.name: suggestionModeLabel.text Accessible.name: suggestionModeLabel.text
Accessible.description: suggestionModeLabel.helpText Accessible.description: suggestionModeLabel.helpText
onActivated: { onActivated: {
@ -484,7 +502,7 @@ MySettingsTab {
} }
MySettingsLabel { MySettingsLabel {
id: serverChatLabel id: serverChatLabel
text: qsTr("Enable Local Server") text: qsTr("Enable Local API Server")
helpText: qsTr("Expose an OpenAI-Compatible server to localhost. WARNING: Results in increased resource usage.") helpText: qsTr("Expose an OpenAI-Compatible server to localhost. WARNING: Results in increased resource usage.")
Layout.row: 13 Layout.row: 13
Layout.column: 0 Layout.column: 0

View File

@ -834,7 +834,7 @@ Rectangle {
to: 360 to: 360
duration: 1000 duration: 1000
loops: Animation.Infinite loops: Animation.Infinite
running: currentResponse && (currentChat.responseInProgress || currentChat.isRecalc) running: currentResponse && (currentChat.responseInProgress || currentChat.restoringFromText)
} }
} }
} }
@ -867,13 +867,13 @@ Rectangle {
color: theme.mutedTextColor color: theme.mutedTextColor
} }
RowLayout { RowLayout {
visible: currentResponse && ((value === "" && currentChat.responseInProgress) || currentChat.isRecalc) visible: currentResponse && ((value === "" && currentChat.responseInProgress) || currentChat.restoringFromText)
Text { Text {
color: theme.mutedTextColor color: theme.mutedTextColor
font.pixelSize: theme.fontSizeLarger font.pixelSize: theme.fontSizeLarger
text: { text: {
if (currentChat.isRecalc) if (currentChat.restoringFromText)
return qsTr("recalculating context ..."); return qsTr("restoring from text ...");
switch (currentChat.responseState) { switch (currentChat.responseState) {
case Chat.ResponseStopped: return qsTr("response stopped ..."); case Chat.ResponseStopped: return qsTr("response stopped ...");
case Chat.LocalDocsRetrieval: return qsTr("retrieving localdocs: %1 ...").arg(currentChat.collectionList.join(", ")); case Chat.LocalDocsRetrieval: return qsTr("retrieving localdocs: %1 ...").arg(currentChat.collectionList.join(", "));
@ -1142,7 +1142,7 @@ Rectangle {
} }
Text { Text {
text: qsTr("%1 Sources").arg(consolidatedSources.length) text: qsTr("%n Source(s)", "", consolidatedSources.length)
padding: 0 padding: 0
font.pixelSize: theme.fontSizeLarge font.pixelSize: theme.fontSizeLarge
font.bold: true font.bold: true
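qsTr("%n Source(s)", "", consolidatedSources.length) uses Qt's numerus form: the trailing count both picks the singular/plural variant from the translation and replaces %n in the text, which the old "%1 Sources".arg(...) could not do. The C++ equivalent, with a hypothetical "ChatView" context and no translator installed:

#include <QCoreApplication>
#include <QString>

// With no translation installed Qt falls back to the source text and still
// substitutes %n, so sourcesLabel(3) returns "3 Source(s)"; a loaded .qm can
// supply distinct singular/plural (or other numerus) variants per language.
QString sourcesLabel(int count)
{
    return QCoreApplication::translate("ChatView", "%n Source(s)", nullptr, count);
}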
@ -1861,7 +1861,7 @@ Rectangle {
} }
} }
function sendMessage() { function sendMessage() {
if (textInput.text === "" || currentChat.responseInProgress || currentChat.isRecalc) if (textInput.text === "" || currentChat.responseInProgress || currentChat.restoringFromText)
return return
currentChat.stopGenerating() currentChat.stopGenerating()

View File

@ -76,8 +76,8 @@ Rectangle {
MyWelcomeButton { MyWelcomeButton {
Layout.fillWidth: true Layout.fillWidth: true
Layout.maximumWidth: 500 Layout.maximumWidth: 150 + 200 * theme.fontScale
Layout.preferredHeight: 150 Layout.preferredHeight: 40 + 90 * theme.fontScale
text: qsTr("Start Chatting") text: qsTr("Start Chatting")
description: qsTr("Chat with any LLM") description: qsTr("Chat with any LLM")
imageSource: "qrc:/gpt4all/icons/chat.svg" imageSource: "qrc:/gpt4all/icons/chat.svg"
@ -87,8 +87,8 @@ Rectangle {
} }
MyWelcomeButton { MyWelcomeButton {
Layout.fillWidth: true Layout.fillWidth: true
Layout.maximumWidth: 500 Layout.maximumWidth: 150 + 200 * theme.fontScale
Layout.preferredHeight: 150 Layout.preferredHeight: 40 + 90 * theme.fontScale
text: qsTr("LocalDocs") text: qsTr("LocalDocs")
description: qsTr("Chat with your local files") description: qsTr("Chat with your local files")
imageSource: "qrc:/gpt4all/icons/db.svg" imageSource: "qrc:/gpt4all/icons/db.svg"
@ -98,8 +98,8 @@ Rectangle {
} }
MyWelcomeButton { MyWelcomeButton {
Layout.fillWidth: true Layout.fillWidth: true
Layout.maximumWidth: 500 Layout.maximumWidth: 150 + 200 * theme.fontScale
Layout.preferredHeight: 150 Layout.preferredHeight: 40 + 90 * theme.fontScale
text: qsTr("Find Models") text: qsTr("Find Models")
description: qsTr("Explore and download models") description: qsTr("Explore and download models")
imageSource: "qrc:/gpt4all/icons/models.svg" imageSource: "qrc:/gpt4all/icons/models.svg"
@ -254,9 +254,9 @@ Rectangle {
spacing: 40 spacing: 40
MyFancyLink { MyFancyLink {
text: qsTr("GPT4All.io") text: qsTr("nomic.ai")
imageSource: "qrc:/gpt4all/icons/globe.svg" imageSource: "qrc:/gpt4all/icons/globe.svg"
onClicked: { Qt.openUrlExternally("https://gpt4all.io") } onClicked: { Qt.openUrlExternally("https://www.nomic.ai/gpt4all") }
rightPadding: 15 rightPadding: 15
} }
} }

View File

@ -163,7 +163,7 @@ MySettingsTab {
MySettingsLabel { MySettingsLabel {
id: deviceLabel id: deviceLabel
text: qsTr("Embeddings Device") text: qsTr("Embeddings Device")
helpText: qsTr('The compute device used for embeddings. "Auto" uses the CPU. Requires restart.') helpText: qsTr('The compute device used for embeddings. Requires restart.')
} }
MyComboBox { MyComboBox {
id: deviceBox id: deviceBox
@ -172,11 +172,18 @@ MySettingsTab {
Layout.maximumWidth: 400 Layout.maximumWidth: 400
Layout.fillWidth: false Layout.fillWidth: false
Layout.alignment: Qt.AlignRight Layout.alignment: Qt.AlignRight
model: MySettings.embeddingsDeviceList model: ListModel {
ListElement { text: qsTr("Application default") }
Component.onCompleted: {
MySettings.embeddingsDeviceList.forEach(d => append({"text": d}));
}
}
Accessible.name: deviceLabel.text Accessible.name: deviceLabel.text
Accessible.description: deviceLabel.helpText Accessible.description: deviceLabel.helpText
function updateModel() { function updateModel() {
deviceBox.currentIndex = deviceBox.indexOfValue(MySettings.localDocsEmbedDevice); var device = MySettings.localDocsEmbedDevice;
// This usage of 'Auto' should not be translated
deviceBox.currentIndex = device === "Auto" ? 0 : deviceBox.indexOfValue(device);
} }
Component.onCompleted: { Component.onCompleted: {
deviceBox.updateModel(); deviceBox.updateModel();
@ -188,7 +195,8 @@ MySettingsTab {
} }
} }
onActivated: { onActivated: {
MySettings.localDocsEmbedDevice = deviceBox.currentText; // This usage of 'Auto' should not be translated
MySettings.localDocsEmbedDevice = deviceBox.currentIndex === 0 ? "Auto" : deviceBox.currentText;
} }
} }
} }

View File

@ -456,7 +456,7 @@ MySettingsTab {
MySettingsLabel { MySettingsLabel {
id: topPLabel id: topPLabel
text: qsTr("Top-P") text: qsTr("Top-P")
helpText: qsTr("Nucleus Sampling factor. Lower -> more predicatable.") helpText: qsTr("Nucleus Sampling factor. Lower -> more predictable.")
Layout.row: 2 Layout.row: 2
Layout.column: 0 Layout.column: 0
Layout.maximumWidth: 300 * theme.fontScale Layout.maximumWidth: 300 * theme.fontScale

Some files were not shown because too many files have changed in this diff