server: improve correctness of request parsing and responses (#2929)

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Jared Van Bortel, 2024-09-09 10:48:57 -04:00, committed by GitHub
parent 1aae4ffe0a
commit 39005288c5
GPG Key ID: B5690EEEBB952194 (no known key found for this signature in database)
22 changed files with 790 additions and 328 deletions

View File

@@ -317,9 +317,9 @@ jobs:
 wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
 sudo dpkg -i cuda-keyring_1.1-1_all.deb
 packages=(
-bison build-essential ccache cuda-compiler-11-8 flex gperf libcublas-dev-11-8 libfontconfig1 libfreetype6
-libgl1-mesa-dev libmysqlclient21 libnvidia-compute-550-server libodbc2 libpq5 libwayland-dev libx11-6
-libx11-xcb1 libxcb-cursor0 libxcb-glx0 libxcb-icccm4 libxcb-image0 libxcb-keysyms1 libxcb-randr0
+bison build-essential ccache cuda-compiler-11-8 flex g++-12 gperf libcublas-dev-11-8 libfontconfig1
+libfreetype6 libgl1-mesa-dev libmysqlclient21 libnvidia-compute-550-server libodbc2 libpq5 libwayland-dev
+libx11-6 libx11-xcb1 libxcb-cursor0 libxcb-glx0 libxcb-icccm4 libxcb-image0 libxcb-keysyms1 libxcb-randr0
 libxcb-render-util0 libxcb-shape0 libxcb-shm0 libxcb-sync1 libxcb-util1 libxcb-xfixes0 libxcb-xinerama0
 libxcb-xkb1 libxcb1 libxext6 libxfixes3 libxi6 libxkbcommon-x11-0 libxkbcommon0 libxrender1 patchelf
 python3 vulkan-sdk
@@ -352,6 +352,8 @@ jobs:
 ~/Qt/Tools/CMake/bin/cmake \
 -S ../gpt4all-chat -B . \
 -DCMAKE_BUILD_TYPE=Release \
+-DCMAKE_C_COMPILER=gcc-12 \
+-DCMAKE_CXX_COMPILER=g++-12 \
 -DCMAKE_C_COMPILER_LAUNCHER=ccache \
 -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
 -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \
@@ -391,9 +393,9 @@ jobs:
 wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
 sudo dpkg -i cuda-keyring_1.1-1_all.deb
 packages=(
-bison build-essential ccache cuda-compiler-11-8 flex gperf libcublas-dev-11-8 libfontconfig1 libfreetype6
-libgl1-mesa-dev libmysqlclient21 libnvidia-compute-550-server libodbc2 libpq5 libwayland-dev libx11-6
-libx11-xcb1 libxcb-cursor0 libxcb-glx0 libxcb-icccm4 libxcb-image0 libxcb-keysyms1 libxcb-randr0
+bison build-essential ccache cuda-compiler-11-8 flex g++-12 gperf libcublas-dev-11-8 libfontconfig1
+libfreetype6 libgl1-mesa-dev libmysqlclient21 libnvidia-compute-550-server libodbc2 libpq5 libwayland-dev
+libx11-6 libx11-xcb1 libxcb-cursor0 libxcb-glx0 libxcb-icccm4 libxcb-image0 libxcb-keysyms1 libxcb-randr0
 libxcb-render-util0 libxcb-shape0 libxcb-shm0 libxcb-sync1 libxcb-util1 libxcb-xfixes0 libxcb-xinerama0
 libxcb-xkb1 libxcb1 libxext6 libxfixes3 libxi6 libxkbcommon-x11-0 libxkbcommon0 libxrender1 patchelf
 python3 vulkan-sdk
@@ -426,6 +428,8 @@ jobs:
 ~/Qt/Tools/CMake/bin/cmake \
 -S ../gpt4all-chat -B . \
 -DCMAKE_BUILD_TYPE=Release \
+-DCMAKE_C_COMPILER=gcc-12 \
+-DCMAKE_CXX_COMPILER=g++-12 \
 -DCMAKE_C_COMPILER_LAUNCHER=ccache \
 -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
 -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \
@@ -447,7 +451,7 @@ jobs:
 build-offline-chat-installer-windows:
 machine:
-image: 'windows-server-2019-vs2019:2022.08.1'
+image: windows-server-2022-gui:current
 resource_class: windows.large
 shell: powershell.exe -ExecutionPolicy Bypass
 steps:
@@ -538,7 +542,7 @@ jobs:
 sign-offline-chat-installer-windows:
 machine:
-image: 'windows-server-2019-vs2019:2022.08.1'
+image: windows-server-2022-gui:current
 resource_class: windows.large
 shell: powershell.exe -ExecutionPolicy Bypass
 steps:
@@ -568,7 +572,7 @@ jobs:
 build-online-chat-installer-windows:
 machine:
-image: 'windows-server-2019-vs2019:2022.08.1'
+image: windows-server-2022-gui:current
 resource_class: windows.large
 shell: powershell.exe -ExecutionPolicy Bypass
 steps:
@@ -666,7 +670,7 @@ jobs:
 sign-online-chat-installer-windows:
 machine:
-image: 'windows-server-2019-vs2019:2022.08.1'
+image: windows-server-2022-gui:current
 resource_class: windows.large
 shell: powershell.exe -ExecutionPolicy Bypass
 steps:
@@ -720,9 +724,9 @@ jobs:
 wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
 sudo dpkg -i cuda-keyring_1.1-1_all.deb
 packages=(
-bison build-essential ccache cuda-compiler-11-8 flex gperf libcublas-dev-11-8 libfontconfig1 libfreetype6
-libgl1-mesa-dev libmysqlclient21 libnvidia-compute-550-server libodbc2 libpq5 libwayland-dev libx11-6
-libx11-xcb1 libxcb-cursor0 libxcb-glx0 libxcb-icccm4 libxcb-image0 libxcb-keysyms1 libxcb-randr0
+bison build-essential ccache cuda-compiler-11-8 flex g++-12 gperf libcublas-dev-11-8 libfontconfig1
+libfreetype6 libgl1-mesa-dev libmysqlclient21 libnvidia-compute-550-server libodbc2 libpq5 libwayland-dev
+libx11-6 libx11-xcb1 libxcb-cursor0 libxcb-glx0 libxcb-icccm4 libxcb-image0 libxcb-keysyms1 libxcb-randr0
 libxcb-render-util0 libxcb-shape0 libxcb-shm0 libxcb-sync1 libxcb-util1 libxcb-xfixes0 libxcb-xinerama0
 libxcb-xkb1 libxcb1 libxext6 libxfixes3 libxi6 libxkbcommon-x11-0 libxkbcommon0 libxrender1 python3
 vulkan-sdk
@@ -744,6 +748,8 @@ jobs:
 ~/Qt/Tools/CMake/bin/cmake \
 -S gpt4all-chat -B build \
 -DCMAKE_BUILD_TYPE=Release \
+-DCMAKE_C_COMPILER=gcc-12 \
+-DCMAKE_CXX_COMPILER=g++-12 \
 -DCMAKE_C_COMPILER_LAUNCHER=ccache \
 -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
 -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \
@@ -758,7 +764,7 @@ jobs:
 build-gpt4all-chat-windows:
 machine:
-image: 'windows-server-2019-vs2019:2022.08.1'
+image: windows-server-2022-gui:current
 resource_class: windows.large
 shell: powershell.exe -ExecutionPolicy Bypass
 steps:
@@ -864,8 +870,8 @@
 paths:
 - ../.ccache
 build-ts-docs:
 docker:
 - image: cimg/base:stable
 steps:
 - checkout
@@ -887,7 +893,7 @@
 docker:
 - image: circleci/python:3.8
 steps:
 - checkout
 - run:
 name: Install dependencies
 command: |
@@ -928,7 +934,8 @@ jobs:
 wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
 sudo dpkg -i cuda-keyring_1.1-1_all.deb
 packages=(
-build-essential ccache cmake cuda-compiler-11-8 libcublas-dev-11-8 libnvidia-compute-550-server vulkan-sdk
+build-essential ccache cmake cuda-compiler-11-8 g++-12 libcublas-dev-11-8 libnvidia-compute-550-server
+vulkan-sdk
 )
 sudo apt-get update
 sudo apt-get install -y "${packages[@]}"
@@ -942,6 +949,8 @@ jobs:
 cd gpt4all-backend
 cmake -B build \
 -DCMAKE_BUILD_TYPE=Release \
+-DCMAKE_C_COMPILER=gcc-12 \
+-DCMAKE_CXX_COMPILER=g++-12 \
 -DCMAKE_C_COMPILER_LAUNCHER=ccache \
 -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
 -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \
@@ -1014,7 +1023,7 @@ jobs:
 build-py-windows:
 machine:
-image: 'windows-server-2019-vs2019:2022.08.1'
+image: windows-server-2022-gui:current
 resource_class: windows.large
 shell: powershell.exe -ExecutionPolicy Bypass
 steps:
@@ -1118,11 +1127,12 @@
 name: Install dependencies
 command: |
 wget -qO- https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo tee /etc/apt/trusted.gpg.d/lunarg.asc
 sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list http://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
 wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
 sudo dpkg -i cuda-keyring_1.1-1_all.deb
 packages=(
-build-essential ccache cmake cuda-compiler-11-8 libcublas-dev-11-8 libnvidia-compute-550-server vulkan-sdk
+build-essential ccache cmake cuda-compiler-11-8 g++-12 libcublas-dev-11-8 libnvidia-compute-550-server
+vulkan-sdk
 )
 sudo apt-get update
 sudo apt-get install -y "${packages[@]}"
@@ -1135,6 +1145,9 @@ jobs:
 mkdir -p runtimes/build
 cd runtimes/build
 cmake ../.. \
+-DCMAKE_BUILD_TYPE=Release \
+-DCMAKE_C_COMPILER=gcc-12 \
+-DCMAKE_C_COMPILER=g++-12 \
 -DCMAKE_BUILD_TYPE=Release \
 -DCMAKE_C_COMPILER_LAUNCHER=ccache \
 -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
@@ -1204,7 +1217,7 @@ jobs:
 build-bindings-backend-windows:
 machine:
-image: 'windows-server-2022-gui:2023.03.1'
+image: windows-server-2022-gui:current
 resource_class: windows.large
 shell: powershell.exe -ExecutionPolicy Bypass
 steps:
@@ -1230,7 +1243,7 @@ jobs:
 - run:
 name: Install dependencies
 command: |
 choco install -y ccache cmake ninja --installargs 'ADD_CMAKE_TO_PATH=System'
 - run:
 name: Build Libraries
 command: |
@@ -1263,8 +1276,8 @@ jobs:
 paths:
 - runtimes/win-x64_msvc/*.dll
 build-nodejs-linux:
 docker:
 - image: cimg/base:stable
 steps:
 - checkout
@@ -1280,10 +1293,10 @@ jobs:
 pkg-manager: yarn
 override-ci-command: yarn install
 - run:
 command: |
 cd gpt4all-bindings/typescript
 yarn prebuildify -t 18.16.0 --napi
 - run:
 command: |
 mkdir -p gpt4all-backend/prebuilds/linux-x64
 mkdir -p gpt4all-backend/runtimes/linux-x64
@@ -1292,10 +1305,10 @@ jobs:
 - persist_to_workspace:
 root: gpt4all-backend
 paths:
 - prebuilds/linux-x64/*.node
 - runtimes/linux-x64/*-*.so
 build-nodejs-macos:
 macos:
 xcode: 15.4.0
 steps:
@@ -1312,12 +1325,12 @@ jobs:
 pkg-manager: yarn
 override-ci-command: yarn install
 - run:
 command: |
 cd gpt4all-bindings/typescript
 yarn prebuildify -t 18.16.0 --napi
 - run:
 name: "Persisting all necessary things to workspace"
 command: |
 mkdir -p gpt4all-backend/prebuilds/darwin-x64
 mkdir -p gpt4all-backend/runtimes/darwin
 cp /tmp/gpt4all-backend/runtimes/osx-x64/*-*.* gpt4all-backend/runtimes/darwin
@@ -1328,7 +1341,7 @@ jobs:
 - prebuilds/darwin-x64/*.node
 - runtimes/darwin/*-*.*
 build-nodejs-windows:
 executor:
 name: win/default
 size: large
@@ -1342,29 +1355,29 @@ jobs:
 command: wget https://nodejs.org/dist/v18.16.0/node-v18.16.0-x86.msi -P C:\Users\circleci\Downloads\
 shell: cmd.exe
 - run: MsiExec.exe /i C:\Users\circleci\Downloads\node-v18.16.0-x86.msi /qn
 - run:
 command: |
 Start-Process powershell -verb runAs -Args "-start GeneralProfile"
 nvm install 18.16.0
 nvm use 18.16.0
 - run: node --version
 - run: corepack enable
 - run:
 command: |
 npm install -g yarn
 cd gpt4all-bindings/typescript
 yarn install
 - run:
 command: |
 cd gpt4all-bindings/typescript
 yarn prebuildify -t 18.16.0 --napi
 - run:
 command: |
 mkdir -p gpt4all-backend/prebuilds/win32-x64
 mkdir -p gpt4all-backend/runtimes/win32-x64
 cp /tmp/gpt4all-backend/runtimes/win-x64_msvc/*-*.dll gpt4all-backend/runtimes/win32-x64
 cp gpt4all-bindings/typescript/prebuilds/win32-x64/*.node gpt4all-backend/prebuilds/win32-x64
 - persist_to_workspace:
 root: gpt4all-backend
 paths:
@@ -1372,7 +1385,7 @@ jobs:
 - runtimes/win32-x64/*-*.dll
 prepare-npm-pkg:
 docker:
 - image: cimg/base:stable
 steps:
 - attach_workspace:
@@ -1383,19 +1396,19 @@ jobs:
 node-version: "18.16"
 - run: node --version
 - run: corepack enable
 - run:
 command: |
 cd gpt4all-bindings/typescript
 # excluding llmodel. nodejs bindings dont need llmodel.dll
 mkdir -p runtimes/win32-x64/native
 mkdir -p prebuilds/win32-x64/
 cp /tmp/gpt4all-backend/runtimes/win-x64_msvc/*-*.dll runtimes/win32-x64/native/
 cp /tmp/gpt4all-backend/prebuilds/win32-x64/*.node prebuilds/win32-x64/
 mkdir -p runtimes/linux-x64/native
 mkdir -p prebuilds/linux-x64/
 cp /tmp/gpt4all-backend/runtimes/linux-x64/*-*.so runtimes/linux-x64/native/
 cp /tmp/gpt4all-backend/prebuilds/linux-x64/*.node prebuilds/linux-x64/
 # darwin has univeral runtime libraries
 mkdir -p runtimes/darwin/native
@@ -1403,22 +1416,22 @@ jobs:
 cp /tmp/gpt4all-backend/runtimes/darwin/*-*.* runtimes/darwin/native/
 cp /tmp/gpt4all-backend/prebuilds/darwin-x64/*.node prebuilds/darwin-x64/
 # Fallback build if user is not on above prebuilds
 mv -f binding.ci.gyp binding.gyp
 mkdir gpt4all-backend
 cd ../../gpt4all-backend
 mv llmodel.h llmodel.cpp llmodel_c.cpp llmodel_c.h sysinfo.h dlhandle.h ../gpt4all-bindings/typescript/gpt4all-backend/
 # Test install
 - node/install-packages:
 app-dir: gpt4all-bindings/typescript
 pkg-manager: yarn
 override-ci-command: yarn install
 - run:
 command: |
 cd gpt4all-bindings/typescript
 yarn run test
 - run:
@@ -1552,7 +1565,7 @@ workflows:
 - build-py-linux
 - build-py-macos
 build-bindings:
 when:
 or:
 - << pipeline.parameters.run-all-workflows >>
 - << pipeline.parameters.run-python-workflow >>
@@ -1585,8 +1598,8 @@
 requires:
 - hold
 # NodeJs Jobs
 - prepare-npm-pkg:
 filters:
 branches:
 only:

.gitmodules vendored
View File

@@ -8,3 +8,6 @@
 [submodule "gpt4all-chat/deps/SingleApplication"]
 path = gpt4all-chat/deps/SingleApplication
 url = https://github.com/nomic-ai/SingleApplication.git
+[submodule "gpt4all-chat/deps/fmt"]
+path = gpt4all-chat/deps/fmt
+url = https://github.com/fmtlib/fmt.git

View File

@@ -33,7 +33,7 @@ set(LLMODEL_VERSION_PATCH 0)
 set(LLMODEL_VERSION "${LLMODEL_VERSION_MAJOR}.${LLMODEL_VERSION_MINOR}.${LLMODEL_VERSION_PATCH}")
 project(llmodel VERSION ${LLMODEL_VERSION} LANGUAGES CXX C)
-set(CMAKE_CXX_STANDARD 20)
+set(CMAKE_CXX_STANDARD 23)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
 set(BUILD_SHARED_LIBS ON)
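The C++ standard bump from 20 to 23 is load-bearing for the rest of this change set: later hunks rely on C++23's monadic std::optional operations, for example the fakeReply.transform(std::mem_fn(&QString::toStdString)) call in chatllm.cpp further down. A standalone sketch of that pattern (illustrative only, not taken from the gpt4all sources):

    // std::optional::transform (a C++23 library feature) maps a function over the
    // contained value when present and propagates std::nullopt otherwise, which
    // replaces manual "if (ptr) ... else nullptr" plumbing at call sites.
    #include <iostream>
    #include <optional>
    #include <string>

    static std::string addPrefix(const std::string &s) { return "fake: " + s; }

    int main() {
        std::optional<std::string> present = "spoofed reply";
        std::optional<std::string> absent;

        std::cout << present.transform(addPrefix).value_or("(generate)") << '\n';
        std::cout << absent.transform(addPrefix).value_or("(generate)") << '\n';
    }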

@@ -1 +1 @@
-Subproject commit 443665aec4721ecf57df8162e7e093a0cd674a76
+Subproject commit ced74fbad4b258507f3ec06e77eec9445583511a

View File

@@ -162,7 +162,7 @@ public:
 bool allowContextShift,
 PromptContext &ctx,
 bool special = false,
-std::string *fakeReply = nullptr);
+std::optional<std::string_view> fakeReply = {});
 using EmbedCancelCallback = bool(unsigned *batchSizes, unsigned nBatch, const char *backend);
@@ -212,7 +212,7 @@ public:
 protected:
 // These are pure virtual because subclasses need to implement as the default implementation of
 // 'prompt' above calls these functions
-virtual std::vector<Token> tokenize(PromptContext &ctx, const std::string &str, bool special = false) = 0;
+virtual std::vector<Token> tokenize(PromptContext &ctx, std::string_view str, bool special = false) = 0;
 virtual bool isSpecialToken(Token id) const = 0;
 virtual std::string tokenToString(Token id) const = 0;
 virtual Token sampleToken(PromptContext &ctx) const = 0;
@@ -249,7 +249,8 @@ protected:
 std::function<bool(int32_t, const std::string&)> responseCallback,
 bool allowContextShift,
 PromptContext &promptCtx,
-std::vector<Token> embd_inp);
+std::vector<Token> embd_inp,
+bool isResponse = false);
 void generateResponse(std::function<bool(int32_t, const std::string&)> responseCallback,
 bool allowContextShift,
 PromptContext &promptCtx);
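For callers of LLModel::prompt(), the visible effect of this signature change is that "no fake reply" is now spelled {} or std::nullopt rather than nullptr, and a spoofed reply no longer has to be a std::string owned by the caller. A self-contained sketch of the old and new parameter shapes, using toy functions rather than the actual LLModel API:

    #include <iostream>
    #include <optional>
    #include <string>
    #include <string_view>

    // Old shape: nullptr is the "absent" sentinel, and a caller holding a QString,
    // QByteArray, or literal must first materialize (and keep alive) a std::string.
    void promptOld(const std::string *fakeReply = nullptr) {
        std::cout << (fakeReply ? *fakeReply : std::string("(generate)")) << '\n';
    }

    // New shape: absence is explicit in the type, any contiguous character buffer
    // binds as a view without copying, and the default keeps common calls short.
    void promptNew(std::optional<std::string_view> fakeReply = {}) {
        std::cout << (fakeReply ? *fakeReply : std::string_view("(generate)")) << '\n';
    }

    int main() {
        promptOld();                 // generate a real response
        promptNew();                 // same, without a sentinel pointer
        promptNew("spoofed reply");  // spoof: a literal binds directly as a view
    }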

View File

@@ -536,13 +536,13 @@ size_t LLamaModel::restoreState(const uint8_t *src)
 return llama_set_state_data(d_ptr->ctx, const_cast<uint8_t*>(src));
 }
-std::vector<LLModel::Token> LLamaModel::tokenize(PromptContext &ctx, const std::string &str, bool special)
+std::vector<LLModel::Token> LLamaModel::tokenize(PromptContext &ctx, std::string_view str, bool special)
 {
 bool atStart = m_tokenize_last_token == -1;
 bool insertSpace = atStart || isSpecialToken(m_tokenize_last_token);
 std::vector<LLModel::Token> fres(str.length() + 4);
 int32_t fres_len = llama_tokenize_gpt4all(
-d_ptr->model, str.c_str(), str.length(), fres.data(), fres.size(), /*add_special*/ atStart,
+d_ptr->model, str.data(), str.length(), fres.data(), fres.size(), /*add_special*/ atStart,
 /*parse_special*/ special, /*insert_space*/ insertSpace
 );
 fres.resize(fres_len);

View File

@@ -8,6 +8,7 @@
 #include <memory>
 #include <string>
+#include <string_view>
 #include <vector>
 struct LLamaPrivate;
@@ -52,7 +53,7 @@ private:
 bool m_supportsCompletion = false;
 protected:
-std::vector<Token> tokenize(PromptContext &ctx, const std::string &str, bool special) override;
+std::vector<Token> tokenize(PromptContext &ctx, std::string_view str, bool special) override;
 bool isSpecialToken(Token id) const override;
 std::string tokenToString(Token id) const override;
 Token sampleToken(PromptContext &ctx) const override;

View File

@@ -12,6 +12,7 @@
 #include <memory>
 #include <optional>
 #include <string>
+#include <string_view>
 #include <vector>
 struct LLModelWrapper {
@@ -130,13 +131,10 @@ void llmodel_prompt(llmodel_model model, const char *prompt,
 wrapper->promptContext.repeat_last_n = ctx->repeat_last_n;
 wrapper->promptContext.contextErase = ctx->context_erase;
-std::string fake_reply_str;
-if (fake_reply) { fake_reply_str = fake_reply; }
-auto *fake_reply_p = fake_reply ? &fake_reply_str : nullptr;
 // Call the C++ prompt method
 wrapper->llModel->prompt(prompt, prompt_template, prompt_callback, response_func, allow_context_shift,
-wrapper->promptContext, special, fake_reply_p);
+wrapper->promptContext, special,
+fake_reply ? std::make_optional<std::string_view>(fake_reply) : std::nullopt);
 // Update the C context by giving access to the wrappers raw pointers to std::vector data
 // which involves no copies

View File

@@ -11,6 +11,7 @@
 #include <sstream>
 #include <stdexcept>
 #include <string>
+#include <string_view>
 #include <vector>
 namespace ranges = std::ranges;
@@ -45,7 +46,7 @@ void LLModel::prompt(const std::string &prompt,
 bool allowContextShift,
 PromptContext &promptCtx,
 bool special,
-std::string *fakeReply)
+std::optional<std::string_view> fakeReply)
 {
 if (!isModelLoaded()) {
 std::cerr << implementation().modelType() << " ERROR: prompt won't work with an unloaded model!\n";
@@ -129,11 +130,11 @@ void LLModel::prompt(const std::string &prompt,
 return; // error
 // decode the assistant's reply, either generated or spoofed
-if (fakeReply == nullptr) {
+if (!fakeReply) {
 generateResponse(responseCallback, allowContextShift, promptCtx);
 } else {
 embd_inp = tokenize(promptCtx, *fakeReply, false);
-if (!decodePrompt(promptCallback, responseCallback, allowContextShift, promptCtx, embd_inp))
+if (!decodePrompt(promptCallback, responseCallback, allowContextShift, promptCtx, embd_inp, true))
 return; // error
 }
@@ -157,7 +158,8 @@ bool LLModel::decodePrompt(std::function<bool(int32_t)> promptCallback,
 std::function<bool(int32_t, const std::string&)> responseCallback,
 bool allowContextShift,
 PromptContext &promptCtx,
-std::vector<Token> embd_inp) {
+std::vector<Token> embd_inp,
+bool isResponse) {
 if ((int) embd_inp.size() > promptCtx.n_ctx - 4) {
 responseCallback(-1, "ERROR: The prompt size exceeds the context window size and cannot be processed.");
 std::cerr << implementation().modelType() << " ERROR: The prompt is " << embd_inp.size() <<
@@ -196,7 +198,9 @@ bool LLModel::decodePrompt(std::function<bool(int32_t)> promptCallback,
 for (size_t t = 0; t < tokens; ++t) {
 promptCtx.tokens.push_back(batch.at(t));
 promptCtx.n_past += 1;
-if (!promptCallback(batch.at(t)))
+Token tok = batch.at(t);
+bool res = isResponse ? responseCallback(tok, tokenToString(tok)) : promptCallback(tok);
+if (!res)
 return false;
 }
 i = batch_end;
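The new isResponse flag decides which callback receives tokens while a spoofed reply is being decoded: prompt tokens keep going to promptCallback, but tokens of a fake reply are surfaced through responseCallback so they are recorded as response text. A self-contained sketch of that dispatch with toy types (not the gpt4all implementation):

    #include <cstdint>
    #include <functional>
    #include <iostream>
    #include <string>
    #include <vector>

    using Token = int32_t;

    static std::string tokenToString(Token t) { return "tok" + std::to_string(t) + " "; }

    // Mirrors the dispatch added to decodePrompt(): the flag selects the callback,
    // and a false return from either callback aborts decoding.
    bool feedTokens(const std::vector<Token> &batch,
                    const std::function<bool(Token)> &promptCallback,
                    const std::function<bool(Token, const std::string &)> &responseCallback,
                    bool isResponse) {
        for (Token tok : batch) {
            bool keepGoing = isResponse ? responseCallback(tok, tokenToString(tok))
                                        : promptCallback(tok);
            if (!keepGoing)
                return false;
        }
        return true;
    }

    int main() {
        auto onPrompt = [](Token) { return true; };
        auto onResponse = [](Token, const std::string &s) { std::cout << s; return true; };
        feedTokens({1, 2, 3}, onPrompt, onResponse, /*isResponse*/ true);
        std::cout << '\n';
    }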

View File

@@ -26,6 +26,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
 - Fix a typo in Model Settings (by [@3Simplex](https://github.com/3Simplex) in [#2916](https://github.com/nomic-ai/gpt4all/pull/2916))
 - Fix the antenna icon tooltip when using the local server ([#2922](https://github.com/nomic-ai/gpt4all/pull/2922))
 - Fix a few issues with locating files and handling errors when loading remote models on startup ([#2875](https://github.com/nomic-ai/gpt4all/pull/2875))
+- Significantly improve API server request parsing and response correctness ([#2929](https://github.com/nomic-ai/gpt4all/pull/2929))
 ## [3.2.1] - 2024-08-13

View File

@@ -1,7 +1,7 @@
 cmake_minimum_required(VERSION 3.16)
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
-set(CMAKE_CXX_STANDARD 20)
+set(CMAKE_CXX_STANDARD 23)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 if(APPLE)
@@ -64,6 +64,12 @@ message(STATUS "Qt 6 root directory: ${Qt6_ROOT_DIR}")
 set (CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
+set(FMT_INSTALL OFF)
+set(BUILD_SHARED_LIBS_SAVED "${BUILD_SHARED_LIBS}")
+set(BUILD_SHARED_LIBS OFF)
+add_subdirectory(deps/fmt)
+set(BUILD_SHARED_LIBS "${BUILD_SHARED_LIBS_SAVED}")
 add_subdirectory(../gpt4all-backend llmodel)
 set(CHAT_EXE_RESOURCES)
@@ -240,7 +246,7 @@ else()
 PRIVATE Qt6::Quick Qt6::Svg Qt6::HttpServer Qt6::Sql Qt6::Pdf)
 endif()
 target_link_libraries(chat
-PRIVATE llmodel SingleApplication)
+PRIVATE llmodel SingleApplication fmt::fmt)
 # -- install --
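fmt is vendored as a submodule, forced to build as a static library by temporarily overriding BUILD_SHARED_LIBS, and linked into the chat target. The call sites live in server.cpp, whose diff is suppressed below, so the following is only a generic illustration of what linking fmt::fmt provides, not the actual usage:

    #include <fmt/format.h>
    #include <iostream>
    #include <string>

    int main() {
        // Type-safe formatting into a std::string.
        std::string msg = fmt::format("model {} not found (requested {} completions)",
                                      "example-model", 2);
        std::cout << msg << '\n';
    }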

gpt4all-chat/deps/fmt Submodule

@@ -0,0 +1 @@
+Subproject commit 0c9fce2ffefecfdce794e1859584e25877b7b592

View File

@@ -239,16 +239,17 @@ void Chat::newPromptResponsePair(const QString &prompt)
 resetResponseState();
 m_chatModel->updateCurrentResponse(m_chatModel->count() - 1, false);
 m_chatModel->appendPrompt("Prompt: ", prompt);
-m_chatModel->appendResponse("Response: ", prompt);
+m_chatModel->appendResponse("Response: ", QString());
 emit resetResponseRequested();
 }
+// the server needs to block until response is reset, so it calls resetResponse on its own m_llmThread
 void Chat::serverNewPromptResponsePair(const QString &prompt)
 {
 resetResponseState();
 m_chatModel->updateCurrentResponse(m_chatModel->count() - 1, false);
 m_chatModel->appendPrompt("Prompt: ", prompt);
-m_chatModel->appendResponse("Response: ", prompt);
+m_chatModel->appendResponse("Response: ", QString());
 }
 bool Chat::restoringFromText() const

View File

@@ -93,7 +93,7 @@ void ChatAPI::prompt(const std::string &prompt,
 bool allowContextShift,
 PromptContext &promptCtx,
 bool special,
-std::string *fakeReply) {
+std::optional<std::string_view> fakeReply) {
 Q_UNUSED(promptCallback);
 Q_UNUSED(allowContextShift);
@@ -121,7 +121,7 @@ void ChatAPI::prompt(const std::string &prompt,
 if (fakeReply) {
 promptCtx.n_past += 1;
 m_context.append(formattedPrompt);
-m_context.append(QString::fromStdString(*fakeReply));
+m_context.append(QString::fromUtf8(fakeReply->data(), fakeReply->size()));
 return;
 }
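Because fakeReply is now a non-owning std::string_view, the conversion above switches from QString::fromStdString to the pointer-plus-length QString::fromUtf8 overload. The same conversion in isolation (the helper name is ours, not from the gpt4all sources):

    #include <QDebug>
    #include <QString>
    #include <string_view>

    // Decodes the viewed bytes as UTF-8 into a new QString without creating an
    // intermediate std::string along the way.
    static QString viewToQString(std::string_view sv)
    {
        return QString::fromUtf8(sv.data(), static_cast<qsizetype>(sv.size()));
    }

    int main()
    {
        qDebug() << viewToQString("spoofed reply");
        return 0;
    }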

View File

@@ -12,9 +12,10 @@
 #include <cstddef>
 #include <cstdint>
-#include <stdexcept>
 #include <functional>
+#include <stdexcept>
 #include <string>
+#include <string_view>
 #include <vector>
 class QNetworkAccessManager;
@@ -72,7 +73,7 @@ public:
 bool allowContextShift,
 PromptContext &ctx,
 bool special,
-std::string *fakeReply) override;
+std::optional<std::string_view> fakeReply) override;
 void setThreadCount(int32_t n_threads) override;
 int32_t threadCount() const override;
@@ -97,7 +98,7 @@ protected:
 // them as they are only called from the default implementation of 'prompt' which we override and
 // completely replace
-std::vector<Token> tokenize(PromptContext &ctx, const std::string &str, bool special) override
+std::vector<Token> tokenize(PromptContext &ctx, std::string_view str, bool special) override
 {
 (void)ctx;
 (void)str;

View File

@@ -626,16 +626,16 @@ void ChatLLM::regenerateResponse()
 m_ctx.tokens.erase(m_ctx.tokens.end() - m_promptResponseTokens, m_ctx.tokens.end());
 m_promptResponseTokens = 0;
 m_promptTokens = 0;
-m_response = std::string();
-emit responseChanged(QString::fromStdString(m_response));
+m_response = m_trimmedResponse = std::string();
+emit responseChanged(QString::fromStdString(m_trimmedResponse));
 }
 void ChatLLM::resetResponse()
 {
 m_promptTokens = 0;
 m_promptResponseTokens = 0;
-m_response = std::string();
-emit responseChanged(QString::fromStdString(m_response));
+m_response = m_trimmedResponse = std::string();
+emit responseChanged(QString::fromStdString(m_trimmedResponse));
 }
 void ChatLLM::resetContext()
@@ -645,9 +645,12 @@ void ChatLLM::resetContext()
 m_ctx = LLModel::PromptContext();
 }
-QString ChatLLM::response() const
+QString ChatLLM::response(bool trim) const
 {
-return QString::fromStdString(remove_leading_whitespace(m_response));
+std::string resp = m_response;
+if (trim)
+resp = remove_leading_whitespace(resp);
+return QString::fromStdString(resp);
 }
 ModelInfo ChatLLM::modelInfo() const
@@ -705,7 +708,8 @@ bool ChatLLM::handleResponse(int32_t token, const std::string &response)
 // check for error
 if (token < 0) {
 m_response.append(response);
-emit responseChanged(QString::fromStdString(remove_leading_whitespace(m_response)));
+m_trimmedResponse = remove_leading_whitespace(m_response);
+emit responseChanged(QString::fromStdString(m_trimmedResponse));
 return false;
 }
@@ -715,7 +719,8 @@ bool ChatLLM::handleResponse(int32_t token, const std::string &response)
 m_timer->inc();
 Q_ASSERT(!response.empty());
 m_response.append(response);
-emit responseChanged(QString::fromStdString(remove_leading_whitespace(m_response)));
+m_trimmedResponse = remove_leading_whitespace(m_response);
+emit responseChanged(QString::fromStdString(m_trimmedResponse));
 return !m_stopGenerating;
 }
@@ -741,7 +746,7 @@ bool ChatLLM::prompt(const QList<QString> &collectionList, const QString &prompt
 bool ChatLLM::promptInternal(const QList<QString> &collectionList, const QString &prompt, const QString &promptTemplate,
 int32_t n_predict, int32_t top_k, float top_p, float min_p, float temp, int32_t n_batch, float repeat_penalty,
-int32_t repeat_penalty_tokens)
+int32_t repeat_penalty_tokens, std::optional<QString> fakeReply)
 {
 if (!isModelLoaded())
 return false;
@@ -751,7 +756,7 @@ bool ChatLLM::promptInternal(const QList<QString> &collectionList, const QString
 QList<ResultInfo> databaseResults;
 const int retrievalSize = MySettings::globalInstance()->localDocsRetrievalSize();
-if (!collectionList.isEmpty()) {
+if (!fakeReply && !collectionList.isEmpty()) {
 emit requestRetrieveFromDB(collectionList, prompt, retrievalSize, &databaseResults); // blocks
 emit databaseResultsChanged(databaseResults);
 }
@@ -797,7 +802,8 @@ bool ChatLLM::promptInternal(const QList<QString> &collectionList, const QString
 m_ctx.n_predict = old_n_predict; // now we are ready for a response
 }
 m_llModelInfo.model->prompt(prompt.toStdString(), promptTemplate.toStdString(), promptFunc, responseFunc,
-/*allowContextShift*/ true, m_ctx);
+/*allowContextShift*/ true, m_ctx, false,
+fakeReply.transform(std::mem_fn(&QString::toStdString)));
 #if defined(DEBUG)
 printf("\n");
 fflush(stdout);
@@ -805,9 +811,9 @@ bool ChatLLM::promptInternal(const QList<QString> &collectionList, const QString
 m_timer->stop();
 qint64 elapsed = totalTime.elapsed();
 std::string trimmed = trim_whitespace(m_response);
-if (trimmed != m_response) {
-m_response = trimmed;
-emit responseChanged(QString::fromStdString(m_response));
+if (trimmed != m_trimmedResponse) {
+m_trimmedResponse = trimmed;
+emit responseChanged(QString::fromStdString(m_trimmedResponse));
 }
 SuggestionMode mode = MySettings::globalInstance()->suggestionMode();
@@ -1078,6 +1084,7 @@ bool ChatLLM::deserialize(QDataStream &stream, int version, bool deserializeKV,
 QString response;
 stream >> response;
 m_response = response.toStdString();
+m_trimmedResponse = trim_whitespace(m_response);
 QString nameResponse;
 stream >> nameResponse;
 m_nameResponse = nameResponse.toStdString();
@@ -1306,10 +1313,9 @@ void ChatLLM::processRestoreStateFromText()
 auto &response = *it++;
 Q_ASSERT(response.first != "Prompt: ");
-auto responseText = response.second.toStdString();
 m_llModelInfo.model->prompt(prompt.second.toStdString(), promptTemplate.toStdString(), promptFunc, nullptr,
-/*allowContextShift*/ true, m_ctx, false, &responseText);
+/*allowContextShift*/ true, m_ctx, false, response.second.toUtf8().constData());
 }
 if (!m_stopGenerating) {
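The m_trimmedResponse member caches the left-trimmed text that the GUI displays, while m_response keeps exactly what the model produced; response(trim) exposes both, presumably so the API server can return the untrimmed text (server.cpp is suppressed below). A standalone sketch of that raw-versus-trimmed split, with a stand-in for the trimming helper:

    #include <cctype>
    #include <iostream>
    #include <string>

    // Stand-in for the project's remove_leading_whitespace() utility.
    static std::string remove_leading_whitespace(const std::string &s) {
        size_t i = 0;
        while (i < s.size() && std::isspace(static_cast<unsigned char>(s[i])))
            ++i;
        return s.substr(i);
    }

    struct ResponseHolder {
        std::string raw;      // like m_response: exactly what the model produced
        std::string trimmed;  // like m_trimmedResponse: cached for UI updates

        void append(const std::string &piece) {
            raw += piece;
            trimmed = remove_leading_whitespace(raw); // recomputed once per token
        }
        // trim defaults to true for the GUI; a server-side caller would pass false.
        std::string response(bool trim = true) const { return trim ? trimmed : raw; }
    };

    int main() {
        ResponseHolder r;
        r.append(" ");
        r.append(" Hello");
        std::cout << '[' << r.response() << "] vs [" << r.response(false) << "]\n";
    }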

View File

@@ -116,7 +116,7 @@ public:
 void setForceUnloadModel(bool b) { m_forceUnloadModel = b; }
 void setMarkedForDeletion(bool b) { m_markedForDeletion = b; }
-QString response() const;
+QString response(bool trim = true) const;
 ModelInfo modelInfo() const;
 void setModelInfo(const ModelInfo &info);
@@ -198,7 +198,7 @@ Q_SIGNALS:
 protected:
 bool promptInternal(const QList<QString> &collectionList, const QString &prompt, const QString &promptTemplate,
 int32_t n_predict, int32_t top_k, float top_p, float min_p, float temp, int32_t n_batch, float repeat_penalty,
-int32_t repeat_penalty_tokens);
+int32_t repeat_penalty_tokens, std::optional<QString> fakeReply = {});
 bool handlePrompt(int32_t token);
 bool handleResponse(int32_t token, const std::string &response);
 bool handleNamePrompt(int32_t token);
@@ -221,6 +221,7 @@ private:
 bool loadNewModel(const ModelInfo &modelInfo, QVariantMap &modelLoadProps);
 std::string m_response;
+std::string m_trimmedResponse;
 std::string m_nameResponse;
 QString m_questionResponse;
 LLModelInfo m_llModelInfo;

View File

@@ -20,24 +20,25 @@ class LocalDocsCollectionsModel : public QSortFilterProxyModel
 Q_OBJECT
 Q_PROPERTY(int count READ count NOTIFY countChanged)
 Q_PROPERTY(int updatingCount READ updatingCount NOTIFY updatingCountChanged)
 public:
 explicit LocalDocsCollectionsModel(QObject *parent);
+int count() const { return rowCount(); }
+int updatingCount() const;
 public Q_SLOTS:
-int count() const { return rowCount(); }
 void setCollections(const QList<QString> &collections);
-int updatingCount() const;
 Q_SIGNALS:
 void countChanged();
 void updatingCountChanged();
-private Q_SLOT:
-void maybeTriggerUpdatingCountChanged();
 protected:
 bool filterAcceptsRow(int sourceRow, const QModelIndex &sourceParent) const override;
+private Q_SLOTS:
+void maybeTriggerUpdatingCountChanged();
 private:
 QList<QString> m_collections;
 int m_updatingCount = 0;

View File

@@ -18,10 +18,12 @@
 #include <QVector>
 #include <Qt>
 #include <QtGlobal>
+#include <QtQml>
+#include <utility>
 using namespace Qt::Literals::StringLiterals;
 struct ModelInfo {
 Q_GADGET
 Q_PROPERTY(QString id READ id WRITE setId)
@@ -523,7 +525,7 @@ private:
 protected:
 explicit ModelList();
-~ModelList() { for (auto *model: m_models) { delete model; } }
+~ModelList() override { for (auto *model: std::as_const(m_models)) { delete model; } }
 friend class MyModelList;
 };

View File

@@ -8,6 +8,7 @@
 #include <QSettings>
 #include <QString>
 #include <QStringList>
+#include <QTranslator>
 #include <QVector>
 #include <cstdint>

File diff suppressed because it is too large.

View File

@@ -4,22 +4,29 @@
 #include "chatllm.h"
 #include "database.h"
-#include <QHttpServerRequest>
+#include <QHttpServer>
 #include <QHttpServerResponse>
-#include <QObject>
+#include <QJsonObject>
 #include <QList>
+#include <QObject>
 #include <QString>
+#include <memory>
+#include <optional>
+#include <utility>
 class Chat;
-class QHttpServer;
+class ChatRequest;
+class CompletionRequest;
 class Server : public ChatLLM
 {
 Q_OBJECT
 public:
-Server(Chat *parent);
-virtual ~Server();
+explicit Server(Chat *chat);
+~Server() override = default;
 public Q_SLOTS:
 void start();
@@ -27,14 +34,17 @@ public Q_SLOTS:
 Q_SIGNALS:
 void requestServerNewPromptResponsePair(const QString &prompt);
+private:
+auto handleCompletionRequest(const CompletionRequest &request) -> std::pair<QHttpServerResponse, std::optional<QJsonObject>>;
+auto handleChatRequest(const ChatRequest &request) -> std::pair<QHttpServerResponse, std::optional<QJsonObject>>;
 private Q_SLOTS:
-QHttpServerResponse handleCompletionRequest(const QHttpServerRequest &request, bool isChat);
 void handleDatabaseResultsChanged(const QList<ResultInfo> &results) { m_databaseResults = results; }
 void handleCollectionListChanged(const QList<QString> &collectionList) { m_collections = collectionList; }
 private:
 Chat *m_chat;
-QHttpServer *m_server;
+std::unique_ptr<QHttpServer> m_server;
 QList<ResultInfo> m_databaseResults;
 QList<QString> m_collections;
 };
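The rewritten Server parses each request into a typed object (CompletionRequest, ChatRequest) before handling it, and each handler returns a std::pair of the HTTP response and an optional JSON object. Since server.cpp itself is suppressed above, the sketch below only illustrates the general parse-then-respond shape of such a QHttpServer route under assumed names; route registration only, with server setup and the response pair omitted:

    #include <QHttpServer>
    #include <QHttpServerRequest>
    #include <QHttpServerResponder>
    #include <QHttpServerResponse>
    #include <QJsonDocument>
    #include <QJsonObject>
    #include <QString>
    #include <optional>

    struct CompletionRequest {    // hypothetical stand-in for the parsed body
        QString model;
        QString prompt;
    };

    static std::optional<CompletionRequest> parseCompletionRequest(const QByteArray &body)
    {
        const auto doc = QJsonDocument::fromJson(body);
        if (!doc.isObject())
            return std::nullopt;  // malformed JSON: let the route answer 400
        const QJsonObject obj = doc.object();
        return CompletionRequest { obj["model"].toString(), obj["prompt"].toString() };
    }

    void registerRoutes(QHttpServer &server)
    {
        server.route("/v1/completions", QHttpServerRequest::Method::Post,
                     [](const QHttpServerRequest &request) {
            const auto req = parseCompletionRequest(request.body());
            if (!req)
                return QHttpServerResponse(QHttpServerResponder::StatusCode::BadRequest);
            QJsonObject reply { { "model", req->model }, { "object", "text_completion" } };
            return QHttpServerResponse(reply);  // 200 with a JSON body
        });
    }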