From 38c61493d25617641ca9f63333c82a63ddd1ab2c Mon Sep 17 00:00:00 2001
From: Jared Van Bortel
Date: Thu, 25 Jan 2024 16:58:46 -0500
Subject: [PATCH] backend: update to latest commit of llama.cpp Vulkan PR

Signed-off-by: Jared Van Bortel
---
 gpt4all-backend/bert.cpp           |  11 +---
 gpt4all-backend/gptj.cpp           |   6 +-
 gpt4all-backend/llama.cpp-mainline |   2 +-
 gpt4all-backend/llama.cpp.cmake    |   3 +-
 gpt4all-backend/llamamodel.cpp     | 102 ++++++++++++++---------------
 gpt4all-backend/llamamodel_impl.h  |   2 +-
 gpt4all-backend/llmodel.h          |  26 ++++++--
 gpt4all-backend/llmodel_c.cpp      |  13 ++--
 gpt4all-backend/llmodel_shared.h   |  45 -------------
 9 files changed, 85 insertions(+), 125 deletions(-)

diff --git a/gpt4all-backend/bert.cpp b/gpt4all-backend/bert.cpp
index 2424d72c..e2d21265 100644
--- a/gpt4all-backend/bert.cpp
+++ b/gpt4all-backend/bert.cpp
@@ -381,10 +381,9 @@ void bert_eval(
         struct ggml_tensor *KQ = ggml_mul_mat(ctx0, K, Q);

         // KQ = soft_max(KQ / sqrt(head width))
-        KQ = ggml_soft_max(ctx0,
-                           ggml_scale(ctx0,
-                                      KQ,
-                                      ggml_new_f32(ctx0, 1.0f / sqrt((float)d_head))));
+        KQ = ggml_soft_max(
+            ctx0, ggml_scale(ctx0, KQ, 1.0f / sqrt((float)d_head))
+        );

         V = ggml_cont(ctx0, ggml_transpose(ctx0, V));
         struct ggml_tensor *KQV = ggml_mul_mat(ctx0, V, KQ);
@@ -490,10 +489,6 @@ struct bert_ctx * bert_load_from_file(const char *fname)
 #endif

     bert_ctx * new_bert = new bert_ctx;
-#if defined(GGML_USE_KOMPUTE)
-    new_bert->buf_compute.force_cpu = true;
-    new_bert->work_buf.force_cpu = true;
-#endif

     bert_model & model = new_bert->model;
     bert_vocab & vocab = new_bert->vocab;
diff --git a/gpt4all-backend/gptj.cpp b/gpt4all-backend/gptj.cpp
index 074ef5dc..6303ed84 100644
--- a/gpt4all-backend/gptj.cpp
+++ b/gpt4all-backend/gptj.cpp
@@ -414,11 +414,7 @@ bool gptj_eval(
         struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);

         // KQ_scaled = KQ / sqrt(n_embd/n_head)
-        struct ggml_tensor * KQ_scaled =
-            ggml_scale(ctx0,
-                    KQ,
-                    ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
-                    );
+        struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrt(float(n_embd)/n_head));

         // KQ_masked = mask_past(KQ_scaled)
         struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
diff --git a/gpt4all-backend/llama.cpp-mainline b/gpt4all-backend/llama.cpp-mainline
index 01307d86..15da9c89 160000
--- a/gpt4all-backend/llama.cpp-mainline
+++ b/gpt4all-backend/llama.cpp-mainline
@@ -1 +1 @@
-Subproject commit 01307d86bbe980128308c36b64c494fb9dbaa5bf
+Subproject commit 15da9c89f14a6cd44a4b45d65bf1f02d5762fe90
diff --git a/gpt4all-backend/llama.cpp.cmake b/gpt4all-backend/llama.cpp.cmake
index 8a2ce5ee..f8aa532f 100644
--- a/gpt4all-backend/llama.cpp.cmake
+++ b/gpt4all-backend/llama.cpp.cmake
@@ -175,6 +175,7 @@ if (LLAMA_KOMPUTE)
             DEPENDS ${LLAMA_DIR}/${source}
                 ${LLAMA_DIR}/kompute-shaders/common.comp
                 ${LLAMA_DIR}/kompute-shaders/op_getrows.comp
+                ${LLAMA_DIR}/kompute-shaders/op_mul_mv_q_n_pre.comp
                 ${LLAMA_DIR}/kompute-shaders/op_mul_mv_q_n.comp
             COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${LLAMA_DIR}/${source}
             COMMENT "Compiling ${source} to ${source}.spv"
@@ -231,7 +232,6 @@ if (LLAMA_KOMPUTE)
         kompute-shaders/op_add.comp
         kompute-shaders/op_addrow.comp
         kompute-shaders/op_mul.comp
-        kompute-shaders/op_mulrow.comp
         kompute-shaders/op_silu.comp
         kompute-shaders/op_relu.comp
         kompute-shaders/op_gelu.comp
@@ -264,7 +264,6 @@ if (LLAMA_KOMPUTE)
         shaderop_add.h
         shaderop_addrow.h
         shaderop_mul.h
-        shaderop_mulrow.h
         shaderop_silu.h
         shaderop_relu.h
         shaderop_gelu.h
diff --git a/gpt4all-backend/llamamodel.cpp b/gpt4all-backend/llamamodel.cpp
index d19d7b54..4152dd19 100644
--- a/gpt4all-backend/llamamodel.cpp
+++ b/gpt4all-backend/llamamodel.cpp
@@ -96,6 +96,7 @@ static int llama_sample_top_p_top_k(
 struct LLamaPrivate {
     const std::string modelPath;
     bool modelLoaded;
+    int device = -1;
     llama_model *model = nullptr;
     llama_context *ctx = nullptr;
     llama_model_params model_params;
@@ -167,24 +168,17 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx)
     if (llama_verbose()) {
         std::cerr << "llama.cpp: using Metal" << std::endl;
     }
-    // metal always runs the whole model if n_gpu_layers is not 0, at least
-    // currently
-    d_ptr->model_params.n_gpu_layers = 1;
-#endif
-#ifdef GGML_USE_KOMPUTE
-    if (ggml_vk_has_device()) {
-        // vulkan always runs the whole model if n_gpu_layers is not 0, at least
-        // currently
-        d_ptr->model_params.n_gpu_layers = 1;
+    d_ptr->model_params.n_gpu_layers = 100;
+#elif defined(GGML_USE_KOMPUTE)
+    if (d_ptr->device != -1) {
+        d_ptr->model_params.main_gpu = d_ptr->device;
+        d_ptr->model_params.n_gpu_layers = 100;
     }
 #endif

     d_ptr->model = llama_load_model_from_file_gpt4all(modelPath.c_str(), &d_ptr->model_params);
     if (!d_ptr->model) {
-#ifdef GGML_USE_KOMPUTE
-        // Explicitly free the device so next load it doesn't use it
-        ggml_vk_free_device();
-#endif
+        d_ptr->device = -1;
         std::cerr << "LLAMA ERROR: failed to load model from " << modelPath << std::endl;
         return false;
     }
@@ -214,10 +208,7 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx)

     d_ptr->ctx = llama_new_context_with_model(d_ptr->model, d_ptr->ctx_params);
     if (!d_ptr->ctx) {
-#ifdef GGML_USE_KOMPUTE
-        // Explicitly free the device so next load it doesn't use it
-        ggml_vk_free_device();
-#endif
+        d_ptr->device = -1;
         std::cerr << "LLAMA ERROR: failed to init context for model " << modelPath << std::endl;
         return false;
     }
@@ -225,7 +216,7 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx)
     d_ptr->end_tokens = {llama_token_eos(d_ptr->model)};

 #ifdef GGML_USE_KOMPUTE
-    if (ggml_vk_has_device()) {
+    if (usingGPUDevice() && ggml_vk_has_device()) {
         std::cerr << "llama.cpp: using Vulkan on " << ggml_vk_current_device().name << std::endl;
     }
 #endif
@@ -339,62 +330,70 @@ const std::vector<LLModel::Token> &LLamaModel::endTokens() const
 std::vector<LLModel::GPUDevice> LLamaModel::availableGPUDevices(size_t memoryRequired)
 {
 #if defined(GGML_USE_KOMPUTE)
-    std::vector<ggml_vk_device> vkDevices = ggml_vk_available_devices(memoryRequired);
+    size_t count = 0;
+    auto * vkDevices = ggml_vk_available_devices(memoryRequired, &count);

-    std::vector<LLModel::GPUDevice> devices;
-    for(const auto& vkDevice : vkDevices) {
-        LLModel::GPUDevice device;
-        device.index = vkDevice.index;
-        device.type = vkDevice.type;
-        device.heapSize = vkDevice.heapSize;
-        device.name = vkDevice.name;
-        device.vendor = vkDevice.vendor;
+    if (vkDevices) {
+        std::vector<LLModel::GPUDevice> devices;
+        devices.reserve(count);

-        devices.push_back(device);
+        for (size_t i = 0; i < count; ++i) {
+            auto & dev = vkDevices[i];
+            devices.emplace_back(
+                /* index    = */ dev.index,
+                /* type     = */ dev.type,
+                /* heapSize = */ dev.heapSize,
+                /* name     = */ dev.name,
+                /* vendor   = */ dev.vendor
+            );
+        }
+
+        free(vkDevices);
+        return devices;
     }
-
-    return devices;
-#else
-    return std::vector<LLModel::GPUDevice>();
 #endif
+
+    return {};
 }

-bool LLamaModel::initializeGPUDevice(size_t memoryRequired, const std::string& device)
+bool LLamaModel::initializeGPUDevice(size_t memoryRequired, const std::string &name)
 {
 #if defined(GGML_USE_KOMPUTE)
-    return ggml_vk_init_device(memoryRequired, device);
+    ggml_vk_device device;
+    bool ok = ggml_vk_get_device(&device, memoryRequired, name.c_str());
+    if (ok) {
+        d_ptr->device = device.index;
+        return true;
+    }
 #else
-    return false;
+    (void)memoryRequired;
+    (void)name;
 #endif
+    return false;
 }

 bool LLamaModel::initializeGPUDevice(const LLModel::GPUDevice &device, std::string *unavail_reason)
 {
-    bool result = false;
 #if defined(GGML_USE_KOMPUTE)
-    ggml_vk_device vkDevice;
-    vkDevice.index = device.index;
-    vkDevice.type = device.type;
-    vkDevice.heapSize = device.heapSize;
-    vkDevice.name = device.name;
-    vkDevice.vendor = device.vendor;
-    result = ggml_vk_init_device(vkDevice);
-    if (!result && unavail_reason) {
-        *unavail_reason = "failed to init GPU";
-    }
+    (void)unavail_reason;
+    d_ptr->device = device.index;
+    return true;
 #else
+    (void)device;
     if (unavail_reason) {
         *unavail_reason = "built without Kompute";
     }
+    return false;
 #endif
-    return result;
 }

 bool LLamaModel::initializeGPUDevice(int device)
 {
 #if defined(GGML_USE_KOMPUTE)
-    return ggml_vk_init_device(device);
+    d_ptr->device = device;
+    return true;
 #else
+    (void)device;
     return false;
 #endif
 }
@@ -402,7 +401,7 @@ bool LLamaModel::initializeGPUDevice(int device)
 bool LLamaModel::hasGPUDevice()
 {
 #if defined(GGML_USE_KOMPUTE)
-    return ggml_vk_has_device();
+    return d_ptr->device != -1;
 #else
     return false;
 #endif
@@ -411,11 +410,12 @@ bool LLamaModel::hasGPUDevice()
 bool LLamaModel::usingGPUDevice()
 {
 #if defined(GGML_USE_KOMPUTE)
-    return ggml_vk_using_vulkan();
+    return hasGPUDevice() && d_ptr->model_params.n_gpu_layers > 0;
 #elif defined(GGML_USE_METAL)
     return true;
-#endif
+#else
     return false;
+#endif
 }

 std::string get_arch_name(gguf_context *ctx_gguf) {
diff --git a/gpt4all-backend/llamamodel_impl.h b/gpt4all-backend/llamamodel_impl.h
index c32b2413..7c097637 100644
--- a/gpt4all-backend/llamamodel_impl.h
+++ b/gpt4all-backend/llamamodel_impl.h
@@ -26,7 +26,7 @@ public:
     void setThreadCount(int32_t n_threads) override;
     int32_t threadCount() const override;
     std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired) override;
-    bool initializeGPUDevice(size_t memoryRequired, const std::string& device) override;
+    bool initializeGPUDevice(size_t memoryRequired, const std::string& name) override;
     bool initializeGPUDevice(const GPUDevice &device, std::string *unavail_reason) override;
     bool initializeGPUDevice(int device) override;
     bool hasGPUDevice() override;
diff --git a/gpt4all-backend/llmodel.h b/gpt4all-backend/llmodel.h
index c0b8bca9..a5ae2d54 100644
--- a/gpt4all-backend/llmodel.h
+++ b/gpt4all-backend/llmodel.h
@@ -17,11 +17,14 @@ public:
     using Token = int32_t;

     struct GPUDevice {
-        int index = 0;
-        int type = 0;
-        size_t heapSize = 0;
+        int index;
+        int type;
+        size_t heapSize;
         std::string name;
         std::string vendor;
+
+        GPUDevice(int index, int type, size_t heapSize, std::string name, std::string vendor):
+            index(index), type(type), heapSize(heapSize), name(std::move(name)), vendor(std::move(vendor)) {}
     };

     class Implementation {
@@ -98,14 +101,25 @@ public:
         return *m_implementation;
     }

-    virtual std::vector<GPUDevice> availableGPUDevices(size_t /*memoryRequired*/) { return std::vector<GPUDevice>(); }
-    virtual bool initializeGPUDevice(size_t /*memoryRequired*/, const std::string& /*device*/) { return false; }
-    virtual bool initializeGPUDevice(const GPUDevice &/*device*/, std::string *unavail_reason = nullptr) {
+    virtual std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired) {
+        (void)memoryRequired;
+        return {};
+    }
+
+    virtual bool initializeGPUDevice(size_t memoryRequired, const std::string& name) {
+        (void)memoryRequired;
+        (void)name;
+        return false;
+    }
+
+    virtual bool initializeGPUDevice(const GPUDevice & device, std::string *unavail_reason = nullptr) {
+        (void)device;
         if (unavail_reason) {
             *unavail_reason = "model has no GPU support";
         }
         return false;
     }
+
     virtual bool initializeGPUDevice(int /*device*/) { return false; }
     virtual bool hasGPUDevice() { return false; }
     virtual bool usingGPUDevice() { return false; }
diff --git a/gpt4all-backend/llmodel_c.cpp b/gpt4all-backend/llmodel_c.cpp
index c8af2ca3..bfeca69c 100644
--- a/gpt4all-backend/llmodel_c.cpp
+++ b/gpt4all-backend/llmodel_c.cpp
@@ -230,12 +230,13 @@ bool llmodel_gpu_init_gpu_device_by_string(llmodel_model model, size_t memoryReq

 bool llmodel_gpu_init_gpu_device_by_struct(llmodel_model model, const llmodel_gpu_device *device)
 {
-    LLModel::GPUDevice d;
-    d.index = device->index;
-    d.type = device->type;
-    d.heapSize = device->heapSize;
-    d.name = device->name;
-    d.vendor = device->vendor;
+    LLModel::GPUDevice d(
+        /* index    = */ device->index,
+        /* type     = */ device->type,
+        /* heapSize = */ device->heapSize,
+        /* name     = */ device->name,
+        /* vendor   = */ device->vendor
+    );
     LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper *>(model);
     return wrapper->llModel->initializeGPUDevice(d);
 }
diff --git a/gpt4all-backend/llmodel_shared.h b/gpt4all-backend/llmodel_shared.h
index b7b1a837..aa132849 100644
--- a/gpt4all-backend/llmodel_shared.h
+++ b/gpt4all-backend/llmodel_shared.h
@@ -4,50 +4,6 @@
 #include
 #include

-#if defined(GGML_USE_KOMPUTE)
-#include "ggml-kompute.h"
-struct llm_buffer {
-    uint8_t * addr = NULL;
-    size_t size = 0;
-    ggml_vk_memory memory;
-    bool force_cpu = false;
-
-    llm_buffer() = default;
-
-    void resize(size_t size) {
-        free();
-
-        if (!ggml_vk_has_device() || force_cpu) {
-            this->addr = new uint8_t[size];
-            this->size = size;
-        } else {
-            this->memory = ggml_vk_allocate(size);
-            this->addr = (uint8_t*)memory.data;
-            this->size = size;
-        }
-    }
-
-    void free() {
-        if (!memory.primaryMemory) {
-            delete[] addr;
-        } else if (memory.data) {
-            ggml_vk_free_memory(memory);
-        }
-        this->addr = NULL;
-        this->size = 0;
-    }
-
-    ~llm_buffer() {
-        free();
-    }
-
-    // disable copy and move
-    llm_buffer(const llm_buffer&) = delete;
-    llm_buffer(llm_buffer&&) = delete;
-    llm_buffer& operator=(const llm_buffer&) = delete;
-    llm_buffer& operator=(llm_buffer&&) = delete;
-};
-#else
 struct llm_buffer {
     uint8_t * addr = NULL;
     size_t size = 0;
@@ -62,7 +18,6 @@ struct llm_buffer {
         delete[] addr;
     }
 };
-#endif

 struct llm_kv_cache {
     struct ggml_tensor * k;
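
Not part of the patch itself -- a minimal usage sketch of the reworked GPU-device API above, for review context. It assumes an already-constructed LLModel *model obtained through the usual implementation loader; the helper name pickFirstGpu and the memoryRequired argument are illustrative, not code from this change.

#include <cstddef>
#include <iostream>
#include <string>
#include "llmodel.h"

// Hypothetical helper (not from the patch): enumerate Vulkan devices and
// select the first one the backend accepts, falling back to CPU otherwise.
static bool pickFirstGpu(LLModel *model, size_t memoryRequired) {
    // availableGPUDevices() now builds GPUDevice values through the new
    // five-argument constructor; an empty vector means no usable device.
    for (const LLModel::GPUDevice &dev : model->availableGPUDevices(memoryRequired)) {
        std::string why;
        // With Kompute, this overload now only records the device index;
        // the layers are offloaded later, inside loadModel().
        if (model->initializeGPUDevice(dev, &why)) {
            std::cerr << "using GPU: " << dev.name << " (" << dev.vendor << ")\n";
            return true;
        }
        std::cerr << "skipping " << dev.name << ": " << why << "\n";
    }
    return false; // no device accepted; the model will run on CPU
}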