From 38c61493d25617641ca9f63333c82a63ddd1ab2c Mon Sep 17 00:00:00 2001
From: Jared Van Bortel
Date: Thu, 25 Jan 2024 16:58:46 -0500
Subject: [PATCH] backend: update to latest commit of llama.cpp Vulkan PR

Signed-off-by: Jared Van Bortel
---
 gpt4all-backend/bert.cpp           |  11 +---
 gpt4all-backend/gptj.cpp           |   6 +-
 gpt4all-backend/llama.cpp-mainline |   2 +-
 gpt4all-backend/llama.cpp.cmake    |   3 +-
 gpt4all-backend/llamamodel.cpp     | 102 ++++++++++++++---------------
 gpt4all-backend/llamamodel_impl.h  |   2 +-
 gpt4all-backend/llmodel.h          |  26 ++++++--
 gpt4all-backend/llmodel_c.cpp      |  13 ++--
 gpt4all-backend/llmodel_shared.h   |  45 -------------
 9 files changed, 85 insertions(+), 125 deletions(-)

diff --git a/gpt4all-backend/bert.cpp b/gpt4all-backend/bert.cpp
index 2424d72c..e2d21265 100644
--- a/gpt4all-backend/bert.cpp
+++ b/gpt4all-backend/bert.cpp
@@ -381,10 +381,9 @@ void bert_eval(
         struct ggml_tensor *KQ = ggml_mul_mat(ctx0, K, Q);

         // KQ = soft_max(KQ / sqrt(head width))
-        KQ = ggml_soft_max(ctx0,
-                           ggml_scale(ctx0,
-                                      KQ,
-                                      ggml_new_f32(ctx0, 1.0f / sqrt((float)d_head))));
+        KQ = ggml_soft_max(
+            ctx0, ggml_scale(ctx0, KQ, 1.0f / sqrt((float)d_head))
+        );

         V = ggml_cont(ctx0, ggml_transpose(ctx0, V));
         struct ggml_tensor *KQV = ggml_mul_mat(ctx0, V, KQ);
@@ -490,10 +489,6 @@ struct bert_ctx * bert_load_from_file(const char *fname)
 #endif

     bert_ctx * new_bert = new bert_ctx;
-#if defined(GGML_USE_KOMPUTE)
-    new_bert->buf_compute.force_cpu = true;
-    new_bert->work_buf.force_cpu = true;
-#endif

     bert_model & model = new_bert->model;
     bert_vocab & vocab = new_bert->vocab;
diff --git a/gpt4all-backend/gptj.cpp b/gpt4all-backend/gptj.cpp
index 074ef5dc..6303ed84 100644
--- a/gpt4all-backend/gptj.cpp
+++ b/gpt4all-backend/gptj.cpp
@@ -414,11 +414,7 @@ bool gptj_eval(
         struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);

         // KQ_scaled = KQ / sqrt(n_embd/n_head)
-        struct ggml_tensor * KQ_scaled =
-            ggml_scale(ctx0,
-                    KQ,
-                    ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
-                    );
+        struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrt(float(n_embd)/n_head));

         // KQ_masked = mask_past(KQ_scaled)
         struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
diff --git a/gpt4all-backend/llama.cpp-mainline b/gpt4all-backend/llama.cpp-mainline
index 01307d86..15da9c89 160000
--- a/gpt4all-backend/llama.cpp-mainline
+++ b/gpt4all-backend/llama.cpp-mainline
@@ -1 +1 @@
-Subproject commit 01307d86bbe980128308c36b64c494fb9dbaa5bf
+Subproject commit 15da9c89f14a6cd44a4b45d65bf1f02d5762fe90
diff --git a/gpt4all-backend/llama.cpp.cmake b/gpt4all-backend/llama.cpp.cmake
index 8a2ce5ee..f8aa532f 100644
--- a/gpt4all-backend/llama.cpp.cmake
+++ b/gpt4all-backend/llama.cpp.cmake
@@ -175,6 +175,7 @@ if (LLAMA_KOMPUTE)
             DEPENDS ${LLAMA_DIR}/${source}
                 ${LLAMA_DIR}/kompute-shaders/common.comp
                 ${LLAMA_DIR}/kompute-shaders/op_getrows.comp
+                ${LLAMA_DIR}/kompute-shaders/op_mul_mv_q_n_pre.comp
                 ${LLAMA_DIR}/kompute-shaders/op_mul_mv_q_n.comp
             COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${LLAMA_DIR}/${source}
             COMMENT "Compiling ${source} to ${source}.spv"
@@ -231,7 +232,6 @@ if (LLAMA_KOMPUTE)
         kompute-shaders/op_add.comp
         kompute-shaders/op_addrow.comp
         kompute-shaders/op_mul.comp
-        kompute-shaders/op_mulrow.comp
         kompute-shaders/op_silu.comp
         kompute-shaders/op_relu.comp
         kompute-shaders/op_gelu.comp
@@ -264,7 +264,6 @@ if (LLAMA_KOMPUTE)
         shaderop_add.h
         shaderop_addrow.h
         shaderop_mul.h
-        shaderop_mulrow.h
         shaderop_silu.h
         shaderop_relu.h
         shaderop_gelu.h
diff --git a/gpt4all-backend/llamamodel.cpp b/gpt4all-backend/llamamodel.cpp
index d19d7b54..4152dd19 100644
--- a/gpt4all-backend/llamamodel.cpp
+++ b/gpt4all-backend/llamamodel.cpp
@@ -96,6 +96,7 @@ static int llama_sample_top_p_top_k(
 struct LLamaPrivate {
     const std::string modelPath;
     bool modelLoaded;
+    int device = -1;
     llama_model *model = nullptr;
     llama_context *ctx = nullptr;
     llama_model_params model_params;
@@ -167,24 +168,17 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx)
     if (llama_verbose()) {
         std::cerr << "llama.cpp: using Metal" << std::endl;
     }
-    // metal always runs the whole model if n_gpu_layers is not 0, at least
-    // currently
-    d_ptr->model_params.n_gpu_layers = 1;
-#endif
-#ifdef GGML_USE_KOMPUTE
-    if (ggml_vk_has_device()) {
-        // vulkan always runs the whole model if n_gpu_layers is not 0, at least
-        // currently
-        d_ptr->model_params.n_gpu_layers = 1;
+    d_ptr->model_params.n_gpu_layers = 100;
+#elif defined(GGML_USE_KOMPUTE)
+    if (d_ptr->device != -1) {
+        d_ptr->model_params.main_gpu = d_ptr->device;
+        d_ptr->model_params.n_gpu_layers = 100;
     }
 #endif

     d_ptr->model = llama_load_model_from_file_gpt4all(modelPath.c_str(), &d_ptr->model_params);
     if (!d_ptr->model) {
-#ifdef GGML_USE_KOMPUTE
-        // Explicitly free the device so next load it doesn't use it
-        ggml_vk_free_device();
-#endif
+        d_ptr->device = -1;
         std::cerr << "LLAMA ERROR: failed to load model from " << modelPath << std::endl;
         return false;
     }
@@ -214,10 +208,7 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx)

     d_ptr->ctx = llama_new_context_with_model(d_ptr->model, d_ptr->ctx_params);
     if (!d_ptr->ctx) {
-#ifdef GGML_USE_KOMPUTE
-        // Explicitly free the device so next load it doesn't use it
-        ggml_vk_free_device();
-#endif
+        d_ptr->device = -1;
         std::cerr << "LLAMA ERROR: failed to init context for model " << modelPath << std::endl;
         return false;
     }
@@ -225,7 +216,7 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx)
     d_ptr->end_tokens = {llama_token_eos(d_ptr->model)};

 #ifdef GGML_USE_KOMPUTE
-    if (ggml_vk_has_device()) {
+    if (usingGPUDevice() && ggml_vk_has_device()) {
         std::cerr << "llama.cpp: using Vulkan on " << ggml_vk_current_device().name << std::endl;
     }
 #endif
@@ -339,62 +330,70 @@ const std::vector<LLModel::Token> &LLamaModel::endTokens() const
 std::vector<LLModel::GPUDevice> LLamaModel::availableGPUDevices(size_t memoryRequired)
 {
 #if defined(GGML_USE_KOMPUTE)
-    std::vector<ggml_vk_device> vkDevices = ggml_vk_available_devices(memoryRequired);
+    size_t count = 0;
+    auto * vkDevices = ggml_vk_available_devices(memoryRequired, &count);

-    std::vector<LLModel::GPUDevice> devices;
-    for(const auto& vkDevice : vkDevices) {
-        LLModel::GPUDevice device;
-        device.index = vkDevice.index;
-        device.type = vkDevice.type;
-        device.heapSize = vkDevice.heapSize;
-        device.name = vkDevice.name;
-        device.vendor = vkDevice.vendor;
+    if (vkDevices) {
+        std::vector<LLModel::GPUDevice> devices;
+        devices.reserve(count);

-        devices.push_back(device);
+        for (size_t i = 0; i < count; ++i) {
+            auto & dev = vkDevices[i];
+            devices.emplace_back(
+                /* index    = */ dev.index,
+                /* type     = */ dev.type,
+                /* heapSize = */ dev.heapSize,
+                /* name     = */ dev.name,
+                /* vendor   = */ dev.vendor
+            );
+        }
+
+        free(vkDevices);
+        return devices;
     }
-
-    return devices;
-#else
-    return std::vector<LLModel::GPUDevice>();
 #endif
+
+    return {};
 }

-bool LLamaModel::initializeGPUDevice(size_t memoryRequired, const std::string& device)
+bool LLamaModel::initializeGPUDevice(size_t memoryRequired, const std::string &name)
 {
 #if defined(GGML_USE_KOMPUTE)
-    return ggml_vk_init_device(memoryRequired, device);
+    ggml_vk_device device;
+    bool ok = ggml_vk_get_device(&device, memoryRequired, name.c_str());
+    if (ok) {
+        d_ptr->device = device.index;
+        return true;
+    }
 #else
-    return false;
+    (void)memoryRequired;
+    (void)name;
 #endif
+    return false;
 }

 bool LLamaModel::initializeGPUDevice(const LLModel::GPUDevice &device, std::string *unavail_reason)
 {
-    bool result = false;
 #if defined(GGML_USE_KOMPUTE)
-    ggml_vk_device vkDevice;
-    vkDevice.index = device.index;
-    vkDevice.type = device.type;
-    vkDevice.heapSize = device.heapSize;
-    vkDevice.name = device.name;
-    vkDevice.vendor = device.vendor;
-    result = ggml_vk_init_device(vkDevice);
-    if (!result && unavail_reason) {
-        *unavail_reason = "failed to init GPU";
-    }
+    (void)unavail_reason;
+    d_ptr->device = device.index;
+    return true;
 #else
+    (void)device;
     if (unavail_reason) {
         *unavail_reason = "built without Kompute";
     }
+    return false;
 #endif
-    return result;
 }

 bool LLamaModel::initializeGPUDevice(int device)
 {
 #if defined(GGML_USE_KOMPUTE)
-    return ggml_vk_init_device(device);
+    d_ptr->device = device;
+    return true;
 #else
+    (void)device;
     return false;
 #endif
 }
@@ -402,7 +401,7 @@ bool LLamaModel::initializeGPUDevice(int device)
 bool LLamaModel::hasGPUDevice()
 {
 #if defined(GGML_USE_KOMPUTE)
-    return ggml_vk_has_device();
+    return d_ptr->device != -1;
 #else
     return false;
 #endif
@@ -411,11 +410,12 @@ bool LLamaModel::hasGPUDevice()
 bool LLamaModel::usingGPUDevice()
 {
 #if defined(GGML_USE_KOMPUTE)
-    return ggml_vk_using_vulkan();
+    return hasGPUDevice() && d_ptr->model_params.n_gpu_layers > 0;
 #elif defined(GGML_USE_METAL)
     return true;
-#endif
+#else
     return false;
+#endif
 }

 std::string get_arch_name(gguf_context *ctx_gguf) {
diff --git a/gpt4all-backend/llamamodel_impl.h b/gpt4all-backend/llamamodel_impl.h
index c32b2413..7c097637 100644
--- a/gpt4all-backend/llamamodel_impl.h
+++ b/gpt4all-backend/llamamodel_impl.h
@@ -26,7 +26,7 @@ public:
     void setThreadCount(int32_t n_threads) override;
     int32_t threadCount() const override;
     std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired) override;
-    bool initializeGPUDevice(size_t memoryRequired, const std::string& device) override;
+    bool initializeGPUDevice(size_t memoryRequired, const std::string& name) override;
     bool initializeGPUDevice(const GPUDevice &device, std::string *unavail_reason) override;
     bool initializeGPUDevice(int device) override;
     bool hasGPUDevice() override;
diff --git a/gpt4all-backend/llmodel.h b/gpt4all-backend/llmodel.h
index c0b8bca9..a5ae2d54 100644
--- a/gpt4all-backend/llmodel.h
+++ b/gpt4all-backend/llmodel.h
@@ -17,11 +17,14 @@ public:
     using Token = int32_t;

     struct GPUDevice {
-        int index = 0;
-        int type = 0;
-        size_t heapSize = 0;
+        int index;
+        int type;
+        size_t heapSize;
         std::string name;
         std::string vendor;
+
+        GPUDevice(int index, int type, size_t heapSize, std::string name, std::string vendor):
+            index(index), type(type), heapSize(heapSize), name(std::move(name)), vendor(std::move(vendor)) {}
     };

     class Implementation {
@@ -98,14 +101,25 @@ public:
         return *m_implementation;
     }

-    virtual std::vector<GPUDevice> availableGPUDevices(size_t /*memoryRequired*/) { return std::vector<GPUDevice>(); }
-    virtual bool initializeGPUDevice(size_t /*memoryRequired*/, const std::string& /*device*/) { return false; }
-    virtual bool initializeGPUDevice(const GPUDevice &/*device*/, std::string *unavail_reason = nullptr) {
+    virtual std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired) {
+        (void)memoryRequired;
+        return {};
+    }
+
+    virtual bool initializeGPUDevice(size_t memoryRequired, const std::string& name) {
+        (void)memoryRequired;
+        (void)name;
+        return false;
+    }
+
+    virtual bool initializeGPUDevice(const GPUDevice & device, std::string *unavail_reason = nullptr) {
+        (void)device;
         if (unavail_reason) {
             *unavail_reason = "model has no GPU support";
         }
         return false;
     }
+
     virtual bool initializeGPUDevice(int /*device*/) { return false; }
     virtual bool hasGPUDevice() { return false; }
     virtual bool usingGPUDevice() { return false; }
diff --git a/gpt4all-backend/llmodel_c.cpp b/gpt4all-backend/llmodel_c.cpp
index c8af2ca3..bfeca69c 100644
--- a/gpt4all-backend/llmodel_c.cpp
+++ b/gpt4all-backend/llmodel_c.cpp
@@ -230,12 +230,13 @@ bool llmodel_gpu_init_gpu_device_by_string(llmodel_model model, size_t memoryReq

 bool llmodel_gpu_init_gpu_device_by_struct(llmodel_model model, const llmodel_gpu_device *device)
 {
-    LLModel::GPUDevice d;
-    d.index = device->index;
-    d.type = device->type;
-    d.heapSize = device->heapSize;
-    d.name = device->name;
-    d.vendor = device->vendor;
+    LLModel::GPUDevice d(
+        /* index    = */ device->index,
+        /* type     = */ device->type,
+        /* heapSize = */ device->heapSize,
+        /* name     = */ device->name,
+        /* vendor   = */ device->vendor
+    );
     LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper *>(model);
     return wrapper->llModel->initializeGPUDevice(d);
 }
diff --git a/gpt4all-backend/llmodel_shared.h b/gpt4all-backend/llmodel_shared.h
index b7b1a837..aa132849 100644
--- a/gpt4all-backend/llmodel_shared.h
+++ b/gpt4all-backend/llmodel_shared.h
@@ -4,50 +4,6 @@
 #include
 #include

-#if defined(GGML_USE_KOMPUTE)
-#include "ggml-kompute.h"
-struct llm_buffer {
-    uint8_t * addr = NULL;
-    size_t size = 0;
-    ggml_vk_memory memory;
-    bool force_cpu = false;
-
-    llm_buffer() = default;
-
-    void resize(size_t size) {
-        free();
-
-        if (!ggml_vk_has_device() || force_cpu) {
-            this->addr = new uint8_t[size];
-            this->size = size;
-        } else {
-            this->memory = ggml_vk_allocate(size);
-            this->addr = (uint8_t*)memory.data;
-            this->size = size;
-        }
-    }
-
-    void free() {
-        if (!memory.primaryMemory) {
-            delete[] addr;
-        } else if (memory.data) {
-            ggml_vk_free_memory(memory);
-        }
-        this->addr = NULL;
-        this->size = 0;
-    }
-
-    ~llm_buffer() {
-        free();
-    }
-
-    // disable copy and move
-    llm_buffer(const llm_buffer&) = delete;
-    llm_buffer(llm_buffer&&) = delete;
-    llm_buffer& operator=(const llm_buffer&) = delete;
-    llm_buffer& operator=(llm_buffer&&) = delete;
-};
-#else
 struct llm_buffer {
     uint8_t * addr = NULL;
     size_t size = 0;
@@ -62,7 +18,6 @@ struct llm_buffer {
         delete[] addr;
     }
 };
-#endif

 struct llm_kv_cache {
     struct ggml_tensor * k;
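
Not part of the patch itself -- a minimal usage sketch of the reworked GPU-device API above, for review context. It assumes an already-constructed LLModel *model obtained through the usual implementation loader; the helper name pickFirstGpu and the memoryRequired argument are illustrative, not code from this change.

#include <cstddef>
#include <iostream>
#include <string>
#include "llmodel.h"

// Hypothetical helper (not from the patch): enumerate Vulkan devices and
// select the first one the backend accepts, falling back to CPU otherwise.
static bool pickFirstGpu(LLModel *model, size_t memoryRequired) {
    // availableGPUDevices() now builds GPUDevice values through the new
    // five-argument constructor; an empty vector means no usable device.
    for (const LLModel::GPUDevice &dev : model->availableGPUDevices(memoryRequired)) {
        std::string why;
        // With Kompute, this overload now only records the device index;
        // the layers are offloaded later, inside loadModel().
        if (model->initializeGPUDevice(dev, &why)) {
            std::cerr << "using GPU: " << dev.name << " (" << dev.vendor << ")\n";
            return true;
        }
        std::cerr << "skipping " << dev.name << ": " << why << "\n";
    }
    return false; // no device accepted; the model will run on CPU
}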