mirror of https://github.com/nomic-ai/gpt4all.git (synced 2024-10-01 01:06:10 -04:00)
backend: update to latest commit of llama.cpp Vulkan PR
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
commit 38c61493d2
parent 29d2c936d1
@@ -381,10 +381,9 @@ void bert_eval(
 
     struct ggml_tensor *KQ = ggml_mul_mat(ctx0, K, Q);
     // KQ = soft_max(KQ / sqrt(head width))
-    KQ = ggml_soft_max(ctx0,
-                       ggml_scale(ctx0,
-                                  KQ,
-                                  ggml_new_f32(ctx0, 1.0f / sqrt((float)d_head))));
+    KQ = ggml_soft_max(
+        ctx0, ggml_scale(ctx0, KQ, 1.0f / sqrt((float)d_head))
+    );
 
     V = ggml_cont(ctx0, ggml_transpose(ctx0, V));
     struct ggml_tensor *KQV = ggml_mul_mat(ctx0, V, KQ);
@@ -490,10 +489,6 @@ struct bert_ctx * bert_load_from_file(const char *fname)
 #endif
 
     bert_ctx * new_bert = new bert_ctx;
-#if defined(GGML_USE_KOMPUTE)
-    new_bert->buf_compute.force_cpu = true;
-    new_bert->work_buf.force_cpu = true;
-#endif
 
     bert_model & model = new_bert->model;
     bert_vocab & vocab = new_bert->vocab;
@@ -414,11 +414,7 @@ bool gptj_eval(
         struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
 
         // KQ_scaled = KQ / sqrt(n_embd/n_head)
-        struct ggml_tensor * KQ_scaled =
-            ggml_scale(ctx0,
-                    KQ,
-                    ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
-                    );
+        struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrt(float(n_embd)/n_head));
 
         // KQ_masked = mask_past(KQ_scaled)
         struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
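Note: the two hunks above track an upstream ggml API change: ggml_scale() now takes the scale factor as a plain float instead of a one-element tensor built with ggml_new_f32(). A minimal sketch of the new call, not part of this commit; the context size, tensor shape, and 0.5f factor are illustrative only.

#include <ggml.h>

int main() {
    // small scratch context; size is illustrative
    struct ggml_init_params params = {
        /* .mem_size   = */ 16 * 1024 * 1024,
        /* .mem_buffer = */ NULL,
        /* .no_alloc   = */ false,
    };
    struct ggml_context * ctx0 = ggml_init(params);

    struct ggml_tensor * x = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 8);

    // before: ggml_scale(ctx0, x, ggml_new_f32(ctx0, 0.5f));
    // after:  the scalar is passed directly as a float
    struct ggml_tensor * y = ggml_scale(ctx0, x, 0.5f);

    (void)y;
    ggml_free(ctx0);
    return 0;
}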
@@ -1 +1 @@
-Subproject commit 01307d86bbe980128308c36b64c494fb9dbaa5bf
+Subproject commit 15da9c89f14a6cd44a4b45d65bf1f02d5762fe90
@@ -175,6 +175,7 @@ if (LLAMA_KOMPUTE)
            DEPENDS ${LLAMA_DIR}/${source}
                ${LLAMA_DIR}/kompute-shaders/common.comp
                ${LLAMA_DIR}/kompute-shaders/op_getrows.comp
+               ${LLAMA_DIR}/kompute-shaders/op_mul_mv_q_n_pre.comp
                ${LLAMA_DIR}/kompute-shaders/op_mul_mv_q_n.comp
            COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${LLAMA_DIR}/${source}
            COMMENT "Compiling ${source} to ${source}.spv"
@@ -231,7 +232,6 @@ if (LLAMA_KOMPUTE)
        kompute-shaders/op_add.comp
        kompute-shaders/op_addrow.comp
        kompute-shaders/op_mul.comp
-       kompute-shaders/op_mulrow.comp
        kompute-shaders/op_silu.comp
        kompute-shaders/op_relu.comp
        kompute-shaders/op_gelu.comp
@@ -264,7 +264,6 @@ if (LLAMA_KOMPUTE)
        shaderop_add.h
        shaderop_addrow.h
        shaderop_mul.h
-       shaderop_mulrow.h
        shaderop_silu.h
        shaderop_relu.h
        shaderop_gelu.h
@@ -96,6 +96,7 @@ static int llama_sample_top_p_top_k(
 struct LLamaPrivate {
     const std::string modelPath;
     bool modelLoaded;
+    int device = -1;
     llama_model *model = nullptr;
     llama_context *ctx = nullptr;
     llama_model_params model_params;
@@ -167,24 +168,17 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx)
     if (llama_verbose()) {
         std::cerr << "llama.cpp: using Metal" << std::endl;
     }
-    // metal always runs the whole model if n_gpu_layers is not 0, at least
-    // currently
-    d_ptr->model_params.n_gpu_layers = 1;
-#endif
-#ifdef GGML_USE_KOMPUTE
-    if (ggml_vk_has_device()) {
-        // vulkan always runs the whole model if n_gpu_layers is not 0, at least
-        // currently
-        d_ptr->model_params.n_gpu_layers = 1;
+    d_ptr->model_params.n_gpu_layers = 100;
+#elif defined(GGML_USE_KOMPUTE)
+    if (d_ptr->device != -1) {
+        d_ptr->model_params.main_gpu = d_ptr->device;
+        d_ptr->model_params.n_gpu_layers = 100;
     }
 #endif
 
     d_ptr->model = llama_load_model_from_file_gpt4all(modelPath.c_str(), &d_ptr->model_params);
     if (!d_ptr->model) {
-#ifdef GGML_USE_KOMPUTE
-        // Explicitly free the device so next load it doesn't use it
-        ggml_vk_free_device();
-#endif
+        d_ptr->device = -1;
         std::cerr << "LLAMA ERROR: failed to load model from " << modelPath << std::endl;
         return false;
     }
@@ -214,10 +208,7 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx)
 
     d_ptr->ctx = llama_new_context_with_model(d_ptr->model, d_ptr->ctx_params);
     if (!d_ptr->ctx) {
-#ifdef GGML_USE_KOMPUTE
-        // Explicitly free the device so next load it doesn't use it
-        ggml_vk_free_device();
-#endif
+        d_ptr->device = -1;
         std::cerr << "LLAMA ERROR: failed to init context for model " << modelPath << std::endl;
         return false;
     }
@@ -225,7 +216,7 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx)
     d_ptr->end_tokens = {llama_token_eos(d_ptr->model)};
 
 #ifdef GGML_USE_KOMPUTE
-    if (ggml_vk_has_device()) {
+    if (usingGPUDevice() && ggml_vk_has_device()) {
         std::cerr << "llama.cpp: using Vulkan on " << ggml_vk_current_device().name << std::endl;
     }
 #endif
@@ -339,62 +330,70 @@ const std::vector<LLModel::Token> &LLamaModel::endTokens() const
 std::vector<LLModel::GPUDevice> LLamaModel::availableGPUDevices(size_t memoryRequired)
 {
 #if defined(GGML_USE_KOMPUTE)
-    std::vector<ggml_vk_device> vkDevices = ggml_vk_available_devices(memoryRequired);
+    size_t count = 0;
+    auto * vkDevices = ggml_vk_available_devices(memoryRequired, &count);
 
+    if (vkDevices) {
         std::vector<LLModel::GPUDevice> devices;
-    for(const auto& vkDevice : vkDevices) {
-        LLModel::GPUDevice device;
-        device.index = vkDevice.index;
-        device.type = vkDevice.type;
-        device.heapSize = vkDevice.heapSize;
-        device.name = vkDevice.name;
-        device.vendor = vkDevice.vendor;
+        devices.reserve(count);
 
-        devices.push_back(device);
+        for (size_t i = 0; i < count; ++i) {
+            auto & dev = vkDevices[i];
+            devices.emplace_back(
+                /* index    = */ dev.index,
+                /* type     = */ dev.type,
+                /* heapSize = */ dev.heapSize,
+                /* name     = */ dev.name,
+                /* vendor   = */ dev.vendor
+            );
         }
 
+        free(vkDevices);
         return devices;
-#else
-    return std::vector<LLModel::GPUDevice>();
+    }
 #endif
+
+    return {};
 }
 
-bool LLamaModel::initializeGPUDevice(size_t memoryRequired, const std::string& device)
+bool LLamaModel::initializeGPUDevice(size_t memoryRequired, const std::string &name)
 {
 #if defined(GGML_USE_KOMPUTE)
-    return ggml_vk_init_device(memoryRequired, device);
+    ggml_vk_device device;
+    bool ok = ggml_vk_get_device(&device, memoryRequired, name.c_str());
+    if (ok) {
+        d_ptr->device = device.index;
+        return true;
+    }
 #else
-    return false;
+    (void)memoryRequired;
+    (void)name;
 #endif
+    return false;
 }
 
 bool LLamaModel::initializeGPUDevice(const LLModel::GPUDevice &device, std::string *unavail_reason)
 {
-    bool result = false;
 #if defined(GGML_USE_KOMPUTE)
-    ggml_vk_device vkDevice;
-    vkDevice.index = device.index;
-    vkDevice.type = device.type;
-    vkDevice.heapSize = device.heapSize;
-    vkDevice.name = device.name;
-    vkDevice.vendor = device.vendor;
-    result = ggml_vk_init_device(vkDevice);
-    if (!result && unavail_reason) {
-        *unavail_reason = "failed to init GPU";
-    }
+    (void)unavail_reason;
+    d_ptr->device = device.index;
+    return true;
 #else
+    (void)device;
     if (unavail_reason) {
         *unavail_reason = "built without Kompute";
     }
+    return false;
 #endif
-    return result;
 }
 
 bool LLamaModel::initializeGPUDevice(int device)
 {
 #if defined(GGML_USE_KOMPUTE)
-    return ggml_vk_init_device(device);
+    d_ptr->device = device;
+    return true;
 #else
+    (void)device;
     return false;
 #endif
 }
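Note: the Kompute device API used in the hunk above is C-style: ggml_vk_available_devices() returns a heap-allocated array that the caller frees, and ggml_vk_get_device() fills in a ggml_vk_device looked up by name. A standalone sketch of device enumeration, not part of this commit; only the fields and calls already shown in the hunk are assumed, and the zero memory requirement is illustrative.

#include <cstdio>
#include <cstdlib>
#include "ggml-kompute.h"

int main() {
    size_t count = 0;
    // returns a heap-allocated array of `count` devices, or NULL if none qualify
    ggml_vk_device * devs = ggml_vk_available_devices(/* memoryRequired = */ 0, &count);
    if (!devs) {
        std::fprintf(stderr, "no usable Vulkan device\n");
        return 1;
    }
    for (size_t i = 0; i < count; ++i) {
        // index/type/heapSize/name/vendor are the fields the hunk above reads
        std::printf("#%d %s (%s): heap %zu bytes\n",
                    devs[i].index, devs[i].name, devs[i].vendor, devs[i].heapSize);
    }
    std::free(devs);  // caller owns the array, as in availableGPUDevices() above
    return 0;
}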
@@ -402,7 +401,7 @@ bool LLamaModel::initializeGPUDevice(int device)
 bool LLamaModel::hasGPUDevice()
 {
 #if defined(GGML_USE_KOMPUTE)
-    return ggml_vk_has_device();
+    return d_ptr->device != -1;
 #else
     return false;
 #endif
@@ -411,11 +410,12 @@ bool LLamaModel::hasGPUDevice()
 bool LLamaModel::usingGPUDevice()
 {
 #if defined(GGML_USE_KOMPUTE)
-    return ggml_vk_using_vulkan();
+    return hasGPUDevice() && d_ptr->model_params.n_gpu_layers > 0;
 #elif defined(GGML_USE_METAL)
     return true;
-#endif
+#else
     return false;
+#endif
 }
 
 std::string get_arch_name(gguf_context *ctx_gguf) {
@@ -26,7 +26,7 @@ public:
     void setThreadCount(int32_t n_threads) override;
     int32_t threadCount() const override;
     std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired) override;
-    bool initializeGPUDevice(size_t memoryRequired, const std::string& device) override;
+    bool initializeGPUDevice(size_t memoryRequired, const std::string& name) override;
     bool initializeGPUDevice(const GPUDevice &device, std::string *unavail_reason) override;
     bool initializeGPUDevice(int device) override;
     bool hasGPUDevice() override;
@@ -17,11 +17,14 @@ public:
     using Token = int32_t;
 
     struct GPUDevice {
-        int index = 0;
-        int type = 0;
-        size_t heapSize = 0;
+        int index;
+        int type;
+        size_t heapSize;
         std::string name;
         std::string vendor;
+
+        GPUDevice(int index, int type, size_t heapSize, std::string name, std::string vendor):
+            index(index), type(type), heapSize(heapSize), name(std::move(name)), vendor(std::move(vendor)) {}
     };
 
     class Implementation {
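Note: GPUDevice loses its default member initializers and gains an explicit constructor, so callers build a device description in a single expression, as the llmodel_gpu_init_gpu_device_by_struct() hunk further down does. A minimal sketch, not part of this commit, assuming the struct is declared in llmodel.h; all values are illustrative.

#include <iostream>
#include "llmodel.h"   // assumed header name for the LLModel declaration

int main() {
    // with the new constructor, every field is supplied up front
    LLModel::GPUDevice dev(
        /* index    = */ 0,
        /* type     = */ 2,                // illustrative value for the int field
        /* heapSize = */ size_t(8) << 30,  // 8 GiB, illustrative
        /* name     = */ "Example GPU",
        /* vendor   = */ "Example Vendor"
    );
    std::cout << dev.name << " (" << dev.vendor << "), "
              << dev.heapSize << " bytes, index " << dev.index << "\n";
    return 0;
}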
@@ -98,14 +101,25 @@ public:
         return *m_implementation;
     }
 
-    virtual std::vector<GPUDevice> availableGPUDevices(size_t /*memoryRequired*/) { return std::vector<GPUDevice>(); }
-    virtual bool initializeGPUDevice(size_t /*memoryRequired*/, const std::string& /*device*/) { return false; }
-    virtual bool initializeGPUDevice(const GPUDevice &/*device*/, std::string *unavail_reason = nullptr) {
+    virtual std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired) {
+        (void)memoryRequired;
+        return {};
+    }
+
+    virtual bool initializeGPUDevice(size_t memoryRequired, const std::string& name) {
+        (void)memoryRequired;
+        (void)name;
+        return false;
+    }
+
+    virtual bool initializeGPUDevice(const GPUDevice & device, std::string *unavail_reason = nullptr) {
+        (void)device;
         if (unavail_reason) {
             *unavail_reason = "model has no GPU support";
         }
         return false;
     }
 
     virtual bool initializeGPUDevice(int /*device*/) { return false; }
     virtual bool hasGPUDevice() { return false; }
     virtual bool usingGPUDevice() { return false; }
@@ -230,12 +230,13 @@ bool llmodel_gpu_init_gpu_device_by_string(llmodel_model model, size_t memoryReq
 
 bool llmodel_gpu_init_gpu_device_by_struct(llmodel_model model, const llmodel_gpu_device *device)
 {
-    LLModel::GPUDevice d;
-    d.index = device->index;
-    d.type = device->type;
-    d.heapSize = device->heapSize;
-    d.name = device->name;
-    d.vendor = device->vendor;
+    LLModel::GPUDevice d(
+        /* index    = */ device->index,
+        /* type     = */ device->type,
+        /* heapSize = */ device->heapSize,
+        /* name     = */ device->name,
+        /* vendor   = */ device->vendor
+    );
     LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
     return wrapper->llModel->initializeGPUDevice(d);
 }
@@ -4,50 +4,6 @@
 #include <vector>
 #include <ggml.h>
 
-#if defined(GGML_USE_KOMPUTE)
-#include "ggml-kompute.h"
-struct llm_buffer {
-    uint8_t * addr = NULL;
-    size_t size = 0;
-    ggml_vk_memory memory;
-    bool force_cpu = false;
-
-    llm_buffer() = default;
-
-    void resize(size_t size) {
-        free();
-
-        if (!ggml_vk_has_device() || force_cpu) {
-            this->addr = new uint8_t[size];
-            this->size = size;
-        } else {
-            this->memory = ggml_vk_allocate(size);
-            this->addr = (uint8_t*)memory.data;
-            this->size = size;
-        }
-    }
-
-    void free() {
-        if (!memory.primaryMemory) {
-            delete[] addr;
-        } else if (memory.data) {
-            ggml_vk_free_memory(memory);
-        }
-        this->addr = NULL;
-        this->size = 0;
-    }
-
-    ~llm_buffer() {
-        free();
-    }
-
-    // disable copy and move
-    llm_buffer(const llm_buffer&) = delete;
-    llm_buffer(llm_buffer&&) = delete;
-    llm_buffer& operator=(const llm_buffer&) = delete;
-    llm_buffer& operator=(llm_buffer&&) = delete;
-};
-#else
 struct llm_buffer {
     uint8_t * addr = NULL;
     size_t size = 0;
@@ -62,7 +18,6 @@ struct llm_buffer {
         delete[] addr;
     }
 };
-#endif
 
 struct llm_kv_cache {
     struct ggml_tensor * k;