From 1e14d91bc3b8bd2c586ef533f70e83fd4b473c71 Mon Sep 17 00:00:00 2001
From: James Ravenscroft
Date: Mon, 21 Aug 2023 20:20:35 +0100
Subject: [PATCH] increase scratch on starcoder

---
 src/main.cpp      |  4 +++-
 src/starcoder.cpp | 47 +++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 48 insertions(+), 3 deletions(-)

diff --git a/src/main.cpp b/src/main.cpp
index c26ea41..97d707d 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -177,4 +177,6 @@ int main(int argc, char **argv)
     app.port(program.get<int>("--port")).multithreaded().run();
 
     free(model);
-}
\ No newline at end of file
+}
+
+
diff --git a/src/starcoder.cpp b/src/starcoder.cpp
index c196f3c..f80cdd4 100644
--- a/src/starcoder.cpp
+++ b/src/starcoder.cpp
@@ -5,6 +5,12 @@
 #include
 #include
 
+#ifdef GGML_USE_CLBLAST
+#include "ggml-opencl.h"
+#endif
+#ifdef GGML_USE_CUBLAS
+#include "ggml-cuda.h"
+#endif
 
 // evaluate the transformer
 //
@@ -36,10 +42,10 @@ bool starcoder_eval(
 
     // use 2 scratch buffers
     // TODO: very hacky solution - reimplement in a more elegant way
-    static size_t scr0_size = 256u*1024*1024;
+    static size_t scr0_size = 512u*1024*1024;
     static void * scr0 = malloc(scr0_size);
 
-    static size_t scr1_size = 256u*1024*1024;
+    static size_t scr1_size = 512u*1024*1024;
     static void * scr1 = malloc(scr1_size);
 
     if (mem_per_token > 0 && mem_per_token*N > buf_size) {
@@ -677,6 +683,43 @@ bool StarcoderModel::load_model(std::string fname) {
 
     fin.close();
 
+
+    #if defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUBLAS)
+
+    printf("inside ggml clblast check\n");
+
+    if(config.n_gpu_layers > 0){
+        size_t vram_total = 0;
+        int gpu_layers = std::min(config.n_gpu_layers, model->hparams.n_layer);
+        spdlog::info("Attempting to offload {} layers to GPU", gpu_layers);
+
+
+        for(int i=0; i < gpu_layers; i++) {
+            const auto & layer = model->layers[i];
+            layer.c_attn_attn_w->backend = GGML_BACKEND_GPU;
+            layer.c_attn_proj_w->backend = GGML_BACKEND_GPU;
+            layer.c_mlp_fc_w->backend = GGML_BACKEND_GPU;
+            layer.c_mlp_proj_w->backend = GGML_BACKEND_GPU;
+
+            #if defined(GGML_USE_CLBLAST)
+            ggml_cl_transform_tensor(layer.c_attn_attn_w->data, layer.c_attn_attn_w); vram_total += ggml_nbytes(layer.c_attn_attn_w);
+            ggml_cl_transform_tensor(layer.c_attn_proj_w->data, layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w);
+            ggml_cl_transform_tensor(layer.c_mlp_fc_w->data, layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w);
+            ggml_cl_transform_tensor(layer.c_mlp_proj_w->data, layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w);
+            #else
+            ggml_cuda_transform_tensor(layer.c_attn_attn_w->data, layer.c_attn_attn_w); vram_total += ggml_nbytes(layer.c_attn_attn_w);
+            ggml_cuda_transform_tensor(layer.c_attn_proj_w->data, layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w);
+            ggml_cuda_transform_tensor(layer.c_mlp_fc_w->data, layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w);
+            ggml_cuda_transform_tensor(layer.c_mlp_proj_w->data, layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w);
+            #endif
+        }
+
+        fprintf(stderr, "%s: [GPU] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+    }
+
+    #endif // defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUBLAS)
+
+
     return true;
 
 }
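
Note for reviewers: the offload block added to load_model() follows the pattern used by the upstream ggml examples: clamp the requested layer count to the model depth, flip each weight tensor's backend flag to GGML_BACKEND_GPU, hand the tensor to the backend-specific transform call, and tally ggml_nbytes() into a running VRAM counter. Below is a minimal stand-alone sketch of that bookkeeping; Tensor, Layer, and transform_tensor() are hypothetical stand-ins rather than the real ggml API, and only the control flow mirrors the patch.

    // Minimal, self-contained sketch of the offload bookkeeping in this patch.
    // Tensor, Layer and transform_tensor() are hypothetical stand-ins for the
    // real ggml structs and for ggml_cl_transform_tensor() /
    // ggml_cuda_transform_tensor().
    #include <algorithm>
    #include <cstddef>
    #include <cstdio>
    #include <vector>

    enum class Backend { CPU, GPU };

    struct Tensor {
        Backend backend = Backend::CPU;
        size_t  nbytes  = 0;   // stand-in for ggml_nbytes(tensor)
    };

    struct Layer {
        Tensor c_attn_attn_w, c_attn_proj_w, c_mlp_fc_w, c_mlp_proj_w;
    };

    // Stand-in for the backend-specific upload: the real functions copy the
    // tensor's data into VRAM via OpenCL (CLBlast) or CUDA (cuBLAS).
    static void transform_tensor(Tensor &t) { t.backend = Backend::GPU; }

    int main() {
        std::vector<Layer> layers(40);              // StarCoder-15B has 40 layers
        for (auto &l : layers)                      // fake weight sizes for the demo
            l.c_attn_attn_w.nbytes = l.c_attn_proj_w.nbytes =
                l.c_mlp_fc_w.nbytes = l.c_mlp_proj_w.nbytes = 64u * 1024 * 1024;

        const int n_gpu_layers = 20;                // user-requested layer count
        size_t vram_total = 0;

        // Clamp to the model's real depth, as the patch does with
        // std::min(config.n_gpu_layers, model->hparams.n_layer).
        const int gpu_layers = std::min<int>(n_gpu_layers, (int)layers.size());

        for (int i = 0; i < gpu_layers; i++) {
            Layer &layer = layers[i];
            for (Tensor *t : {&layer.c_attn_attn_w, &layer.c_attn_proj_w,
                              &layer.c_mlp_fc_w, &layer.c_mlp_proj_w}) {
                transform_tensor(*t);               // move the weight to the GPU
                vram_total += t->nbytes;            // and count it toward VRAM
            }
        }

        printf("[GPU] total VRAM used: %zu MB\n", vram_total / 1024 / 1024);
        return 0;
    }

Clamping with std::min means that requesting more layers than the model has simply offloads everything, rather than reading past the end of model->layers.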