Mirror of https://github.com/ravenscroftj/turbopilot.git (synced 2024-10-01 05:06:01 +00:00)
increase scratch on starcoder
commit b2b4a1480f (parent 5f7155a314)
@@ -191,4 +191,6 @@ int main(int argc, char **argv)
 
 
     free(model);
 }
+
+
@@ -5,6 +5,12 @@
 #include <ggml/ggml.h>
 #include <spdlog/spdlog.h>
 
+#ifdef GGML_USE_CLBLAST
+#include "ggml-opencl.h"
+#endif
+#ifdef GGML_USE_CUBLAS
+#include "ggml-cuda.h"
+#endif
 
 // evaluate the transformer
 //
@@ -36,10 +42,10 @@ bool starcoder_eval(
 
     // use 2 scratch buffers
     // TODO: very hacky solution - reimplement in a more elegant way
-    static size_t scr0_size = 256u*1024*1024;
+    static size_t scr0_size = 512u*1024*1024;
     static void * scr0 = malloc(scr0_size);
 
-    static size_t scr1_size = 256u*1024*1024;
+    static size_t scr1_size = 512u*1024*1024;
     static void * scr1 = malloc(scr1_size);
 
    if (mem_per_token > 0 && mem_per_token*N > buf_size) {
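Both scratch buffers are doubled here from 256 MB to 512 MB, presumably because StarCoder's intermediate tensors were overflowing the smaller pools. For context, such buffers are consumed through ggml's scratch API while the compute graph is built: tensors created while a scratch buffer is active are allocated from it instead of the context's main pool. A minimal sketch of that pattern, assuming 2023-era ggml (scratch_demo, the context size, and the tensor shape are illustrative, not part of this commit):

    #include <cstdlib>
    #include <ggml/ggml.h>

    // Illustrative sketch: how eval code typically uses the scratch
    // buffer this commit enlarges.
    static void scratch_demo() {
        static size_t scr0_size = 512u*1024*1024;  // value after this commit
        static void * scr0      = malloc(scr0_size);

        struct ggml_init_params params = { 16u*1024*1024, nullptr, false };
        struct ggml_context * ctx0 = ggml_init(params);

        // route subsequent tensor allocations into scr0
        ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
        struct ggml_tensor * tmp = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1024);
        (void) tmp;  // intermediate lives in scratch memory

        // switch back to normal allocation for tensors that must persist
        ggml_set_scratch(ctx0, { 0, 0, nullptr, });

        ggml_free(ctx0);
        free(scr0);
    }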
@@ -677,6 +683,43 @@ bool StarcoderModel::load_model(std::string fname) {
 
     fin.close();
 
+
+#if defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUBLAS)
+
+    printf("inside ggml clblast check\n");
+
+    if(config.n_gpu_layers > 0){
+        size_t vram_total = 0;
+        int gpu_layers = std::min(config.n_gpu_layers, model->hparams.n_layer);
+        spdlog::info("Attempting to offload %d layers to GPU", gpu_layers);
+
+
+        for(int i=0; i < gpu_layers; i++) {
+            const auto & layer = model->layers[i];
+            layer.c_attn_attn_w->backend = GGML_BACKEND_GPU;
+            layer.c_attn_proj_w->backend = GGML_BACKEND_GPU;
+            layer.c_mlp_fc_w->backend = GGML_BACKEND_GPU;
+            layer.c_mlp_proj_w->backend = GGML_BACKEND_GPU;
+
+#if defined(GGML_USE_CLBLAST)
+            ggml_cl_transform_tensor(layer.c_attn_attn_w->data, layer.c_attn_attn_w); vram_total += ggml_nbytes(layer.c_attn_attn_w);
+            ggml_cl_transform_tensor(layer.c_attn_proj_w->data, layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w);
+            ggml_cl_transform_tensor(layer.c_mlp_fc_w->data, layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w);
+            ggml_cl_transform_tensor(layer.c_mlp_proj_w->data, layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w);
+#else
+            ggml_cuda_transform_tensor(layer.c_attn_attn_w->data, layer.c_attn_attn_w); vram_total += ggml_nbytes(layer.c_attn_attn_w);
+            ggml_cuda_transform_tensor(layer.c_attn_proj_w->data, layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w);
+            ggml_cuda_transform_tensor(layer.c_mlp_fc_w->data, layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w);
+            ggml_cuda_transform_tensor(layer.c_mlp_proj_w->data, layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w);
+#endif
+        }
+
+        fprintf(stderr, "%s: [GPU] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+    }
+
+#endif // defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUBLAS)
+
+
     return true;
 }
 
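The offload block added above repeats one per-tensor step eight times: set the weight's backend to GGML_BACKEND_GPU, transform it with the active backend, and add its byte size to the VRAM tally. As a sketch only, that step could be factored into a helper (offload_tensor is hypothetical, not part of this commit):

    #include <ggml/ggml.h>
    #ifdef GGML_USE_CLBLAST
    #include "ggml-opencl.h"
    #endif
    #ifdef GGML_USE_CUBLAS
    #include "ggml-cuda.h"
    #endif

    // Hypothetical helper distilling the per-tensor offload step used in
    // the hunk above: mark the weight GPU-resident, hand it to the active
    // backend's transform function, and return its size for VRAM accounting.
    static size_t offload_tensor(struct ggml_tensor * t) {
        t->backend = GGML_BACKEND_GPU;
    #if defined(GGML_USE_CLBLAST)
        ggml_cl_transform_tensor(t->data, t);
    #elif defined(GGML_USE_CUBLAS)
        ggml_cuda_transform_tensor(t->data, t);
    #endif
        return ggml_nbytes(t);
    }

Each line in the loop would then read vram_total += offload_tensor(layer.c_attn_attn_w); and likewise for the other three weights.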