mirror of https://github.com/ravenscroftj/turbopilot.git

commit f818e2d09f (parent 8be7171573)

    add gpu offload for gptneox
@@ -43,6 +43,7 @@ struct ModelConfig
     int32_t seed = -1; // RNG seed
     int32_t n_ctx = 512; // context size
     int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_gpu_layers = 0;
 };
 
 class TurbopilotModel
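The hunk above adds a single field to the shared ModelConfig struct: n_gpu_layers, defaulting to 0 so that nothing is offloaded unless the caller asks for it. A minimal caller-side sketch of how the field is meant to be used (the constructor arguments and model path below are placeholders, not taken from this commit):

    ModelConfig config;
    config.n_gpu_layers = 20;                 // request offload of the first 20 layers; 0 keeps everything on CPU
    std::mt19937 rng(1234);                   // placeholder seed
    GPTNEOXModel model(config, rng);          // hypothetical construction; the real call site is in main()
    model.load_model("/path/to/model.bin");   // placeholder path; the offload happens inside load_model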
@@ -3,6 +3,13 @@
 #include <ggml/ggml.h>
 
+#ifdef GGML_USE_CLBLAST
+#include "ggml-opencl.h"
+#endif
+#ifdef GGML_USE_CUBLAS
+#include "ggml-cuda.h"
+#endif
+
 #include <cinttypes>
 
 #include <iostream>
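GGML_USE_CLBLAST and GGML_USE_CUBLAS are compile-time switches: the OpenCL header (ggml-opencl.h) or the CUDA header (ggml-cuda.h) is pulled in only when the corresponding ggml GPU backend was enabled when the project was built, matching ggml's own convention for these macros. On a CPU-only build neither branch is compiled and the rest of this commit is effectively a no-op.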
@@ -50,6 +57,7 @@ ggml_tensor * gpt_neox_ff(
 }
 
+
 // evaluate the transformer
 //
 //   - model: the model
@@ -606,9 +614,43 @@ bool GPTNEOXModel::load_model(std::string fname) {
 
         printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
     }
 
     fin.close();
 
+#if defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUBLAS)
+
+    printf("inside ggml clblast check\n");
+
+    if(config.n_gpu_layers > 0){
+        size_t vram_total = 0;
+        int gpu_layers = std::min(config.n_gpu_layers, model->hparams.n_layer);
+        spdlog::info("Attempting to offload %d layers to GPU", gpu_layers);
+
+
+        for(int i=0; i < gpu_layers; i++) {
+            const auto & layer = model->layers[i];
+            layer.c_attn_attn_w->backend = GGML_BACKEND_GPU;
+            layer.c_attn_proj_w->backend = GGML_BACKEND_GPU;
+            layer.c_mlp_fc_w->backend = GGML_BACKEND_GPU;
+            layer.c_mlp_proj_w->backend = GGML_BACKEND_GPU;
+
+#if defined(GGML_USE_CLBLAST)
+            ggml_cl_transform_tensor(layer.c_attn_attn_w->data,layer.c_attn_attn_w); vram_total += ggml_nbytes(layer.c_attn_attn_w);
+            ggml_cl_transform_tensor(layer.c_attn_proj_w->data,layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w);
+            ggml_cl_transform_tensor(layer.c_mlp_fc_w->data,layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w);
+            ggml_cl_transform_tensor(layer.c_mlp_proj_w->data,layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w);
+#else
+            ggml_cuda_transform_tensor(layer.c_attn_attn_w->data,layer.c_attn_attn_w); vram_total += ggml_nbytes(layer.c_attn_attn_w);
+            ggml_cuda_transform_tensor(layer.c_attn_proj_w->data,layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w);
+            ggml_cuda_transform_tensor(layer.c_mlp_fc_w->data,layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w);
+            ggml_cuda_transform_tensor(layer.c_mlp_proj_w->data,layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w);
+#endif
+        }
+
+        fprintf(stderr, "%s: [GPU] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+    }
+
+#endif // defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUBLAS)
+
     return true;
 }
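When a GPU backend is compiled in and the requested layer count is greater than zero, the loader walks the first min(n_gpu_layers, n_layer) transformer layers and moves the four large weight matrices of each layer (the combined attention QKV weight, the attention output projection, and the two MLP weights) to the GPU: each tensor is tagged GGML_BACKEND_GPU and then handed to ggml_cl_transform_tensor or ggml_cuda_transform_tensor, while vram_total accumulates the bytes moved for the final report. Note that spdlog uses {}-style placeholders rather than printf-style %d, so the layer count in that info message will print literally. Both branches repeat the same mark-then-transform pattern per tensor; a minimal sketch of how it could be collapsed into one helper (this helper is illustrative only and is not part of the commit):

    // Illustrative helper, not in the commit: move one weight tensor to the GPU
    // backend under whichever backend was compiled in, and return its size in bytes.
    static size_t offload_tensor(struct ggml_tensor * t) {
        t->backend = GGML_BACKEND_GPU;
    #if defined(GGML_USE_CLBLAST)
        ggml_cl_transform_tensor(t->data, t);
    #elif defined(GGML_USE_CUBLAS)
        ggml_cuda_transform_tensor(t->data, t);
    #endif
        return ggml_nbytes(t);
    }

    // Inside the layer loop this would reduce each layer to:
    //   vram_total += offload_tensor(layer.c_attn_attn_w);
    //   vram_total += offload_tensor(layer.c_attn_proj_w);
    //   vram_total += offload_tensor(layer.c_mlp_fc_w);
    //   vram_total += offload_tensor(layer.c_mlp_proj_w);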
@@ -33,6 +33,12 @@ int main(int argc, char **argv)
         .scan<'i', int>();
 
+
+    program.add_argument("--ngl", "--n-gpu-layers")
+        .help("The number of layers to offload to GPU")
+        .default_value(0)
+        .scan<'i', int>();
+
     program.add_argument("-p", "--port")
         .help("The tcp port that turbopilot should listen on")
         .default_value(18080)
@@ -70,6 +76,7 @@ int main(int argc, char **argv)
     std::mt19937 rng(program.get<int>("--random-seed"));
 
     config.n_threads = program.get<int>("--threads");
+    config.n_gpu_layers = program.get<int>("--ngl");
 
     if(model_type.compare("codegen") == 0) {
         spdlog::info("Initializing GPT-J type model for '{}' model", model_type);
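On the command-line side the new --ngl / --n-gpu-layers option is parsed as an integer and defaults to 0, so existing invocations behave exactly as before. The parsed value is copied into config.n_gpu_layers alongside the thread count and is only acted on inside load_model, where it is also clamped to the model's actual layer count. Assuming the built binary is named turbopilot, enabling offload is just a matter of adding --ngl 20 (or --n-gpu-layers 20) to the usual command line; the remaining flags, such as the model path, are unchanged and not shown in this diff.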