diff --git a/include/turbopilot/model.hpp b/include/turbopilot/model.hpp
index 2849d08..e5afca7 100644
--- a/include/turbopilot/model.hpp
+++ b/include/turbopilot/model.hpp
@@ -43,6 +43,7 @@ struct ModelConfig
     int32_t seed = -1; // RNG seed
     int32_t n_ctx = 512; // context size
     int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_gpu_layers = 0;
 };
 
 class TurbopilotModel
diff --git a/src/gptneox.cpp b/src/gptneox.cpp
index fc3fcf0..40558a9 100644
--- a/src/gptneox.cpp
+++ b/src/gptneox.cpp
@@ -3,6 +3,13 @@
 #include
 
+#ifdef GGML_USE_CLBLAST
+#include "ggml-opencl.h"
+#endif
+#ifdef GGML_USE_CUBLAS
+#include "ggml-cuda.h"
+#endif
+
 #include
 #include
 
 
@@ -50,6 +57,7 @@ ggml_tensor * gpt_neox_ff(
 }
 
 
+
 // evaluate the transformer
 //
 // - model: the model
@@ -606,9 +614,43 @@ bool GPTNEOXModel::load_model(std::string fname) {
         printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
     }
 
-    fin.close();
 
+    #if defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUBLAS)
+
+    printf("inside ggml clblast check\n");
+
+    if(config.n_gpu_layers > 0){
+        size_t vram_total = 0;
+        int gpu_layers = std::min(config.n_gpu_layers, model->hparams.n_layer);
+        spdlog::info("Attempting to offload {} layers to GPU", gpu_layers);
+
+
+        for(int i=0; i < gpu_layers; i++) {
+            const auto & layer = model->layers[i];
+            layer.c_attn_attn_w->backend = GGML_BACKEND_GPU;
+            layer.c_attn_proj_w->backend = GGML_BACKEND_GPU;
+            layer.c_mlp_fc_w->backend = GGML_BACKEND_GPU;
+            layer.c_mlp_proj_w->backend = GGML_BACKEND_GPU;
+
+            #if defined(GGML_USE_CLBLAST)
+            ggml_cl_transform_tensor(layer.c_attn_attn_w->data, layer.c_attn_attn_w); vram_total += ggml_nbytes(layer.c_attn_attn_w);
+            ggml_cl_transform_tensor(layer.c_attn_proj_w->data, layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w);
+            ggml_cl_transform_tensor(layer.c_mlp_fc_w->data, layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w);
+            ggml_cl_transform_tensor(layer.c_mlp_proj_w->data, layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w);
+            #else
+            ggml_cuda_transform_tensor(layer.c_attn_attn_w->data, layer.c_attn_attn_w); vram_total += ggml_nbytes(layer.c_attn_attn_w);
+            ggml_cuda_transform_tensor(layer.c_attn_proj_w->data, layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w);
+            ggml_cuda_transform_tensor(layer.c_mlp_fc_w->data, layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w);
+            ggml_cuda_transform_tensor(layer.c_mlp_proj_w->data, layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w);
+            #endif
+        }
+
+        fprintf(stderr, "%s: [GPU] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+    }
+
+    #endif // defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUBLAS)
+
     return true;
 }
 
 
diff --git a/src/main.cpp b/src/main.cpp
index 32ddcf9..8a3fe2e 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -33,6 +33,12 @@ int main(int argc, char **argv)
         .scan<'i', int>();
 
 
+    program.add_argument("--ngl", "--n-gpu-layers")
+        .help("The number of layers to offload to GPU")
+        .default_value(0)
+        .scan<'i', int>();
+
+
     program.add_argument("-p", "--port")
         .help("The tcp port that turbopilot should listen on")
         .default_value(18080)
@@ -70,6 +76,7 @@ int main(int argc, char **argv)
 
     std::mt19937 rng(program.get<int>("--random-seed"));
     config.n_threads = program.get<int>("--threads");
+    config.n_gpu_layers = program.get<int>("--ngl");
 
     if(model_type.compare("codegen") == 0) {
         spdlog::info("Initializing GPT-J type model for '{}' model", model_type);