diff --git a/include/turbopilot/model.hpp b/include/turbopilot/model.hpp
index 2849d08..e5afca7 100644
--- a/include/turbopilot/model.hpp
+++ b/include/turbopilot/model.hpp
@@ -43,6 +43,7 @@ struct ModelConfig
     int32_t seed = -1; // RNG seed
     int32_t n_ctx = 512; // context size
     int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_gpu_layers = 0;
 };
 
 class TurbopilotModel
diff --git a/src/gptneox.cpp b/src/gptneox.cpp
index fc3fcf0..40558a9 100644
--- a/src/gptneox.cpp
+++ b/src/gptneox.cpp
@@ -3,6 +3,13 @@
 #include
 
+#ifdef GGML_USE_CLBLAST
+#include "ggml-opencl.h"
+#endif
+#ifdef GGML_USE_CUBLAS
+#include "ggml-cuda.h"
+#endif
+
 #include
 #include
 
 
@@ -50,6 +57,7 @@ ggml_tensor * gpt_neox_ff(
 }
 
 
+
 // evaluate the transformer
 //
 // - model: the model
@@ -606,9 +614,43 @@ bool GPTNEOXModel::load_model(std::string fname) {
         printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
     }
 
-    fin.close();
 
+    #if defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUBLAS)
+
+    printf("inside ggml clblast check\n");
+
+    if(config.n_gpu_layers > 0){
+        size_t vram_total = 0;
+        int gpu_layers = std::min(config.n_gpu_layers, model->hparams.n_layer);
+        spdlog::info("Attempting to offload {} layers to GPU", gpu_layers);
+
+
+        for(int i=0; i < gpu_layers; i++) {
+            const auto & layer = model->layers[i];
+            layer.c_attn_attn_w->backend = GGML_BACKEND_GPU;
+            layer.c_attn_proj_w->backend = GGML_BACKEND_GPU;
+            layer.c_mlp_fc_w->backend = GGML_BACKEND_GPU;
+            layer.c_mlp_proj_w->backend = GGML_BACKEND_GPU;
+
+            #if defined(GGML_USE_CLBLAST)
+            ggml_cl_transform_tensor(layer.c_attn_attn_w->data, layer.c_attn_attn_w); vram_total += ggml_nbytes(layer.c_attn_attn_w);
+            ggml_cl_transform_tensor(layer.c_attn_proj_w->data, layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w);
+            ggml_cl_transform_tensor(layer.c_mlp_fc_w->data, layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w);
+            ggml_cl_transform_tensor(layer.c_mlp_proj_w->data, layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w);
+            #else
+            ggml_cuda_transform_tensor(layer.c_attn_attn_w->data, layer.c_attn_attn_w); vram_total += ggml_nbytes(layer.c_attn_attn_w);
+            ggml_cuda_transform_tensor(layer.c_attn_proj_w->data, layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w);
+            ggml_cuda_transform_tensor(layer.c_mlp_fc_w->data, layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w);
+            ggml_cuda_transform_tensor(layer.c_mlp_proj_w->data, layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w);
+            #endif
+        }
+
+        fprintf(stderr, "%s: [GPU] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+    }
+
+    #endif // defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUBLAS)
+
     return true;
 }
 
 
diff --git a/src/main.cpp b/src/main.cpp
index 32ddcf9..8a3fe2e 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -33,6 +33,12 @@ int main(int argc, char **argv)
         .scan<'i', int>();
 
 
+    program.add_argument("--ngl", "--n-gpu-layers")
+        .help("The number of layers to offload to GPU")
+        .default_value(0)
+        .scan<'i', int>();
+
+
     program.add_argument("-p", "--port")
         .help("The tcp port that turbopilot should listen on")
         .default_value(18080)
@@ -70,6 +76,7 @@ int main(int argc, char **argv)
 
     std::mt19937 rng(program.get<int>("--random-seed"));
     config.n_threads = program.get<int>("--threads");
+    config.n_gpu_layers = program.get<int>("--ngl");
 
     if(model_type.compare("codegen") == 0) {
         spdlog::info("Initializing GPT-J type model for '{}' model", model_type);