diff --git a/gpt4all-backend/CMakeLists.txt b/gpt4all-backend/CMakeLists.txt
index 385a691c..0aa64234 100644
--- a/gpt4all-backend/CMakeLists.txt
+++ b/gpt4all-backend/CMakeLists.txt
@@ -132,6 +132,7 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
         add_library(starcoder-${BUILD_VARIANT} SHARED
             starcoder.cpp utils.h utils.cpp llmodel_shared.cpp llmodel_shared.h)
+        target_compile_definitions(starcoder-${BUILD_VARIANT} PRIVATE LLAMA_VERSIONS=>=3 LLAMA_DATE=999999)
         prepare_target(starcoder llama-mainline)
     endif()
 endforeach()
diff --git a/gpt4all-backend/starcoder.cpp b/gpt4all-backend/starcoder.cpp
index 322405ea..1a0ef935 100644
--- a/gpt4all-backend/starcoder.cpp
+++ b/gpt4all-backend/starcoder.cpp
@@ -73,6 +73,7 @@ struct starcoder_model {
     llm_buffer eval_buf;
     llm_buffer scr0_buf;
     llm_buffer scr1_buf;
+    llm_buffer work_buf;
 };

 static bool kv_cache_init(
@@ -452,7 +453,7 @@ bool starcoder_model_load(const std::string & fname, starcoder_model & model, gp
 //   - embd_w:  the predicted logits for the next token
 //
 bool starcoder_eval(
-        const starcoder_model & model,
+        starcoder_model & model,
         const int n_threads,
         const int n_past,
         const std::vector<gpt_vocab::id> & embd_inp,
@@ -477,7 +478,6 @@ bool starcoder_eval(
     struct ggml_context * ctx0 = ggml_init(eval_ctx_params);
     struct ggml_cgraph gf = {};
-    gf.n_threads = n_threads;

     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
     memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
@@ -730,7 +730,7 @@ bool starcoder_eval(

     // run the computation
     ggml_build_forward_expand(&gf, inpL);
-    ggml_graph_compute (ctx0, &gf);
+    ggml_graph_compute_g4a(model.work_buf, &gf, n_threads);

     //if (n_past%100 == 0) {
     //    ggml_graph_print (&gf);
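
For context on the upstream API change this diff tracks: newer ggml removed the `n_threads` field from `ggml_cgraph` and instead takes the thread count and a caller-owned work buffer at compute time via `ggml_cplan`. Below is a minimal sketch of what the shared `ggml_graph_compute_g4a` helper plausibly looks like, assuming `llm_buffer` exposes an `addr` pointer and a `resize()` method as in `llmodel_shared.h`; treat it as an illustration of the plan-based API rather than the exact implementation.

```cpp
#include "llmodel_shared.h" // assumed to provide llm_buffer
#include <ggml.h>

// Wrap ggml's plan-based compute API: build a plan for the requested
// thread count, size the scratch buffer from the plan, point the plan
// at it, then run the graph.
void ggml_graph_compute_g4a(llm_buffer & buf, struct ggml_cgraph * graph, int n_threads)
{
    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
    if (plan.work_size > 0) {
        buf.resize(plan.work_size); // grow the reusable work buffer on demand
        plan.work_data = buf.addr;
    }
    ggml_graph_compute(graph, &plan);
}
```

Keeping `work_buf` as a member of `starcoder_model` (hence the dropped `const` on the `model` parameter of `starcoder_eval`) lets the scratch allocation be reused across eval calls instead of being reallocated for every token.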