starcoder: use ggml_graph_plan

2024-10-01 01:06:10 -04:00 · 2023-07-27 16:56:47 -07:00 · 2023-07-27 16:56:47 -07:00 · 33c22be2aa
commit 33c22be2aa
parent 27a8b020c3
2 changed files with 4 additions and 3 deletions
--- a/gpt4all-backend/CMakeLists.txt
+++ b/gpt4all-backend/CMakeLists.txt
@ -132,6 +132,7 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)

        add_library(starcoder-${BUILD_VARIANT} SHARED
            starcoder.cpp utils.h utils.cpp llmodel_shared.cpp llmodel_shared.h)
+        target_compile_definitions(starcoder-${BUILD_VARIANT} PRIVATE LLAMA_VERSIONS=>=3 LLAMA_DATE=999999)
        prepare_target(starcoder llama-mainline)
    endif()
 endforeach()
--- a/gpt4all-backend/starcoder.cpp
+++ b/gpt4all-backend/starcoder.cpp
@ -73,6 +73,7 @@ struct starcoder_model {
    llm_buffer eval_buf;
    llm_buffer scr0_buf;
    llm_buffer scr1_buf;
+    llm_buffer work_buf;
 };

 static bool kv_cache_init(
@ -452,7 +453,7 @@ bool starcoder_model_load(const std::string & fname, starcoder_model & model, gp
 //   - embd_w:    the predicted logits for the next token
 //
 bool starcoder_eval(
-        const starcoder_model & model,
+        starcoder_model & model,
        const int n_threads,
        const int n_past,
        const std::vector<gpt_vocab::id> & embd_inp,
@ -477,7 +478,6 @@ bool starcoder_eval(

    struct ggml_context * ctx0 = ggml_init(eval_ctx_params);
    struct ggml_cgraph gf = {};
-    gf.n_threads = n_threads;

    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
    memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
@ -730,7 +730,7 @@ bool starcoder_eval(

    // run the computation
    ggml_build_forward_expand(&gf, inpL);
-    ggml_graph_compute       (ctx0, &gf);
+    ggml_graph_compute_g4a(model.work_buf, &gf, n_threads);

    //if (n_past%100 == 0) {
    //    ggml_graph_print   (&gf);