diff --git a/gpt4all-backend/gptj.cpp b/gpt4all-backend/gptj.cpp
index e7233a15..385171e1 100644
--- a/gpt4all-backend/gptj.cpp
+++ b/gpt4all-backend/gptj.cpp
@@ -5,6 +5,7 @@
 #include "llmodel_shared.h"
 
 #include <cassert>
+#include <cinttypes>
 #include <cmath>
 #include <cstdio>
 #include <cstring>
@@ -85,7 +86,9 @@ struct gptj_model {
     struct ggml_context * ctx;
     std::map<std::string, struct ggml_tensor *> tensors;
 
-    llm_buffer buf;
+    llm_buffer eval_buf;
+    llm_buffer scr0_buf;
+    llm_buffer scr1_buf;
 
     ~gptj_model() {
         if (ctx) {
@@ -393,7 +396,7 @@ bool gptj_model_load(const std::string &fname, std::istream &fin, gptj_model & m
         }
 
         if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
-            fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%lu, %lu], expected [%d, %d]\n",
+            fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%" PRId64 ", %" PRId64 "], expected [%d, %d]\n",
                     __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
             return false;
         }
@@ -438,6 +441,9 @@ bool gptj_model_load(const std::string &fname, std::istream &fin, gptj_model & m
 
         printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
     }
 
+    model.scr0_buf.resize(256u * 1024 * 1024);
+    model.scr1_buf.resize(256u * 1024 * 1024);
+
     return true;
 }
@@ -484,24 +490,24 @@ bool gptj_eval(
     const int n_rot = hparams.n_rot;
 
     const size_t init_buf_size = 1024_MiB;
-    if (!model.buf.addr || model.buf.size < init_buf_size)
-        model.buf.resize(init_buf_size);
+    if (!model.eval_buf.addr || model.eval_buf.size < init_buf_size)
+        model.eval_buf.resize(init_buf_size);
 
-    if (mem_per_token > 0 && mem_per_token*N > model.buf.size) {
+    if (mem_per_token > 0 && mem_per_token*N > model.eval_buf.size) {
         const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
-        printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, model.buf.size, buf_size_new);
+        printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, model.eval_buf.size, buf_size_new);
 
         // reallocate
-        model.buf.resize(buf_size_new);
-        if (model.buf.addr == nullptr) {
-            fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, model.buf.size);
+        model.eval_buf.resize(buf_size_new);
+        if (model.eval_buf.addr == nullptr) {
+            fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, model.eval_buf.size);
             return false;
         }
     }
 
     struct ggml_init_params params = {
-        .mem_size   = model.buf.size,
-        .mem_buffer = model.buf.addr,
+        .mem_size   = model.eval_buf.size,
+        .mem_buffer = model.eval_buf.addr,
         .no_alloc = false
     };
 
@@ -517,7 +523,7 @@
 
     for (int il = 0; il < n_layer; ++il) {
         struct ggml_tensor * cur;
-
+        ggml_set_scratch(ctx0, {0, model.scr0_buf.size, model.scr0_buf.addr, });
         // norm
         {
             cur = ggml_norm(ctx0, inpL);
@@ -612,6 +618,7 @@
 
         struct ggml_tensor * inpFF = cur;
 
+        ggml_set_scratch(ctx0, {0, model.scr1_buf.size, model.scr1_buf.addr, });
         // feed-forward network
         // this is independent of the self-attention result, so it could be done in parallel to the self-attention
         {
@@ -645,6 +652,8 @@
         inpL = ggml_add(ctx0, cur, inpL);
     }
 
+    ggml_set_scratch(ctx0, {0, model.scr0_buf.size, model.scr0_buf.addr, });
+
    // norm
    {
        inpL = ggml_norm(ctx0, inpL);
@@ -657,6 +666,8 @@
                     ggml_repeat(ctx0, model.ln_f_b, inpL));
     }
 
+    ggml_set_scratch(ctx0, { 0, 0, nullptr, });
+
     // lm_head
     {
         inpL = ggml_mul_mat(ctx0, model.lmh_g, inpL);
diff --git a/gpt4all-backend/mpt.cpp b/gpt4all-backend/mpt.cpp
index ce019f3e..831a0487 100644
--- a/gpt4all-backend/mpt.cpp
+++ b/gpt4all-backend/mpt.cpp
@@ -5,6 +5,7 @@
 #include "llmodel_shared.h"
 
 #include <cassert>
+#include <cinttypes>
 #include <cmath>
 #include <cstdio>
 #include <cstring>
@@ -80,7 +81,9 @@ struct mpt_model {
 
     std::map<std::string, struct ggml_tensor *> tensors;
 
-    llm_buffer buf;
+    llm_buffer eval_buf;
+    llm_buffer scr0_buf;
+    llm_buffer scr1_buf;
 
     ~mpt_model() {
         if (ctx) {
@@ -370,8 +373,8 @@ bool mpt_model_load(const std::string &fname, std::istream &fin, mpt_model & mod
         }
 
         if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
-            fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
-                    __func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
+            fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%" PRId64 ", %" PRId64 "], expected [%d, %d]\n",
+                    __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
             return false;
         }
 
@@ -403,6 +406,9 @@ bool mpt_model_load(const std::string &fname, std::istream &fin, mpt_model & mod
 
         printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
     }
 
+    model.scr0_buf.resize(256u * 1024 * 1024);
+    model.scr1_buf.resize(256u * 1024 * 1024);
+
     return true;
 }
@@ -438,24 +444,24 @@ bool mpt_eval(
     const int n_vocab = hparams.n_vocab;
 
     const size_t init_buf_size = 1024_MiB;
-    if (!model.buf.addr || model.buf.size < init_buf_size)
-        model.buf.resize(init_buf_size);
+    if (!model.eval_buf.addr || model.eval_buf.size < init_buf_size)
+        model.eval_buf.resize(init_buf_size);
 
-    if (mem_per_token > 0 && mem_per_token*N > model.buf.size) {
+    if (mem_per_token > 0 && mem_per_token*N > model.eval_buf.size) {
         const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
         // printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, model.buf.size, buf_size_new);
 
         // reallocate
-        model.buf.resize(buf_size_new);
-        if (model.buf.addr == nullptr) {
-            fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, model.buf.size);
+        model.eval_buf.resize(buf_size_new);
+        if (model.eval_buf.addr == nullptr) {
+            fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, model.eval_buf.size);
             return false;
         }
     }
 
     struct ggml_init_params params = {
-        .mem_size   = model.buf.size,
-        .mem_buffer = model.buf.addr,
+        .mem_size   = model.eval_buf.size,
+        .mem_buffer = model.eval_buf.addr,
         .no_alloc = false
     };
 
@@ -470,6 +476,7 @@ bool mpt_eval(
     struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte, embd);
 
     for (int il = 0; il < n_layer; ++il) {
+        ggml_set_scratch(ctx0, {0, model.scr0_buf.size, model.scr0_buf.addr, });
 
         struct ggml_tensor * inpSA = inpL;
         struct ggml_tensor * cur = inpSA;
@@ -561,7 +568,7 @@ bool mpt_eval(
                 cur);
         }
-
+        ggml_set_scratch(ctx0, {0, model.scr1_buf.size, model.scr1_buf.addr, });
         // residual
         struct ggml_tensor * resSA = ggml_add(ctx0, cur, inpSA);
 
         // feed-forward network
@@ -586,6 +593,7 @@ bool mpt_eval(
         // self-attention + FF
         inpL = ggml_add(ctx0, cur, resSA);
     }
+    ggml_set_scratch(ctx0, {0, model.scr0_buf.size, model.scr0_buf.addr, });
 
     struct ggml_tensor * out = inpL;
     // -> logits
@@ -594,6 +602,7 @@ bool mpt_eval(
         out = ggml_mul(ctx0,
                 ggml_repeat(ctx0, model.norm_f_w, out),
                 out);
 
+        ggml_set_scratch(ctx0, { 0, 0, nullptr, });
         out = ggml_mul_mat(ctx0, model.wte, out);
     }
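Note: for readers unfamiliar with ggml scratch buffers, the standalone sketch below shows the allocation pattern this patch relies on. It is a minimal illustration, assuming a ggml revision that still exposes ggml_set_scratch() with struct ggml_scratch { offs, size, data } (the API the hunks above call); the buffer sizes, tensor shapes, and variable names are illustrative and not taken from gpt4all.

// Sketch: while a non-null scratch is set on a context (and no_alloc is false),
// tensor data for newly created tensors is placed in the scratch region rather
// than the context's own buffer. Building ops here only allocates/places the
// result tensors; actually computing them would still require a ggml graph.
#include <ggml.h>
#include <cstdint>
#include <vector>

int main() {
    std::vector<uint8_t> eval_buf(16u * 1024 * 1024); // persistent graph storage
    std::vector<uint8_t> scr0_buf( 8u * 1024 * 1024); // reusable scratch region

    struct ggml_init_params params = {
        .mem_size   = eval_buf.size(),
        .mem_buffer = eval_buf.data(),
        .no_alloc = false
    };
    struct ggml_context * ctx0 = ggml_init(params);

    // Allocated from the context's own (eval) buffer.
    struct ggml_tensor * a = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 4);

    // Route intermediate results into the scratch region.
    ggml_set_scratch(ctx0, {0, scr0_buf.size(), scr0_buf.data(), });
    struct ggml_tensor * tmp = ggml_sqr(ctx0, a); // data lives in scr0_buf

    // Disable the scratch for results that must outlive the layer.
    ggml_set_scratch(ctx0, { 0, 0, nullptr, });
    struct ggml_tensor * out = ggml_sqr(ctx0, tmp); // back in the eval buffer

    (void) out;
    ggml_free(ctx0);
    return 0;
}

This is why the per-layer temporaries in the patch can cycle through two fixed 256 MiB regions while eval_buf only has to hold tensors that persist across the whole graph; alternating between scr0_buf and scr1_buf at phase boundaries keeps an op's output from landing on top of inputs still being read from the other scratch.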