From b19a3e5b2c273765bb75e9a9aa6d7c685a74dce0 Mon Sep 17 00:00:00 2001
From: Aaron Miller
Date: Mon, 26 Jun 2023 12:17:34 -0700
Subject: [PATCH] add requiredMem method to llmodel impls

most of these can just shortcut out of the model loading logic

llama is a bit worse to deal with because we submodule it so I have to at
least parse the hparams, and then I just use the size on disk as an estimate
for the mem size (which seems reasonable since we mmap() the llama files
anyway)
---
 gpt4all-backend/gptj.cpp                     | 27 +++++++++++++++-
 gpt4all-backend/gptj_impl.h                  |  1 +
 gpt4all-backend/llamamodel.cpp               | 34 ++++++++++++++++++++
 gpt4all-backend/llamamodel_impl.h            |  1 +
 gpt4all-backend/llmodel.h                    |  1 +
 gpt4all-backend/llmodel_c.cpp                |  6 ++++
 gpt4all-backend/llmodel_c.h                  |  8 +++++
 gpt4all-backend/mpt.cpp                      | 33 ++++++++++++++++---
 gpt4all-backend/mpt_impl.h                   |  1 +
 gpt4all-backend/replit.cpp                   | 30 +++++++++++++++--
 gpt4all-backend/replit_impl.h                |  1 +
 gpt4all-bindings/python/gpt4all/pyllmodel.py | 12 +++++++
 gpt4all-chat/chatgpt.cpp                     |  6 ++++
 gpt4all-chat/chatgpt.h                       |  1 +
 14 files changed, 154 insertions(+), 8 deletions(-)

diff --git a/gpt4all-backend/gptj.cpp b/gpt4all-backend/gptj.cpp
index 0aacd854..62925d9f 100644
--- a/gpt4all-backend/gptj.cpp
+++ b/gpt4all-backend/gptj.cpp
@@ -158,8 +158,11 @@ static bool kv_cache_init(
 }

 // load the model's weights from a stream
-bool gptj_model_load(const std::string &fname, std::istream &fin, gptj_model & model, gpt_vocab & vocab) {
+bool gptj_model_load(const std::string &fname, std::istream &fin, gptj_model & model, gpt_vocab & vocab, size_t * mem_req = nullptr) {
     printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
+    if(mem_req != nullptr) {
+        *mem_req = 0;
+    }

     // verify magic
     {
@@ -276,6 +279,19 @@ bool gptj_model_load(const std::string &fname, std::istream &fin, gptj_model & m
         printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
     }

+    if (mem_req != nullptr) {
+        *mem_req += ctx_size;
+        const int n_embd = model.hparams.n_embd;
+        const int n_layer = model.hparams.n_layer;
+
+        const int64_t n_mem = (int64_t)n_layer*model.hparams.n_ctx;
+        const int64_t n_elements = n_embd*n_mem;
+
+        *mem_req += (2u*n_elements*ggml_type_size(wtype) + 2_MiB);
+        return false;
+    }
+
+
     // create the ggml context
     {
         struct ggml_init_params params = {
@@ -837,6 +853,15 @@ GPTJ::GPTJ()
     d_ptr->modelLoaded = false;
 }

+size_t GPTJ::requiredMem(const std::string &modelPath) {
+    gptj_model dummy_model;
+    gpt_vocab dummy_vocab;
+    size_t mem_req;
+    auto fin = std::ifstream(modelPath, std::ios::binary);
+    gptj_model_load(modelPath, fin, dummy_model, dummy_vocab, &mem_req);
+    return mem_req;
+}
+
 bool GPTJ::loadModel(const std::string &modelPath) {
     std::mt19937 rng(time(NULL));
     d_ptr->rng = rng;
diff --git a/gpt4all-backend/gptj_impl.h b/gpt4all-backend/gptj_impl.h
index 4dda3ad5..93e27319 100644
--- a/gpt4all-backend/gptj_impl.h
+++ b/gpt4all-backend/gptj_impl.h
@@ -17,6 +17,7 @@ public:

     bool loadModel(const std::string &modelPath) override;
     bool isModelLoaded() const override;
+    size_t requiredMem(const std::string &modelPath) override;
     size_t stateSize() const override;
     size_t saveState(uint8_t *dest) const override;
     size_t restoreState(const uint8_t *src) override;
diff --git a/gpt4all-backend/llamamodel.cpp b/gpt4all-backend/llamamodel.cpp
index 4cdfd359..93a899b5 100644
--- a/gpt4all-backend/llamamodel.cpp
+++ b/gpt4all-backend/llamamodel.cpp
@@ -97,6 +97,40 @@ LLamaModel::LLamaModel()
     d_ptr->modelLoaded = false;
 }

+// default hparams (LLaMA 7B)
+struct llama_file_hparams {
+    uint32_t n_vocab = 32000;
+    uint32_t n_embd = 4096;
+    uint32_t n_mult = 256;
+    uint32_t n_head = 32;
+    uint32_t n_layer = 32;
+    uint32_t n_rot = 64;
+    enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
+};
+
+size_t LLamaModel::requiredMem(const std::string &modelPath) {
+    auto fin = std::ifstream(modelPath, std::ios::binary);
+    fin.seekg(0, std::ios_base::end);
+    size_t filesize = fin.tellg();
+    fin.seekg(0, std::ios_base::beg);
+    uint32_t magic = 0;
+    fin.read(reinterpret_cast<char*>(&magic), sizeof(magic));
+    if (magic != 0x67676a74) return 0;
+    uint32_t version = 0;
+    fin.read(reinterpret_cast<char*>(&version), sizeof(version));
+    llama_file_hparams hparams;
+    fin.read(reinterpret_cast<char*>(&hparams.n_vocab), sizeof(hparams.n_vocab));
+    fin.read(reinterpret_cast<char*>(&hparams.n_embd), sizeof(hparams.n_embd));
+    fin.read(reinterpret_cast<char*>(&hparams.n_head), sizeof(hparams.n_head));
+    fin.read(reinterpret_cast<char*>(&hparams.n_layer), sizeof(hparams.n_layer));
+    fin.read(reinterpret_cast<char*>(&hparams.n_rot), sizeof(hparams.n_rot));
+    fin.read(reinterpret_cast<char*>(&hparams.ftype), sizeof(hparams.ftype));
+    const size_t n_ctx = 2048;
+    const size_t kvcache_element_size = 2; // fp16
+    const size_t est_kvcache_size = hparams.n_embd * hparams.n_layer * 2u * n_ctx * kvcache_element_size;
+    return filesize + est_kvcache_size;
+}
+
 bool LLamaModel::loadModel(const std::string &modelPath)
 {
     // load the model
diff --git a/gpt4all-backend/llamamodel_impl.h b/gpt4all-backend/llamamodel_impl.h
index 10404576..7623f157 100644
--- a/gpt4all-backend/llamamodel_impl.h
+++ b/gpt4all-backend/llamamodel_impl.h
@@ -17,6 +17,7 @@ public:

     bool loadModel(const std::string &modelPath) override;
     bool isModelLoaded() const override;
+    size_t requiredMem(const std::string &modelPath) override;
     size_t stateSize() const override;
     size_t saveState(uint8_t *dest) const override;
     size_t restoreState(const uint8_t *src) override;
diff --git a/gpt4all-backend/llmodel.h b/gpt4all-backend/llmodel.h
index ecd7d05b..8e3e5ea2 100644
--- a/gpt4all-backend/llmodel.h
+++ b/gpt4all-backend/llmodel.h
@@ -59,6 +59,7 @@ public:

     virtual bool loadModel(const std::string &modelPath) = 0;
     virtual bool isModelLoaded() const = 0;
+    virtual size_t requiredMem(const std::string &modelPath) = 0;
     virtual size_t stateSize() const { return 0; }
     virtual size_t saveState(uint8_t */*dest*/) const { return 0; }
     virtual size_t restoreState(const uint8_t */*src*/) { return 0; }
diff --git a/gpt4all-backend/llmodel_c.cpp b/gpt4all-backend/llmodel_c.cpp
index 883b87b0..78de7e9a 100644
--- a/gpt4all-backend/llmodel_c.cpp
+++ b/gpt4all-backend/llmodel_c.cpp
@@ -60,6 +60,12 @@ void llmodel_model_destroy(llmodel_model model) {
     delete reinterpret_cast<LLModelWrapper*>(model);
 }

+size_t llmodel_required_mem(llmodel_model model, const char *model_path)
+{
+    LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
+    return wrapper->llModel->requiredMem(model_path);
+}
+
 bool llmodel_loadModel(llmodel_model model, const char *model_path)
 {
     LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
diff --git a/gpt4all-backend/llmodel_c.h b/gpt4all-backend/llmodel_c.h
index c0d9249c..0d221c7e 100644
--- a/gpt4all-backend/llmodel_c.h
+++ b/gpt4all-backend/llmodel_c.h
@@ -107,6 +107,14 @@ llmodel_model llmodel_model_create2(const char *model_path, const char *build_va
  */
 void llmodel_model_destroy(llmodel_model model);

+/**
+ * Estimate RAM requirement for a model file
+ * @param model A pointer to the llmodel_model instance.
+ * @param model_path A string representing the path to the model file.
+ * @return size greater than 0 if the model was parsed successfully, 0 if file could not be parsed.
+ */
+size_t llmodel_required_mem(llmodel_model model, const char *model_path);
+
 /**
  * Load a model from a file.
  * @param model A pointer to the llmodel_model instance.
diff --git a/gpt4all-backend/mpt.cpp b/gpt4all-backend/mpt.cpp
index 8c3cebf1..7912ac3a 100644
--- a/gpt4all-backend/mpt.cpp
+++ b/gpt4all-backend/mpt.cpp
@@ -152,9 +152,13 @@ static bool kv_cache_init(
     return true;
 }

-// load the model's weights from a stream
-bool mpt_model_load(const std::string &fname, std::istream &fin, mpt_model & model, gpt_vocab & vocab) {
+// load the model's weights from a stream. if mem_req ptr is passed the model is
+// only partially parsed to estimate required memory
+bool mpt_model_load(const std::string &fname, std::istream &fin, mpt_model & model, gpt_vocab & vocab, size_t * mem_req) {
     printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
+    if (mem_req != nullptr) {
+        *mem_req = 0;
+    }

     // verify magic
     {
@@ -276,6 +280,18 @@ bool mpt_model_load(const std::string &fname, std::istream &fin, mpt_model & mod
         printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
     }

+    if (mem_req != nullptr) {
+        *mem_req += ctx_size;
+        const int n_embd = model.hparams.n_embd;
+        const int n_layer = model.hparams.n_layer;
+
+        const int64_t n_mem = (int64_t)n_layer*model.hparams.n_ctx;
+        const int64_t n_elements = n_embd*n_mem;
+
+        *mem_req += (2u*n_elements*ggml_type_size(wtype) + 2_MiB);
+        return false;
+    }
+
     // create the ggml context
     {
         struct ggml_init_params params = {
@@ -431,7 +447,7 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo
         return false;
     }

-    bool loaded = mpt_model_load(fname, fin, model, vocab);
+    bool loaded = mpt_model_load(fname, fin, model, vocab, nullptr);
     fin.close();
     return loaded;
 }
@@ -761,6 +777,15 @@ MPT::MPT()
     d_ptr->modelLoaded = false;
 }

+size_t MPT::requiredMem(const std::string &modelPath) {
+    mpt_model dummy_model;
+    gpt_vocab dummy_vocab;
+    size_t mem_req;
+    auto fin = std::ifstream(modelPath, std::ios::binary);
+    mpt_model_load(modelPath, fin, dummy_model, dummy_vocab, &mem_req);
+    return mem_req;
+}
+
 bool MPT::loadModel(const std::string &modelPath) {
     std::mt19937 rng(time(NULL));
     d_ptr->rng = rng;
@@ -768,7 +793,7 @@ bool MPT::loadModel(const std::string &modelPath) {
     auto fin = std::ifstream(modelPath, std::ios::binary);

     // load the model
-    if (!mpt_model_load(modelPath, fin, *d_ptr->model, d_ptr->vocab)) {
+    if (!mpt_model_load(modelPath, fin, *d_ptr->model, d_ptr->vocab, nullptr)) {
         std::cerr << "MPT ERROR: failed to load model from " << modelPath;
         return false;
     }
diff --git a/gpt4all-backend/mpt_impl.h b/gpt4all-backend/mpt_impl.h
index ee0998c7..f5156836 100644
--- a/gpt4all-backend/mpt_impl.h
+++ b/gpt4all-backend/mpt_impl.h
@@ -17,6 +17,7 @@ public:

     bool loadModel(const std::string &modelPath) override;
     bool isModelLoaded() const override;
+    size_t requiredMem(const std::string &modelPath) override;
     size_t stateSize() const override;
     size_t saveState(uint8_t *dest) const override;
     size_t restoreState(const uint8_t *src) override;
diff --git a/gpt4all-backend/replit.cpp b/gpt4all-backend/replit.cpp
index df26aa44..a1c45abc 100644
--- a/gpt4all-backend/replit.cpp
+++ b/gpt4all-backend/replit.cpp
@@ -267,8 +267,11 @@ static bool kv_cache_init(
 }

 // load the model's weights from a stream
-bool replit_model_load(const std::string & fname, std::istream &fin, replit_model & model,
-                       replit_tokenizer & vocab) {
+bool replit_model_load(const std::string & fname, std::istream &fin, replit_model & model, replit_tokenizer & vocab, size_t *mem_req) {
     printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
+    if (mem_req != nullptr) {
+        *mem_req = 0;
+    }

     // verify magic
     {
@@ -352,6 +355,18 @@ bool replit_model_load(const std::string & fname, std::istream &fin, replit_mode
         printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size / (1024.0 * 1024.0));
     }

+    if (mem_req != nullptr) {
+        *mem_req += ctx_size;
+        const int n_embd = model.hparams.n_embd;
+        const int n_layer = model.hparams.n_layer;
+
+        const int64_t n_mem = (int64_t)n_layer*model.hparams.n_ctx;
+        const int64_t n_elements = n_embd*n_mem;
+
+        *mem_req += (2u*n_elements*ggml_type_size(wtype) + 2_MiB);
+        return false;
+    }
+
     // create the ggml context
     {
         struct ggml_init_params params = {
@@ -544,7 +559,7 @@ bool replit_model_load(const std::string & fname, replit_model & model, replit_t
         return false;
     }

-    bool loaded = replit_model_load(fname, fin, model, vocab);
+    bool loaded = replit_model_load(fname, fin, model, vocab, nullptr);
     fin.close();
     return loaded;
 }
@@ -888,6 +903,15 @@ Replit::Replit()
     d_ptr->modelLoaded = false;
 }

+size_t Replit::requiredMem(const std::string &modelPath) {
+    replit_model dummy_model;
+    replit_tokenizer dummy_vocab;
+    size_t mem_req;
+    auto fin = std::ifstream(modelPath, std::ios::binary);
+    replit_model_load(modelPath, fin, dummy_model, dummy_vocab, &mem_req);
+    return mem_req;
+}
+
 bool Replit::loadModel(const std::string &modelPath) {
     std::mt19937 rng(time(NULL));
     d_ptr->rng = rng;
@@ -895,7 +919,7 @@ bool Replit::loadModel(const std::string &modelPath) {
     auto fin = std::ifstream(modelPath, std::ios::binary);

     // load the model
-    if (!replit_model_load(modelPath, fin, *d_ptr->model, d_ptr->vocab)) {
+    if (!replit_model_load(modelPath, fin, *d_ptr->model, d_ptr->vocab, nullptr)) {
         std::cerr << "Replit ERROR: failed to load model from " << modelPath;
         return false;
     }
diff --git a/gpt4all-backend/replit_impl.h b/gpt4all-backend/replit_impl.h
index 0ff22aa4..73a8ea80 100644
--- a/gpt4all-backend/replit_impl.h
+++ b/gpt4all-backend/replit_impl.h
@@ -19,6 +19,7 @@ public:

     bool loadModel(const std::string &modelPath) override;
     bool isModelLoaded() const override;
+    size_t requiredMem(const std::string & modelPath) override;
     size_t stateSize() const override;
     size_t saveState(uint8_t *dest) const override;
     size_t restoreState(const uint8_t *src) override;
diff --git a/gpt4all-bindings/python/gpt4all/pyllmodel.py b/gpt4all-bindings/python/gpt4all/pyllmodel.py
index 820122c1..3795dc5d 100644
--- a/gpt4all-bindings/python/gpt4all/pyllmodel.py
+++ b/gpt4all-bindings/python/gpt4all/pyllmodel.py
@@ -81,6 +81,8 @@ llmodel.llmodel_model_destroy.restype = None

 llmodel.llmodel_loadModel.argtypes = [ctypes.c_void_p, ctypes.c_char_p]
 llmodel.llmodel_loadModel.restype = ctypes.c_bool
+llmodel.llmodel_required_mem.argtypes = [ctypes.c_void_p, ctypes.c_char_p]
+llmodel.llmodel_required_mem.restype = ctypes.c_size_t
 llmodel.llmodel_isModelLoaded.argtypes = [ctypes.c_void_p]
 llmodel.llmodel_isModelLoaded.restype = ctypes.c_bool

@@ -131,6 +133,16 @@ class LLModel:
         if self.model is not None:
             llmodel.llmodel_model_destroy(self.model)

+    def memory_needed(self, model_path: str) -> int:
+        model_path_enc = model_path.encode("utf-8")
+        self.model = llmodel.llmodel_model_create(model_path_enc)
+
+        if self.model is not None:
+            return llmodel.llmodel_required_mem(self.model, model_path_enc)
+        else:
+            raise ValueError("Unable to instantiate model")
+
+
     def load_model(self, model_path: str) -> bool:
         """
         Load model from a file.
diff --git a/gpt4all-chat/chatgpt.cpp b/gpt4all-chat/chatgpt.cpp
index 64637907..5d378930 100644
--- a/gpt4all-chat/chatgpt.cpp
+++ b/gpt4all-chat/chatgpt.cpp
@@ -20,6 +20,12 @@ ChatGPT::ChatGPT()
 {
 }

+size_t ChatGPT::requiredMem(const std::string &modelPath)
+{
+    Q_UNUSED(modelPath);
+    return 0;
+}
+
 bool ChatGPT::loadModel(const std::string &modelPath)
 {
     Q_UNUSED(modelPath);
diff --git a/gpt4all-chat/chatgpt.h b/gpt4all-chat/chatgpt.h
index 4c8a123d..af06a4bb 100644
--- a/gpt4all-chat/chatgpt.h
+++ b/gpt4all-chat/chatgpt.h
@@ -16,6 +16,7 @@ public:

     bool loadModel(const std::string &modelPath) override;
     bool isModelLoaded() const override;
+    size_t requiredMem(const std::string &modelPath) override;
     size_t stateSize() const override;
     size_t saveState(uint8_t *dest) const override;
     size_t restoreState(const uint8_t *src) override;
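Usage note: the new estimate is exposed as LLModel::requiredMem() in C++, llmodel_required_mem() in the C API, and LLModel.memory_needed() in the Python bindings. A minimal sketch of how a caller might consume it from Python, e.g. to warn before attempting a load, follows. It assumes LLModel can be constructed with no arguments (as in the pyllmodel.py touched above) and that psutil is available for reading free RAM; both are assumptions for illustration, not something this patch provides.

    import psutil  # assumption: available in the caller's environment, used only to read free RAM

    from gpt4all.pyllmodel import LLModel

    model_path = "/path/to/ggml-model.bin"

    llm = LLModel()                          # assumption: no-arg constructor
    needed = llm.memory_needed(model_path)   # estimate in bytes; 0 if the file could not be parsed
    available = psutil.virtual_memory().available

    print(f"estimated requirement: {needed / (1024 ** 2):.0f} MiB, "
          f"available: {available / (1024 ** 2):.0f} MiB")

    if needed and needed <= available:
        llm.load_model(model_path)
    else:
        print("model may not fit in RAM; skipping load")

Keep in mind the estimate is deliberately coarse: for llama it is file size plus an fp16 KV-cache term (n_embd * n_layer * 2 * n_ctx * 2 bytes), and for the other backends it is the ggml context size plus the same style of KV-cache term with a small fixed margin, so it should be treated as a rough estimate rather than a hard guarantee.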