diff --git a/include/turbopilot/gptj.hpp b/include/turbopilot/gptj.hpp
index 9f1798b..c2acc39 100644
--- a/include/turbopilot/gptj.hpp
+++ b/include/turbopilot/gptj.hpp
@@ -71,7 +71,7 @@ public:
     }
     virtual ~GPTJModel();
     bool load_model(std::string path);
-    virtual std::stringstream predict(std::string prompt, int max_length, bool include_prompt);
+    virtual std::stringstream predict_impl(std::string prompt, int max_length, bool include_prompt);
 
 private:
     gptj_model *model = NULL;
diff --git a/include/turbopilot/gptneox.hpp b/include/turbopilot/gptneox.hpp
index 66aeb95..78c3ed1 100644
--- a/include/turbopilot/gptneox.hpp
+++ b/include/turbopilot/gptneox.hpp
@@ -75,7 +75,7 @@ public:
     }
     virtual ~GPTNEOXModel();
     bool load_model(std::string path);
-    virtual std::stringstream predict(std::string prompt, int max_length, bool include_prompt);
+    virtual std::stringstream predict_impl(std::string prompt, int max_length, bool include_prompt);
 
 private:
     gpt_neox_model *model = NULL;
diff --git a/include/turbopilot/model.hpp b/include/turbopilot/model.hpp
index 2849d08..3fcf28a 100644
--- a/include/turbopilot/model.hpp
+++ b/include/turbopilot/model.hpp
@@ -7,6 +7,7 @@
 #include <string>
 #include <sstream>
 #include <random>
+#include <boost/thread.hpp>
 
 typedef void (*offload_func_t)(struct ggml_tensor * tensor);
 void ggml_nop(struct ggml_tensor * tensor);
@@ -54,11 +55,16 @@ public:
         rng(rng) {}
 
     virtual bool load_model(std::string model_path) = 0;
-    virtual std::stringstream predict(std::string prompt, int max_length, bool include_prompt) = 0;
+    std::stringstream predict(std::string prompt, int max_length, bool include_prompt);
+    void lock();
+    void unlock();
+
+protected:
+    virtual std::stringstream predict_impl(std::string prompt, int max_length, bool include_prompt) = 0;
 
     ModelConfig config;
     std::mt19937 &rng;
+    boost::mutex model_lock;
 };
 
 #endif //__TURBOPILOT_MODEL_H
\ No newline at end of file
diff --git a/include/turbopilot/server.hpp b/include/turbopilot/server.hpp
index 7000544..18d30bd 100644
--- a/include/turbopilot/server.hpp
+++ b/include/turbopilot/server.hpp
@@ -2,6 +2,8 @@
 #define __TURBOPILOT_SERVER_H
 
+#include <spdlog/spdlog.h>
+
 #include "turbopilot/model.hpp"
 #include "crow_all.h"
 
@@ -10,6 +12,46 @@ crow::response handle_openai_request(TurbopilotModel *model, const crow::request
 
 crow::response handle_hf_request(TurbopilotModel *model, const crow::request& req);
 
+class TBPLogger : public crow::ILogHandler {
+public:
+    TBPLogger() {}
+
+    void log(std::string message, crow::LogLevel crow_level) {
+        // "message" doesn't carry the timestamp and log-level prefix
+        // that the default logger adds, and it doesn't end
+        // in a newline.
+
+        spdlog::level::level_enum level = spdlog::level::info;
+
+        switch (crow_level) {
+            case crow::LogLevel::Critical:
+                level = spdlog::level::critical;
+                break;
+
+            case crow::LogLevel::Error:
+                level = spdlog::level::err;
+                break;
+
+            case crow::LogLevel::Warning:
+                level = spdlog::level::warn;
+                break;
+
+            case crow::LogLevel::Info:
+                level = spdlog::level::info;
+                break;
+
+            case crow::LogLevel::Debug:
+                level = spdlog::level::debug;
+                break;
+
+            default:
+                // if the level is not a known value, assume the worst
+                level = spdlog::level::critical;
+        }
+
+        spdlog::log(level, message);
+    }
+};
+
 #endif // __TURBOPILOT_SERVER_H
diff --git a/include/turbopilot/starcoder.hpp b/include/turbopilot/starcoder.hpp
index f5b7344..1ee94a0 100644
--- a/include/turbopilot/starcoder.hpp
+++ b/include/turbopilot/starcoder.hpp
@@ -68,7 +68,7 @@ public:
     }
     virtual ~StarcoderModel();
     bool load_model(std::string path);
-    virtual std::stringstream predict(std::string prompt, int max_length, bool include_prompt);
+    virtual std::stringstream predict_impl(std::string prompt, int max_length, bool include_prompt);
 
 private:
     starcoder_model *model = NULL;
diff --git a/src/common.cpp b/src/common.cpp
index 2439c56..6435e48 100644
--- a/src/common.cpp
+++ b/src/common.cpp
@@ -4,6 +4,22 @@
 #include <cmath>
 #include <random>
 
+
+void TurbopilotModel::lock(){
+    this->model_lock.lock();
+}
+
+void TurbopilotModel::unlock(){
+    this->model_lock.unlock();
+}
+
+std::stringstream TurbopilotModel::predict(std::string prompt, int max_length, bool include_prompt){
+    lock();
+    auto result = predict_impl(prompt, max_length, include_prompt);
+    unlock();
+    return result;
+}
+
 void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
     (void) tensor;
 }
@@ -163,4 +179,6 @@ gpt_vocab::id gpt_sample_top_k_top_p(
     int idx = dist(rng);
 
     return logits_id[idx].second;
-}
\ No newline at end of file
+}
+
+
diff --git a/src/gptj.cpp b/src/gptj.cpp
index f00e107..80aaeac 100644
--- a/src/gptj.cpp
+++ b/src/gptj.cpp
@@ -556,7 +556,7 @@ bool GPTJModel::load_model(std::string fname) {
     return true;
 }
 
-std::stringstream GPTJModel::predict(std::string prompt, int max_length, bool include_prompt) {
+std::stringstream GPTJModel::predict_impl(std::string prompt, int max_length, bool include_prompt) {
     std::stringstream result;
 
     // tokenize the prompt
diff --git a/src/gptneox.cpp b/src/gptneox.cpp
index 51665c7..fa7b08b 100644
--- a/src/gptneox.cpp
+++ b/src/gptneox.cpp
@@ -91,6 +91,7 @@ bool gpt_neox_eval(
         const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
         //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
 
+        // reallocate
         buf_size = buf_size_new;
         buf = realloc(buf, buf_size);
 
@@ -98,6 +99,8 @@
             fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
             return false;
         }
+
+        spdlog::debug("{}: reallocating context buffer {} -> {} bytes, tokens in prompt = {}", __func__, buf_size, buf_size_new, N);
     }
 
     struct ggml_init_params params = {
@@ -283,6 +286,7 @@
     // ggml_graph_print (&gf);
     // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
     //}
+
 
     //embd_w.resize(n_vocab*N);
     //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
@@ -293,7 +297,9 @@
 
     if (mem_per_token == 0) {
         mem_per_token = ggml_used_mem(ctx0)/N;
+
     }
 
+    spdlog::debug("used_mem = {}", ggml_used_mem(ctx0));
     //printf("used_mem = %zu\n", ggml_used_mem(ctx0));
 
     ggml_free(ctx0);
@@ -612,7 +618,7 @@ bool GPTNEOXModel::load_model(std::string fname) {
     return true;
 }
 
-std::stringstream GPTNEOXModel::predict(std::string prompt, int max_length, bool include_prompt) {
+std::stringstream GPTNEOXModel::predict_impl(std::string prompt, int max_length, bool include_prompt) {
     std::stringstream result;
 
     // tokenize the prompt
@@ -631,6 +637,8 @@ std::stringstream GPTNEOXModel::predict(std::string prompt, int max_length, bool
 
     std::vector<gpt_vocab::id> embd;
 
+
+
     // determine the required inference memory per token:
     size_t mem_per_token = 0;
 
@@ -717,3 +725,4 @@ std::stringstream GPTNEOXModel::predict(std::string prompt, int max_length, bool
 
     return result;
 }
+
diff --git a/src/main.cpp b/src/main.cpp
index c6ef84c..04c1fb3 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -121,12 +121,17 @@ int main(int argc, char **argv)
     }
 
     t_load_us = ggml_time_us() - t_start_us;
 
+    spdlog::info("Loaded model in {:0.2f}ms", t_load_us/1000.0f);
 
     crow::SimpleApp app;
 
+    TBPLogger logger;
+
+    crow::logger::setHandler(&logger);
+
     CROW_ROUTE(app, "/")([](){
         return "Hello world";
     });
 
@@ -168,5 +173,7 @@ int main(int argc, char **argv)
 
     app.port(program.get<int>("--port")).multithreaded().run();
 
+
+    delete model;
 }
\ No newline at end of file
diff --git a/src/server.cpp b/src/server.cpp
index b373137..80a4235 100644
--- a/src/server.cpp
+++ b/src/server.cpp
@@ -37,7 +37,6 @@ crow::response handle_hf_request(TurbopilotModel *model, const crow::request& re
 
     crow::json::wvalue response = {
         {"generated_text", result.str()},
     };
-
 
     crow::response res;
diff --git a/src/starcoder.cpp b/src/starcoder.cpp
index c196f3c..5cbd425 100644
--- a/src/starcoder.cpp
+++ b/src/starcoder.cpp
@@ -44,13 +44,13 @@ bool starcoder_eval(
     if (mem_per_token > 0 && mem_per_token*N > buf_size) {
         const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
-        //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
+        spdlog::debug("{}: reallocating buffer from {} to {} bytes", __func__, buf_size, buf_size_new);
 
         // reallocate
         buf_size = buf_size_new;
         buf = realloc(buf, buf_size);
         if (buf == nullptr) {
-            fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
+            spdlog::error("{}: failed to allocate {} bytes", __func__, buf_size);
             return false;
         }
     }
@@ -681,7 +681,7 @@ bool StarcoderModel::load_model(std::string fname) {
 
 }
 
-std::stringstream StarcoderModel::predict(std::string prompt, int max_length, bool include_prompt) {
+std::stringstream StarcoderModel::predict_impl(std::string prompt, int max_length, bool include_prompt) {
     std::stringstream result;
 
     // tokenize the prompt
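
Note on the locking pattern above: TurbopilotModel::predict() takes model_lock with explicit lock()/unlock() calls around the virtual predict_impl(), so if predict_impl() ever throws, the mutex is never released and every later request blocks. Below is a minimal, self-contained sketch of the same template-method-plus-mutex pattern using RAII locking instead. This is illustrative only, not the patch's code: it substitutes std::mutex and std::lock_guard for the patch's boost::mutex (boost::lock_guard behaves the same way), and EchoModel is a hypothetical stand-in for GPTJModel/GPTNEOXModel/StarcoderModel.

#include <iostream>
#include <mutex>
#include <sstream>
#include <string>

class Model {
public:
    virtual ~Model() = default;

    // Public entry point: serializes access to the single shared model.
    // The lock_guard releases model_lock on every exit path, including
    // an exception thrown by predict_impl().
    std::stringstream predict(std::string prompt, int max_length, bool include_prompt) {
        std::lock_guard<std::mutex> guard(model_lock);
        return predict_impl(prompt, max_length, include_prompt);
    }

protected:
    // Backends implement the actual inference here, as in the patch.
    virtual std::stringstream predict_impl(std::string prompt, int max_length, bool include_prompt) = 0;

private:
    std::mutex model_lock;
};

// Hypothetical toy backend, standing in for the real model classes.
class EchoModel : public Model {
protected:
    std::stringstream predict_impl(std::string prompt, int max_length, bool include_prompt) override {
        std::stringstream result;
        if (include_prompt) result << prompt;
        result << " /* up to " << max_length << " generated tokens */";
        return result;
    }
};

int main() {
    EchoModel model;
    // Concurrent callers of predict() are serialized by model_lock.
    std::cout << model.predict("int main() {", 64, true).str() << std::endl;
    return 0;
}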