diff --git a/.github/workflows/build-commit.yml b/.github/workflows/build-commit.yml index ec59bd9..809823a 100644 --- a/.github/workflows/build-commit.yml +++ b/.github/workflows/build-commit.yml @@ -92,7 +92,7 @@ jobs: submodules: true - name: Install Dependencies - run: sudo apt-get update && sudo apt-get install -yq libboost-dev libasio-dev + run: sudo apt-get update && sudo apt-get install -yq libboost-dev libasio-dev libboost-thread-dev - name: Install OpenBlas if: ${{ matrix.build == 'avx2-openblas' }} @@ -207,6 +207,7 @@ jobs: $msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim())) $lib = $(join-path $msvc 'bin\Hostx64\x64\lib.exe') & $lib /machine:x64 "/def:${env:RUNNER_TEMP}/openblas/lib/libopenblas.def" "/out:${env:RUNNER_TEMP}/openblas/lib/openblas.lib" /name:openblas.dll + - name: Build id: cmake_build env: @@ -214,7 +215,7 @@ jobs: run: | mkdir build cd build - cmake .. ${{ matrix.defines }} + cmake .. ${{ matrix.defines }} -DBOOST_LIBRARYDIR=${{ steps.install-boost.outputs.BOOST_ROOT }}/lib cmake --build . 
--config Release --target turbopilot # - name: Add libopenblas.dll diff --git a/CMakeLists.txt b/CMakeLists.txt index 22c174d..45a9fe7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,15 @@ cmake_minimum_required (VERSION 3.0) project(turbopilot VERSION 0.1.0) +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED true) +set(CMAKE_C_STANDARD 11) +set(CMAKE_C_STANDARD_REQUIRED true) +set(THREADS_PREFER_PTHREAD_FLAG ON) +find_package(Threads REQUIRED) + + +# option(BUILD_SHARED_LIBS "Build using shared libraries" OFF) + set(CMAKE_EXPORT_COMPILE_COMMANDS "on") set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) @@ -27,6 +37,9 @@ if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES endif() + + + add_subdirectory(extern/ggml) add_subdirectory(extern/argparse) add_subdirectory(extern/spdlog) @@ -35,8 +48,11 @@ if (GGML_STATIC) SET(CMAKE_FIND_LIBRARY_SUFFIXES ".a") SET(BUILD_SHARED_LIBS OFF) SET(CMAKE_EXE_LINKER_FLAGS "-static") -endif() + # if(GGML_OPENBLAS) + # set(BLA_STATIC ON) + # endif() +endif() add_subdirectory(src) diff --git a/Dockerfile.cuda11 b/Dockerfile.cuda11 index acbb7c2..bbeeff3 100644 --- a/Dockerfile.cuda11 +++ b/Dockerfile.cuda11 @@ -7,7 +7,7 @@ RUN apt-get update && apt-get install ca-certificates gpg wget RUN wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null RUN echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ jammy main' | tee /etc/apt/sources.list.d/kitware.list >/dev/null -RUN apt-get update && apt-get install -y build-essential cmake libboost-dev +RUN apt-get update && apt-get install -y build-essential cmake libboost-dev libboost-thread-dev ADD ./ /turbopilot diff --git a/Dockerfile.cuda12 b/Dockerfile.cuda12 index f72656b..ff27869 100644 --- a/Dockerfile.cuda12 +++ b/Dockerfile.cuda12 @@ -7,7 +7,7 @@ RUN apt-get update && apt-get 
install ca-certificates gpg wget RUN wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null RUN echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ focal main' | tee /etc/apt/sources.list.d/kitware.list >/dev/null -RUN apt-get update && apt-get install -y build-essential cmake libboost-dev +RUN apt-get update && apt-get install -y build-essential cmake libboost-dev libboost-thread-dev ADD ./ /turbopilot diff --git a/Dockerfile.default b/Dockerfile.default index 1a26bd0..c0eaec9 100644 --- a/Dockerfile.default +++ b/Dockerfile.default @@ -1,6 +1,6 @@ FROM alpine AS build -RUN apk add --update alpine-sdk boost-dev cmake asio-dev +RUN apk add --update alpine-sdk boost-dev cmake asio-dev ADD ./ /turbopilot/ diff --git a/include/turbopilot/gptj.hpp b/include/turbopilot/gptj.hpp index 9f1798b..c2acc39 100644 --- a/include/turbopilot/gptj.hpp +++ b/include/turbopilot/gptj.hpp @@ -71,7 +71,7 @@ public: } virtual ~GPTJModel(); bool load_model(std::string path); - virtual std::stringstream predict(std::string prompt, int max_length, bool include_prompt); + virtual std::stringstream predict_impl(std::string prompt, int max_length, bool include_prompt); private: gptj_model *model = NULL; diff --git a/include/turbopilot/gptneox.hpp b/include/turbopilot/gptneox.hpp index 66aeb95..78c3ed1 100644 --- a/include/turbopilot/gptneox.hpp +++ b/include/turbopilot/gptneox.hpp @@ -75,7 +75,7 @@ public: } virtual ~GPTNEOXModel(); bool load_model(std::string path); - virtual std::stringstream predict(std::string prompt, int max_length, bool include_prompt); + virtual std::stringstream predict_impl(std::string prompt, int max_length, bool include_prompt); private: gpt_neox_model *model = NULL; diff --git a/include/turbopilot/model.hpp b/include/turbopilot/model.hpp index 2849d08..1dfe160 100644 --- a/include/turbopilot/model.hpp +++ 
b/include/turbopilot/model.hpp @@ -7,6 +7,7 @@ #include #include #include +#include typedef void (*offload_func_t)(struct ggml_tensor * tensor); void ggml_nop(struct ggml_tensor * tensor); @@ -54,11 +55,16 @@ public: rng(rng) {} virtual bool load_model(std::string model_path) = 0; - virtual std::stringstream predict(std::string prompt, int max_length, bool include_prompt) = 0; + std::stringstream predict(std::string prompt, int max_length, bool include_prompt); + void lock(); + void unlock(); + protected: + virtual std::stringstream predict_impl(std::string prompt, int max_length, bool include_prompt) = 0; ModelConfig config; std::mt19937 &rng; + std::mutex model_lock; }; #endif //__TURBOPILOT_MODEL_H \ No newline at end of file diff --git a/include/turbopilot/server.hpp b/include/turbopilot/server.hpp index 7000544..18d30bd 100644 --- a/include/turbopilot/server.hpp +++ b/include/turbopilot/server.hpp @@ -2,6 +2,8 @@ #define __TURBOPILOT_SERVER_H +#include + #include "turbopilot/model.hpp" #include "crow_all.h" @@ -10,6 +12,46 @@ crow::response handle_openai_request(TurbopilotModel *model, const crow::request crow::response handle_hf_request(TurbopilotModel *model, const crow::request& req); +class TBPLogger : public crow::ILogHandler { + public: + TBPLogger() {} + void log(std::string message, crow::LogLevel crow_level) { + // "message" doesn't contain the timestamp and loglevel + // prefix the default logger does and it doesn't end + // in a newline. 
+ + spdlog::level::level_enum level = spdlog::level::info; + + switch(crow_level){ + case crow::LogLevel::Critical: + level = spdlog::level::critical; + break; + + case crow::LogLevel::Error: + level = spdlog::level::err; + break; + + case crow::LogLevel::Warning: + level = spdlog::level::warn; + break; + + case crow::LogLevel::Info: + level = spdlog::level::info; + break; + + case crow::LogLevel::Debug: + level = spdlog::level::debug; + break; + + default: + // if case is not a known value, assume the worst + level = spdlog::level::critical; + } + + spdlog::log(level, message); + } +}; + #endif // __TURBOPILOT_SERVER_H diff --git a/include/turbopilot/starcoder.hpp b/include/turbopilot/starcoder.hpp index f5b7344..1ee94a0 100644 --- a/include/turbopilot/starcoder.hpp +++ b/include/turbopilot/starcoder.hpp @@ -68,7 +68,7 @@ public: } virtual ~StarcoderModel(); bool load_model(std::string path); - virtual std::stringstream predict(std::string prompt, int max_length, bool include_prompt); + virtual std::stringstream predict_impl(std::string prompt, int max_length, bool include_prompt); private: starcoder_model *model = NULL; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 78f3618..d2c687e 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,6 +1,7 @@ set(TURBOPILOT_TARGET turbopilot) -find_package(Boost REQUIRED) +find_package(Boost COMPONENTS thread system REQUIRED) + include_directories(${Boost_INCLUDE_DIRS}) add_executable(${TURBOPILOT_TARGET} @@ -16,6 +17,9 @@ add_executable(${TURBOPILOT_TARGET} ../include/turbopilot/starcoder.hpp ) +#set(THREADS_PREFER_PTHREAD_FLAG TRUE) +#find_package(Threads REQUIRED) + target_include_directories(${TURBOPILOT_TARGET} PRIVATE ../include @@ -23,8 +27,6 @@ target_include_directories(${TURBOPILOT_TARGET} PRIVATE ../extern/crow/include ) +#target_compile_features(${TURBOPILOT_TARGET} PRIVATE cxx_std_11) -target_link_libraries(${TURBOPILOT_TARGET} PRIVATE ggml argparse) - - 
-#target_link_libraries(${TURBOPILOT_TARGET} PRIVATE spdlog::spdlog_header_only) \ No newline at end of file +target_link_libraries(${TURBOPILOT_TARGET} PRIVATE ggml argparse) \ No newline at end of file diff --git a/src/common.cpp b/src/common.cpp index 2439c56..6435e48 100644 --- a/src/common.cpp +++ b/src/common.cpp @@ -4,6 +4,22 @@ #include #include + +void TurbopilotModel::lock(){ + this->model_lock.lock(); +} + +void TurbopilotModel::unlock(){ + this->model_lock.unlock(); +} + +std::stringstream TurbopilotModel::predict(std::string prompt, int max_length, bool include_prompt){ + // RAII guard: releases model_lock even if predict_impl throws + std::lock_guard<std::mutex> guard(model_lock); + auto result = predict_impl(prompt, max_length, include_prompt); + return result; +} + void llama_nop(struct ggml_tensor * tensor) { // don't offload by default (void) tensor; } @@ -163,4 +179,6 @@ gpt_vocab::id gpt_sample_top_k_top_p( int idx = dist(rng); return logits_id[idx].second; -} \ No newline at end of file +} + + diff --git a/src/gptj.cpp b/src/gptj.cpp index f00e107..80aaeac 100644 --- a/src/gptj.cpp +++ b/src/gptj.cpp @@ -556,7 +556,7 @@ bool GPTJModel::load_model(std::string fname) { return true; } -std::stringstream GPTJModel::predict(std::string prompt, int max_length, bool include_prompt) { +std::stringstream GPTJModel::predict_impl(std::string prompt, int max_length, bool include_prompt) { std::stringstream result; // tokenize the prompt diff --git a/src/gptneox.cpp b/src/gptneox.cpp index 51665c7..fa7b08b 100644 --- a/src/gptneox.cpp +++ b/src/gptneox.cpp @@ -91,6 +91,7 @@ bool gpt_neox_eval( const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); + // reallocate buf_size = buf_size_new; buf = realloc(buf, buf_size); @@ -98,6 +99,8 @@ bool gpt_neox_eval( fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); return false; } + + spdlog::debug("{}: reallocating context buffer {} -> now {} 
bytes of tokens in prompt = {}", __func__, buf_size, buf_size_new, N); } struct ggml_init_params params = { @@ -283,6 +286,7 @@ bool gpt_neox_eval( // ggml_graph_print (&gf); // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); //} + //embd_w.resize(n_vocab*N); //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); @@ -293,7 +297,9 @@ bool gpt_neox_eval( if (mem_per_token == 0) { mem_per_token = ggml_used_mem(ctx0)/N; + } + spdlog::debug("used_mem = {}\n", ggml_used_mem(ctx0)); //printf("used_mem = %zu\n", ggml_used_mem(ctx0)); ggml_free(ctx0); @@ -612,7 +618,7 @@ bool GPTNEOXModel::load_model(std::string fname) { return true; } -std::stringstream GPTNEOXModel::predict(std::string prompt, int max_length, bool include_prompt) { +std::stringstream GPTNEOXModel::predict_impl(std::string prompt, int max_length, bool include_prompt) { std::stringstream result; // tokenize the prompt @@ -631,6 +637,8 @@ std::stringstream GPTNEOXModel::predict(std::string prompt, int max_length, bool std::vector embd; + + // determine the required inference memory per token: size_t mem_per_token = 0; @@ -717,3 +725,4 @@ std::stringstream GPTNEOXModel::predict(std::string prompt, int max_length, bool return result; } + diff --git a/src/main.cpp b/src/main.cpp index 0b6c8e8..df78e97 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -127,12 +127,17 @@ int main(int argc, char **argv) } t_load_us = ggml_time_us() - t_start_us; + spdlog::info("Loaded model in {:0.2f}ms", t_load_us/1000.0f); crow::SimpleApp app; + TBPLogger logger; + + crow::logger::setHandler(&logger); + CROW_ROUTE(app, "/")([](){ return "Hello world"; }); @@ -175,5 +180,7 @@ int main(int argc, char **argv) app.port(program.get("--port")).multithreaded().run(); + + delete model; } \ No newline at end of file diff --git a/src/server.cpp b/src/server.cpp index b373137..80a4235 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -37,7 +37,6 @@ crow::response handle_hf_request(TurbopilotModel *model, const crow::request& re 
crow::json::wvalue response = { {"generated_text", result.str()}, }; - crow::response res; diff --git a/src/starcoder.cpp b/src/starcoder.cpp index c196f3c..5cbd425 100644 --- a/src/starcoder.cpp +++ b/src/starcoder.cpp @@ -44,13 +44,13 @@ bool starcoder_eval( if (mem_per_token > 0 && mem_per_token*N > buf_size) { const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead - //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); + spdlog::debug("{}: reallocating buffer from {} to {} bytes\n", __func__, buf_size, buf_size_new); // reallocate buf_size = buf_size_new; buf = realloc(buf, buf_size); if (buf == nullptr) { - fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); + spdlog::error("{}: failed to allocate {} bytes\n", __func__, buf_size); return false; } } @@ -681,7 +681,7 @@ bool StarcoderModel::load_model(std::string fname) { } -std::stringstream StarcoderModel::predict(std::string prompt, int max_length, bool include_prompt) { +std::stringstream StarcoderModel::predict_impl(std::string prompt, int max_length, bool include_prompt) { std::stringstream result; // tokenize the prompt