Merge pull request #54 from ravenscroftj/feature/debug-timing-logs

Implemented debug log level and added timings to model outputs
2024-06-28 23:32:20 +00:00 · 2023-08-23 17:17:20 +01:00 · 2023-08-23 17:17:20 +01:00 · 11f385066a
commit 11f385066a
parent 6d90e5d870 6a72e32dab
4 changed files with 36 additions and 2 deletions
--- a/src/gptj.cpp
+++ b/src/gptj.cpp
@@ -642,5 +642,9 @@ std::stringstream GPTJModel::predict(std::string prompt, int max_length, bool in
        }
    }

+    spdlog::debug("{}:   sample time = {:8.2f} ms\n", __func__, t_sample_us/1000.0f);
+    spdlog::debug("{}:  predict time = {:8.2f} ms / {:.2f} ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
+    
+
    return result;
 }
--- a/src/gptneox.cpp
+++ b/src/gptneox.cpp
@@ -638,6 +638,12 @@ std::stringstream GPTNEOXModel::predict(std::string prompt, int max_length, bool

    gpt_neox_eval((*model), config.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);

+    const int64_t t_start_us = ggml_time_us();
+
+    int64_t t_prompt_us = 0;
+    
+    int64_t t_response_us = 0;
+
    for (int i = embd.size(); i < embd_inp.size() + n_predict; i++) {
        // predict
        if (embd.size() > 0) {
@@ -694,7 +700,8 @@ std::stringstream GPTNEOXModel::predict(std::string prompt, int max_length, bool
            }
            i += embd.size() - 1;
        }
-        
+
+

        // end of text token
        //if (embd.back() == 50256) {
@@ -703,5 +710,10 @@ std::stringstream GPTNEOXModel::predict(std::string prompt, int max_length, bool
        }
    }

+    spdlog::debug("{}:   sample time = {:8.2f} ms\n", __func__, t_sample_us/1000.0f);
+    spdlog::debug("{}:  predict time = {:8.2f} ms / {:.2f} ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
+    
+    
+
    return result;
 }
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -14,10 +14,18 @@
 #include "turbopilot/gptneox.hpp"
 #include "turbopilot/server.hpp"

+
+#define TURBOPILOT_VERSION "1.1.0"
+
 int main(int argc, char **argv)
 {

-    argparse::ArgumentParser program("turbopilot");
+    argparse::ArgumentParser program("turbopilot", TURBOPILOT_VERSION);
+
+    program.add_argument("--debug")
+        .default_value(false)
+        .help("Output verbose logs and timings")
+        .implicit_value(true);

    program.add_argument("-f", "--model-file")
        .help("Path to the model that turbopilot should serve")
@@ -56,6 +64,7 @@ int main(int argc, char **argv)
    program.add_argument("prompt").remaining();


+
    try
    {
        program.parse_args(argc, argv);
@@ -67,6 +76,11 @@ int main(int argc, char **argv)
        return 1;
    }

+    if(program.get<bool>("--debug")){
+        spdlog::set_level(spdlog::level::level_enum::debug);
+        spdlog::debug("debug logging enabled");
+    }
+
    ggml_time_init();

    const int64_t t_main_start_us = ggml_time_us();
--- a/src/starcoder.cpp
+++ b/src/starcoder.cpp
@@ -766,6 +766,10 @@ std::stringstream StarcoderModel::predict(std::string prompt, int max_length, bo
            break;
        }
    }
+    spdlog::debug("{}:   sample time = {:8.2f} ms\n", __func__, t_sample_us/1000.0f);
+    spdlog::debug("{}:  predict time = {:8.2f} ms / {:.2f} ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
+    
+    

    return result;
 }