When regenerating, erase the previous response and prompt from the context.
commit f8005cff45
parent aa836fa6d5
gptj.cpp: 8 changed lines (+5, -3)
@@ -707,9 +707,11 @@ void GPTJ::prompt(const std::string &prompt, std::function<bool(const std::string&)> response,
             std::cerr << "GPT-J ERROR: Failed to process prompt\n";
             return;
         }
-        // We pass a null string to see if the user has asked us to stop...
-        if (!response(""))
-            return;
+        // We pass a null string for each token to see if the user has asked us to stop...
+        size_t tokens = batch_end - i;
+        for (size_t t = 0; t < tokens; ++t)
+            if (!response(""))
+                return;
         ctx.n_past += batch.size();
         i = batch_end;
     }
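Note (editor's sketch, not part of the commit): the loop above probes response("") once per token so a caller can cancel mid-batch rather than only once per batch. A minimal caller-side callback, with assumed names, that stops generation by returning false once a stop flag is set:

    #include <atomic>
    #include <iostream>
    #include <string>

    std::atomic<bool> stopRequested{false}; // set from another thread to cancel

    // Passed to GPTJ::prompt() as the response callback; the empty-string
    // calls added by this commit are pure "may I continue?" probes.
    bool onToken(const std::string &token) {
        if (!token.empty())
            std::cout << token << std::flush;
        return !stopRequested.load(); // false makes prompt() return early
    }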
gptj.h: 4 changed lines (+2, -2)
@@ -15,8 +15,8 @@ public:
     bool loadModel(const std::string &modelPath, std::istream &fin) override;
     bool isModelLoaded() const override;
     void prompt(const std::string &prompt, std::function<bool(const std::string&)> response,
-        PromptContext &ctx, int32_t n_predict = 200, int32_t top_k = 40, float top_p = 0.9f,
-        float temp = 0.9f, int32_t n_batch = 9) override;
+        PromptContext &ctx, int32_t n_predict = 200, int32_t top_k = 50400, float top_p = 1.0f,
+        float temp = 0.0f, int32_t n_batch = 9) override;

 private:
     GPTJPrivate *d_ptr;
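Note (editor's, not from the commit): 50400 is GPT-J-6B's padded vocabulary size, so the new default top_k keeps every candidate token, and top_p = 1.0f keeps the full probability mass; both truncations are effectively disabled by default. A hypothetical call relying on the new defaults (model loading elided):

    #include <iostream>
    #include "gptj.h"

    int main() {
        GPTJ model;                      // assume loadModel() has succeeded
        LLModel::PromptContext ctx;
        model.prompt("Hello", [](const std::string &t) -> bool {
            std::cout << t;
            return true;                 // true = keep generating
        }, ctx);                         // defaults: top_k 50400, top_p 1.0, temp 0.0
    }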
llm.cpp: 5 changed lines (+5, -0)
@@ -19,6 +19,7 @@ static LLModel::PromptContext s_ctx;
 LLMObject::LLMObject()
     : QObject{nullptr}
     , m_llmodel(new GPTJ)
+    , m_responseTokens(0)
 {
     moveToThread(&m_llmThread);
     connect(&m_llmThread, &QThread::started, this, &LLMObject::loadModel);
@@ -64,6 +65,9 @@ bool LLMObject::isModelLoaded() const

 void LLMObject::resetResponse()
 {
+    s_ctx.n_past -= m_responseTokens;
+    s_ctx.logits.erase(s_ctx.logits.end() -= m_responseTokens, s_ctx.logits.end());
+    m_responseTokens = 0;
     m_response = std::string();
     emit responseChanged();
 }
@@ -89,6 +93,7 @@ bool LLMObject::handleResponse(const std::string &response)
     printf("%s", response.c_str());
     fflush(stdout);
 #endif
+    ++m_responseTokens;
     if (!response.empty()) {
         m_response.append(response);
         emit responseChanged();
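Note (editor's sketch): handleResponse() now counts every generated token in m_responseTokens, and resetResponse() rewinds the shared context by that amount before a regeneration. The same rewind in isolation, assuming n_past counts evaluated tokens and logits gains one trailing entry per response token:

    #include <cstdint>
    #include <vector>

    // Drop the previous response from the context: subtract its tokens from
    // the evaluated count and erase their trailing cached logits.
    void rewind(int32_t &n_past, std::vector<float> &logits, int32_t responseTokens) {
        n_past -= responseTokens;
        logits.erase(logits.end() - responseTokens, logits.end());
    }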