mirror of https://github.com/nomic-ai/gpt4all.git

server: fix implementation and output of completion endpoints

Signed-off-by: Jared Van Bortel <jared@nomic.ai>

commit e55564debe (parent 3f2e1f22ee)
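The gist of the change: the OpenAI-compatible chat endpoint no longer concatenates all user messages into one prompt. It replays each prior (user, assistant) turn through promptInternal with the recorded assistant reply passed as fakeReply, and LLModel::decodePrompt routes those tokens through the response callback (isResponse) so they are recorded as a response rather than generated anew; only the final user message is actually completed. Responses are also returned untrimmed (response(/*trim*/ false)) so the completions endpoint's echo option can prepend the prompt verbatim. Below is a minimal, self-contained sketch of that replay control flow; it is plain C++ without Qt, and promptInternal/Message here are stand-ins that only mimic the shape of the real calls, not the project's implementation:

#include <iostream>
#include <optional>
#include <string>
#include <vector>

struct Message { enum class Role { User, Assistant } role; std::string content; };

// Stand-in for ChatLLM::promptInternal: when fakeReply is set, the recorded reply
// is fed into the context instead of being generated by the model.
static bool promptInternal(std::vector<std::string> &context, const std::string &prompt,
                           std::optional<std::string> fakeReply = {})
{
    context.push_back("user: " + prompt);
    context.push_back("assistant: " + (fakeReply ? *fakeReply : "<newly generated text>"));
    return true;
}

int main()
{
    std::vector<Message> messages {
        {Message::Role::User, "Hi"},
        {Message::Role::Assistant, "Hello! How can I help?"},
        {Message::Role::User, "What did I just say?"},
    };

    std::vector<std::string> context;

    // Replay every prior user/assistant pair; nothing is generated here.
    for (size_t i = 0; i + 2 < messages.size(); i += 2)
        promptInternal(context, messages[i].content, messages[i + 1].content);

    // Only the last user message triggers real generation.
    promptInternal(context, messages.back().content);

    for (const auto &line : context)
        std::cout << line << '\n';
}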
@@ -249,7 +249,8 @@ protected:
                      std::function<bool(int32_t, const std::string&)> responseCallback,
                      bool allowContextShift,
                      PromptContext &promptCtx,
-                     std::vector<Token> embd_inp);
+                     std::vector<Token> embd_inp,
+                     bool isResponse = false);
     void generateResponse(std::function<bool(int32_t, const std::string&)> responseCallback,
                           bool allowContextShift,
                           PromptContext &promptCtx);

@@ -133,7 +133,7 @@ void LLModel::prompt(const std::string &prompt,
             generateResponse(responseCallback, allowContextShift, promptCtx);
         } else {
             embd_inp = tokenize(promptCtx, *fakeReply, false);
-            if (!decodePrompt(promptCallback, responseCallback, allowContextShift, promptCtx, embd_inp))
+            if (!decodePrompt(promptCallback, responseCallback, allowContextShift, promptCtx, embd_inp, true))
                 return; // error
         }
 
@@ -157,7 +157,8 @@ bool LLModel::decodePrompt(std::function<bool(int32_t)> promptCallback,
                            std::function<bool(int32_t, const std::string&)> responseCallback,
                            bool allowContextShift,
                            PromptContext &promptCtx,
-                           std::vector<Token> embd_inp) {
+                           std::vector<Token> embd_inp,
+                           bool isResponse) {
     if ((int) embd_inp.size() > promptCtx.n_ctx - 4) {
         responseCallback(-1, "ERROR: The prompt size exceeds the context window size and cannot be processed.");
         std::cerr << implementation().modelType() << " ERROR: The prompt is " << embd_inp.size() <<

@@ -196,7 +197,9 @@ bool LLModel::decodePrompt(std::function<bool(int32_t)> promptCallback,
         for (size_t t = 0; t < tokens; ++t) {
             promptCtx.tokens.push_back(batch.at(t));
             promptCtx.n_past += 1;
-            if (!promptCallback(batch.at(t)))
+            Token tok = batch.at(t);
+            bool res = isResponse ? responseCallback(tok, tokenToString(tok)) : promptCallback(tok);
+            if (!res)
                 return false;
         }
         i = batch_end;
@@ -239,16 +239,17 @@ void Chat::newPromptResponsePair(const QString &prompt)
     resetResponseState();
     m_chatModel->updateCurrentResponse(m_chatModel->count() - 1, false);
     m_chatModel->appendPrompt("Prompt: ", prompt);
-    m_chatModel->appendResponse("Response: ", prompt);
+    m_chatModel->appendResponse("Response: ", QString());
     emit resetResponseRequested();
 }
 
+// the server needs to block until response is reset, so it calls resetResponse on its own m_llmThread
 void Chat::serverNewPromptResponsePair(const QString &prompt)
 {
     resetResponseState();
     m_chatModel->updateCurrentResponse(m_chatModel->count() - 1, false);
     m_chatModel->appendPrompt("Prompt: ", prompt);
-    m_chatModel->appendResponse("Response: ", prompt);
+    m_chatModel->appendResponse("Response: ", QString());
 }
 
 bool Chat::restoringFromText() const
@@ -626,16 +626,16 @@ void ChatLLM::regenerateResponse()
     m_ctx.tokens.erase(m_ctx.tokens.end() - m_promptResponseTokens, m_ctx.tokens.end());
     m_promptResponseTokens = 0;
     m_promptTokens = 0;
-    m_response = std::string();
-    emit responseChanged(QString::fromStdString(m_response));
+    m_response = m_trimmedResponse = std::string();
+    emit responseChanged(QString::fromStdString(m_trimmedResponse));
 }
 
 void ChatLLM::resetResponse()
 {
     m_promptTokens = 0;
     m_promptResponseTokens = 0;
-    m_response = std::string();
-    emit responseChanged(QString::fromStdString(m_response));
+    m_response = m_trimmedResponse = std::string();
+    emit responseChanged(QString::fromStdString(m_trimmedResponse));
 }
 
 void ChatLLM::resetContext()

@@ -645,9 +645,12 @@ void ChatLLM::resetContext()
     m_ctx = LLModel::PromptContext();
 }
 
-QString ChatLLM::response() const
+QString ChatLLM::response(bool trim) const
 {
-    return QString::fromStdString(remove_leading_whitespace(m_response));
+    std::string resp = m_response;
+    if (trim)
+        resp = remove_leading_whitespace(resp);
+    return QString::fromStdString(resp);
 }
 
 ModelInfo ChatLLM::modelInfo() const

@@ -705,7 +708,8 @@ bool ChatLLM::handleResponse(int32_t token, const std::string &response)
     // check for error
     if (token < 0) {
         m_response.append(response);
-        emit responseChanged(QString::fromStdString(remove_leading_whitespace(m_response)));
+        m_trimmedResponse = remove_leading_whitespace(m_response);
+        emit responseChanged(QString::fromStdString(m_trimmedResponse));
         return false;
     }
 
@@ -715,7 +719,8 @@ bool ChatLLM::handleResponse(int32_t token, const std::string &response)
     m_timer->inc();
     Q_ASSERT(!response.empty());
     m_response.append(response);
-    emit responseChanged(QString::fromStdString(remove_leading_whitespace(m_response)));
+    m_trimmedResponse = remove_leading_whitespace(m_response);
+    emit responseChanged(QString::fromStdString(m_trimmedResponse));
     return !m_stopGenerating;
 }
 
@@ -741,7 +746,7 @@ bool ChatLLM::prompt(const QList<QString> &collectionList, const QString &prompt
 
 bool ChatLLM::promptInternal(const QList<QString> &collectionList, const QString &prompt, const QString &promptTemplate,
     int32_t n_predict, int32_t top_k, float top_p, float min_p, float temp, int32_t n_batch, float repeat_penalty,
-    int32_t repeat_penalty_tokens)
+    int32_t repeat_penalty_tokens, std::optional<QString> fakeReply)
 {
     if (!isModelLoaded())
         return false;

@@ -751,7 +756,7 @@ bool ChatLLM::promptInternal(const QList<QString> &collectionList, const QString
 
     QList<ResultInfo> databaseResults;
     const int retrievalSize = MySettings::globalInstance()->localDocsRetrievalSize();
-    if (!collectionList.isEmpty()) {
+    if (!fakeReply && !collectionList.isEmpty()) {
         emit requestRetrieveFromDB(collectionList, prompt, retrievalSize, &databaseResults); // blocks
         emit databaseResultsChanged(databaseResults);
     }

@@ -796,8 +801,14 @@ bool ChatLLM::promptInternal(const QList<QString> &collectionList, const QString
                                               /*allowContextShift*/ true, m_ctx);
         m_ctx.n_predict = old_n_predict; // now we are ready for a response
     }
+    std::string fakeReplyStr;
+    std::string *fakeReplyP = nullptr;
+    if (fakeReply) {
+        fakeReplyStr = fakeReply->toStdString();
+        fakeReplyP = &fakeReplyStr;
+    }
     m_llModelInfo.model->prompt(prompt.toStdString(), promptTemplate.toStdString(), promptFunc, responseFunc,
-                                /*allowContextShift*/ true, m_ctx);
+                                /*allowContextShift*/ true, m_ctx, false, fakeReplyP);
 #if defined(DEBUG)
     printf("\n");
     fflush(stdout);

@@ -805,9 +816,9 @@ bool ChatLLM::promptInternal(const QList<QString> &collectionList, const QString
     m_timer->stop();
     qint64 elapsed = totalTime.elapsed();
     std::string trimmed = trim_whitespace(m_response);
-    if (trimmed != m_response) {
-        m_response = trimmed;
-        emit responseChanged(QString::fromStdString(m_response));
+    if (trimmed != m_trimmedResponse) {
+        m_trimmedResponse = trimmed;
+        emit responseChanged(QString::fromStdString(m_trimmedResponse));
     }
 
     SuggestionMode mode = MySettings::globalInstance()->suggestionMode();

@@ -1078,6 +1089,7 @@ bool ChatLLM::deserialize(QDataStream &stream, int version, bool deserializeKV,
     QString response;
     stream >> response;
     m_response = response.toStdString();
+    m_trimmedResponse = trim_whitespace(m_response);
     QString nameResponse;
     stream >> nameResponse;
     m_nameResponse = nameResponse.toStdString();
@@ -116,7 +116,7 @@ public:
     void setForceUnloadModel(bool b) { m_forceUnloadModel = b; }
     void setMarkedForDeletion(bool b) { m_markedForDeletion = b; }
 
-    QString response() const;
+    QString response(bool trim = true) const;
 
     ModelInfo modelInfo() const;
     void setModelInfo(const ModelInfo &info);

@@ -198,7 +198,7 @@ Q_SIGNALS:
 protected:
     bool promptInternal(const QList<QString> &collectionList, const QString &prompt, const QString &promptTemplate,
         int32_t n_predict, int32_t top_k, float top_p, float min_p, float temp, int32_t n_batch, float repeat_penalty,
-        int32_t repeat_penalty_tokens);
+        int32_t repeat_penalty_tokens, std::optional<QString> fakeReply = {});
     bool handlePrompt(int32_t token);
     bool handleResponse(int32_t token, const std::string &response);
     bool handleNamePrompt(int32_t token);

@@ -221,6 +221,7 @@ private:
     bool loadNewModel(const ModelInfo &modelInfo, QVariantMap &modelLoadProps);
 
     std::string m_response;
+    std::string m_trimmedResponse;
     std::string m_nameResponse;
     QString m_questionResponse;
     LLModelInfo m_llModelInfo;
@@ -111,7 +111,7 @@ static inline QJsonObject modelToJson(const ModelInfo &info)
 
     QJsonArray permissions;
     QJsonObject permissionObj;
-    permissionObj.insert("id", "foobarbaz");
+    permissionObj.insert("id", "placeholder");
     permissionObj.insert("object", "model_permission");
     permissionObj.insert("created", 0);
     permissionObj.insert("allow_create_engine", false);

@@ -618,7 +618,7 @@ void Server::start()
     });
 
     connect(this, &Server::requestServerNewPromptResponsePair, m_chat,
-            &Chat::serverNewPromptResponsePair, Qt::BlockingQueuedConnection);
+            &Chat::newPromptResponsePair, Qt::BlockingQueuedConnection);
 }
 
 static auto makeError(auto &&...args) -> std::pair<QHttpServerResponse, std::optional<QJsonObject>>
@@ -643,13 +643,14 @@ auto Server::handleCompletionRequest(const CompletionRequest &request)
 
     // adds prompt/response items to GUI
     emit requestServerNewPromptResponsePair(request.prompt); // blocks
+    resetResponse();
 
     // load the new model if necessary
     setShouldBeLoaded(true);
 
     if (modelInfo.filename().isEmpty()) {
         std::cerr << "ERROR: couldn't load default model " << request.model.toStdString() << std::endl;
-        return makeError(QHttpServerResponder::StatusCode::BadRequest);
+        return makeError(QHttpServerResponder::StatusCode::InternalServerError);
     }
 
     // NB: this resets the context, regardless of whether this model is already loaded

@@ -658,10 +659,10 @@ auto Server::handleCompletionRequest(const CompletionRequest &request)
         return makeError(QHttpServerResponder::StatusCode::InternalServerError);
     }
 
-    const QString promptTemplate = modelInfo.promptTemplate();
-    const float top_k = modelInfo.topK();
+    // FIXME(jared): taking parameters from the UI inhibits reproducibility of results
+    const int top_k = modelInfo.topK();
     const int n_batch = modelInfo.promptBatchSize();
-    const float repeat_penalty = modelInfo.repeatPenalty();
+    const auto repeat_penalty = float(modelInfo.repeatPenalty());
     const int repeat_last_n = modelInfo.repeatPenaltyTokens();
 
     int promptTokens = 0;

@@ -671,7 +672,7 @@ auto Server::handleCompletionRequest(const CompletionRequest &request)
         if (!promptInternal(
                 m_collections,
                 request.prompt,
-                promptTemplate,
+                /*promptTemplate*/ u"%1"_s,
                 request.max_tokens,
                 top_k,
                 request.top_p,

@@ -684,22 +685,23 @@ auto Server::handleCompletionRequest(const CompletionRequest &request)
            std::cerr << "ERROR: couldn't prompt model " << modelInfo.name().toStdString() << std::endl;
            return makeError(QHttpServerResponder::StatusCode::InternalServerError);
         }
-        QString echoedPrompt = request.prompt;
-        if (!echoedPrompt.endsWith("\n"))
-            echoedPrompt += "\n";
-        responses.append(qMakePair((request.echo ? u"%1\n"_s.arg(request.prompt) : QString()) + response(), m_databaseResults));
+        QString resp = response(/*trim*/ false);
+        if (request.echo)
+            resp = request.prompt + resp;
+        responses.append({resp, m_databaseResults});
         if (!promptTokens)
-            promptTokens += m_promptTokens;
+            promptTokens = m_promptTokens;
         responseTokens += m_promptResponseTokens - m_promptTokens;
         if (i < request.n - 1)
            resetResponse();
     }
 
-    QJsonObject responseObject;
-    responseObject.insert("id", "foobarbaz");
-    responseObject.insert("object", "text_completion");
-    responseObject.insert("created", QDateTime::currentSecsSinceEpoch());
-    responseObject.insert("model", modelInfo.name());
+    QJsonObject responseObject {
+        { "id", "placeholder" },
+        { "object", "text_completion" },
+        { "created", QDateTime::currentSecsSinceEpoch() },
+        { "model", modelInfo.name() },
+    };
 
     QJsonArray choices;
     {

@@ -707,28 +709,28 @@ auto Server::handleCompletionRequest(const CompletionRequest &request)
         for (const auto &r : responses) {
             QString result = r.first;
             QList<ResultInfo> infos = r.second;
-            QJsonObject choice;
-            choice.insert("text", result);
-            choice.insert("index", index++);
-            choice.insert("logprobs", QJsonValue::Null); // We don't support
-            choice.insert("finish_reason", responseTokens == request.max_tokens ? "length" : "stop");
+            QJsonObject choice {
+                { "text", result },
+                { "index", index++ },
+                { "logprobs", QJsonValue::Null },
+                { "finish_reason", responseTokens == request.max_tokens ? "length" : "stop" },
+            };
             if (MySettings::globalInstance()->localDocsShowReferences()) {
                 QJsonArray references;
                 for (const auto &ref : infos)
                     references.append(resultToJson(ref));
-                choice.insert("references", references);
+                choice.insert("references", references.isEmpty() ? QJsonValue::Null : QJsonValue(references));
             }
             choices.append(choice);
         }
     }
 
     responseObject.insert("choices", choices);
-
-    QJsonObject usage;
-    usage.insert("prompt_tokens", int(promptTokens));
-    usage.insert("completion_tokens", int(responseTokens));
-    usage.insert("total_tokens", int(promptTokens + responseTokens));
-    responseObject.insert("usage", usage);
+    responseObject.insert("usage", QJsonObject {
+        { "prompt_tokens", promptTokens },
+        { "completion_tokens", responseTokens },
+        { "total_tokens", promptTokens + responseTokens },
+    });
 
     return {QHttpServerResponse(responseObject), responseObject};
 }
@@ -748,32 +750,12 @@ auto Server::handleChatRequest(const ChatRequest &request)
         }
     }
 
-    // if we're a chat completion we have messages which means we need to use these as the prompt
-    QString actualPrompt = " ";
-    {
-        QList<QString> chats;
-        for (int i = 0; i < request.messages.count(); i++) {
-            auto &m = request.messages.at(i);
-            // FIXME: Deal with system messages correctly
-            if (m.role != ChatRequest::Message::Role::User)
-                continue;
-            QString content = m.content;
-            if (!content.endsWith("\n") && i < request.messages.count() - 1)
-                content += "\n";
-            chats.append(content);
-        }
-        actualPrompt.prepend(chats.join("\n"));
-    }
-
-    // adds prompt/response items to GUI
-    emit requestServerNewPromptResponsePair(actualPrompt); // blocks
-
     // load the new model if necessary
     setShouldBeLoaded(true);
 
     if (modelInfo.filename().isEmpty()) {
         std::cerr << "ERROR: couldn't load default model " << request.model.toStdString() << std::endl;
-        return makeError(QHttpServerResponder::StatusCode::BadRequest);
+        return makeError(QHttpServerResponder::StatusCode::InternalServerError);
     }
 
     // NB: this resets the context, regardless of whether this model is already loaded

@@ -783,18 +765,30 @@ auto Server::handleChatRequest(const ChatRequest &request)
     }
 
     const QString promptTemplate = modelInfo.promptTemplate();
-    const float top_k = modelInfo.topK();
+    const int top_k = modelInfo.topK();
     const int n_batch = modelInfo.promptBatchSize();
-    const float repeat_penalty = modelInfo.repeatPenalty();
+    const auto repeat_penalty = float(modelInfo.repeatPenalty());
     const int repeat_last_n = modelInfo.repeatPenaltyTokens();
 
     int promptTokens = 0;
     int responseTokens = 0;
     QList<QPair<QString, QList<ResultInfo>>> responses;
-    for (int i = 0; i < request.n; ++i) {
+    Q_ASSERT(!request.messages.isEmpty());
+    Q_ASSERT(request.messages.size() % 2 == 1);
+    for (int i = 0; i < request.messages.size() - 2; i += 2) {
+        using enum ChatRequest::Message::Role;
+        auto &user = request.messages[i];
+        auto &assistant = request.messages[i + 1];
+        Q_ASSERT(user.role == User);
+        Q_ASSERT(assistant.role == Assistant);
+
+        // adds prompt/response items to GUI
+        emit requestServerNewPromptResponsePair(user.content); // blocks
+        resetResponse();
+
         if (!promptInternal(
-                m_collections,
-                actualPrompt,
+                {},
+                user.content,
                 promptTemplate,
                 request.max_tokens,
                 top_k,

@@ -803,24 +797,52 @@ auto Server::handleChatRequest(const ChatRequest &request)
                 request.temperature,
                 n_batch,
                 repeat_penalty,
-                repeat_last_n)) {
-
+                repeat_last_n,
+                assistant.content)
+        ) {
             std::cerr << "ERROR: couldn't prompt model " << modelInfo.name().toStdString() << std::endl;
             return makeError(QHttpServerResponder::StatusCode::InternalServerError);
         }
-        responses.append(qMakePair(response(), m_databaseResults));
         if (!promptTokens)
             promptTokens += m_promptResponseTokens; // previous responses are part of current prompt
+    }
+
+    QString lastMessage = request.messages.last().content;
+    // adds prompt/response items to GUI
+    emit requestServerNewPromptResponsePair(lastMessage); // blocks
+    resetResponse();
+
+    for (int i = 0; i < request.n; ++i) {
+        if (!promptInternal(
+                m_collections,
+                lastMessage,
+                promptTemplate,
+                request.max_tokens,
+                top_k,
+                request.top_p,
+                request.min_p,
+                request.temperature,
+                n_batch,
+                repeat_penalty,
+                repeat_last_n)
+        ) {
+            std::cerr << "ERROR: couldn't prompt model " << modelInfo.name().toStdString() << std::endl;
+            return makeError(QHttpServerResponder::StatusCode::InternalServerError);
+        }
+        responses.append({response(), m_databaseResults});
+        // FIXME(jared): these are UI counts and do not include framing tokens, which they should
+        if (i == 0)
+            promptTokens += m_promptTokens;
         responseTokens += m_promptResponseTokens - m_promptTokens;
-        if (i < request.n - 1)
+        if (i != request.n - 1)
             resetResponse();
     }
 
-    QJsonObject responseObject;
-    responseObject.insert("id", "foobarbaz");
-    responseObject.insert("object", "text_completion");
-    responseObject.insert("created", QDateTime::currentSecsSinceEpoch());
-    responseObject.insert("model", modelInfo.name());
+    QJsonObject responseObject {
+        { "id", "placeholder" },
+        { "object", "chat.completion" },
+        { "created", QDateTime::currentSecsSinceEpoch() },
+        { "model", modelInfo.name() },
+    };
 
     QJsonArray choices;
     {

@@ -828,30 +850,32 @@ auto Server::handleChatRequest(const ChatRequest &request)
         for (const auto &r : responses) {
             QString result = r.first;
             QList<ResultInfo> infos = r.second;
-            QJsonObject choice;
-            choice.insert("index", index++);
-            choice.insert("finish_reason", responseTokens == request.max_tokens ? "length" : "stop");
-            QJsonObject message;
-            message.insert("role", "assistant");
-            message.insert("content", result);
-            choice.insert("message", message);
+            QJsonObject message {
+                { "role", "assistant" },
+                { "content", result },
+            };
+            QJsonObject choice {
+                { "index", index++ },
+                { "message", message },
+                { "finish_reason", responseTokens == request.max_tokens ? "length" : "stop" },
+                { "logprobs", QJsonValue::Null },
+            };
             if (MySettings::globalInstance()->localDocsShowReferences()) {
                 QJsonArray references;
                 for (const auto &ref : infos)
                     references.append(resultToJson(ref));
-                choice.insert("references", references);
+                choice.insert("references", references.isEmpty() ? QJsonValue::Null : QJsonValue(references));
             }
             choices.append(choice);
         }
     }
 
     responseObject.insert("choices", choices);
-
-    QJsonObject usage;
-    usage.insert("prompt_tokens", int(promptTokens));
-    usage.insert("completion_tokens", int(responseTokens));
-    usage.insert("total_tokens", int(promptTokens + responseTokens));
-    responseObject.insert("usage", usage);
+    responseObject.insert("usage", QJsonObject {
+        { "prompt_tokens", promptTokens },
+        { "completion_tokens", responseTokens },
+        { "total_tokens", promptTokens + responseTokens },
+    });
 
     return {QHttpServerResponse(responseObject), responseObject};
 }
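Both handlers now assemble their JSON bodies with brace-initialized QJsonObjects and report usage as prompt_tokens / completion_tokens / total_tokens, and the chat endpoint reports its object type as "chat.completion". The standalone snippet below sketches the approximate shape a chat reply takes after this change; the field values are placeholders, and it only assumes Qt's JSON classes, not the server code itself:

#include <QDateTime>
#include <QJsonArray>
#include <QJsonDocument>
#include <QJsonObject>
#include <QTextStream>

int main()
{
    // Placeholder counts standing in for the handler's bookkeeping.
    int promptTokens = 12, responseTokens = 34, maxTokens = 64;

    QJsonObject message {
        { "role",    "assistant" },
        { "content", "Hello! How can I help?" },
    };
    QJsonObject choice {
        { "index",         0 },
        { "message",       message },
        { "finish_reason", responseTokens == maxTokens ? "length" : "stop" },
        { "logprobs",      QJsonValue::Null },
    };
    QJsonObject response {
        { "id",      "placeholder" },
        { "object",  "chat.completion" },
        { "created", QDateTime::currentSecsSinceEpoch() },
        { "model",   "example-model" },
        { "choices", QJsonArray { choice } },
        { "usage",   QJsonObject {
            { "prompt_tokens",     promptTokens },
            { "completion_tokens", responseTokens },
            { "total_tokens",      promptTokens + responseTokens },
        } },
    };

    // Print the JSON document, roughly what an API client would receive.
    QTextStream(stdout) << QJsonDocument(response).toJson();
    return 0;
}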