mirror of https://github.com/nomic-ai/gpt4all.git

server: fix implementation and output of completion endpoints

Signed-off-by: Jared Van Bortel <jared@nomic.ai>

commit e55564debe (parent 3f2e1f22ee)
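The gist of the change: the OpenAI-compatible chat endpoint no longer concatenates all user messages into one prompt. It replays each prior (user, assistant) turn through promptInternal with the recorded assistant reply passed as fakeReply, and LLModel::decodePrompt routes those tokens through the response callback (isResponse) so they are recorded as a response rather than generated anew; only the final user message is actually completed. Responses are also returned untrimmed (response(/*trim*/ false)) so the completions endpoint's echo option can prepend the prompt verbatim. Below is a minimal, self-contained sketch of that replay control flow; it is plain C++ without Qt, and promptInternal/Message here are stand-ins that only mimic the shape of the real calls, not the project's implementation:

#include <iostream>
#include <optional>
#include <string>
#include <vector>

struct Message { enum class Role { User, Assistant } role; std::string content; };

// Stand-in for ChatLLM::promptInternal: when fakeReply is set, the recorded reply
// is fed into the context instead of being generated by the model.
static bool promptInternal(std::vector<std::string> &context, const std::string &prompt,
                           std::optional<std::string> fakeReply = {})
{
    context.push_back("user: " + prompt);
    context.push_back("assistant: " + (fakeReply ? *fakeReply : "<newly generated text>"));
    return true;
}

int main()
{
    std::vector<Message> messages {
        {Message::Role::User, "Hi"},
        {Message::Role::Assistant, "Hello! How can I help?"},
        {Message::Role::User, "What did I just say?"},
    };

    std::vector<std::string> context;

    // Replay every prior user/assistant pair; nothing is generated here.
    for (size_t i = 0; i + 2 < messages.size(); i += 2)
        promptInternal(context, messages[i].content, messages[i + 1].content);

    // Only the last user message triggers real generation.
    promptInternal(context, messages.back().content);

    for (const auto &line : context)
        std::cout << line << '\n';
}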
@@ -249,7 +249,8 @@ protected:
                      std::function<bool(int32_t, const std::string&)> responseCallback,
                      bool allowContextShift,
                      PromptContext &promptCtx,
-                     std::vector<Token> embd_inp);
+                     std::vector<Token> embd_inp,
+                     bool isResponse = false);
     void generateResponse(std::function<bool(int32_t, const std::string&)> responseCallback,
                           bool allowContextShift,
                           PromptContext &promptCtx);

@@ -133,7 +133,7 @@ void LLModel::prompt(const std::string &prompt,
             generateResponse(responseCallback, allowContextShift, promptCtx);
         } else {
             embd_inp = tokenize(promptCtx, *fakeReply, false);
-            if (!decodePrompt(promptCallback, responseCallback, allowContextShift, promptCtx, embd_inp))
+            if (!decodePrompt(promptCallback, responseCallback, allowContextShift, promptCtx, embd_inp, true))
                 return; // error
         }
 
@@ -157,7 +157,8 @@ bool LLModel::decodePrompt(std::function<bool(int32_t)> promptCallback,
                            std::function<bool(int32_t, const std::string&)> responseCallback,
                            bool allowContextShift,
                            PromptContext &promptCtx,
-                           std::vector<Token> embd_inp) {
+                           std::vector<Token> embd_inp,
+                           bool isResponse) {
     if ((int) embd_inp.size() > promptCtx.n_ctx - 4) {
         responseCallback(-1, "ERROR: The prompt size exceeds the context window size and cannot be processed.");
         std::cerr << implementation().modelType() << " ERROR: The prompt is " << embd_inp.size() <<

@@ -196,7 +197,9 @@ bool LLModel::decodePrompt(std::function<bool(int32_t)> promptCallback,
         for (size_t t = 0; t < tokens; ++t) {
             promptCtx.tokens.push_back(batch.at(t));
             promptCtx.n_past += 1;
-            if (!promptCallback(batch.at(t)))
+            Token tok = batch.at(t);
+            bool res = isResponse ? responseCallback(tok, tokenToString(tok)) : promptCallback(tok);
+            if (!res)
                 return false;
         }
         i = batch_end;
@@ -239,16 +239,17 @@ void Chat::newPromptResponsePair(const QString &prompt)
     resetResponseState();
     m_chatModel->updateCurrentResponse(m_chatModel->count() - 1, false);
     m_chatModel->appendPrompt("Prompt: ", prompt);
-    m_chatModel->appendResponse("Response: ", prompt);
+    m_chatModel->appendResponse("Response: ", QString());
     emit resetResponseRequested();
 }
 
+// the server needs to block until response is reset, so it calls resetResponse on its own m_llmThread
 void Chat::serverNewPromptResponsePair(const QString &prompt)
 {
     resetResponseState();
     m_chatModel->updateCurrentResponse(m_chatModel->count() - 1, false);
     m_chatModel->appendPrompt("Prompt: ", prompt);
-    m_chatModel->appendResponse("Response: ", prompt);
+    m_chatModel->appendResponse("Response: ", QString());
 }
 
 bool Chat::restoringFromText() const
@@ -626,16 +626,16 @@ void ChatLLM::regenerateResponse()
     m_ctx.tokens.erase(m_ctx.tokens.end() - m_promptResponseTokens, m_ctx.tokens.end());
     m_promptResponseTokens = 0;
     m_promptTokens = 0;
-    m_response = std::string();
-    emit responseChanged(QString::fromStdString(m_response));
+    m_response = m_trimmedResponse = std::string();
+    emit responseChanged(QString::fromStdString(m_trimmedResponse));
 }
 
 void ChatLLM::resetResponse()
 {
     m_promptTokens = 0;
     m_promptResponseTokens = 0;
-    m_response = std::string();
-    emit responseChanged(QString::fromStdString(m_response));
+    m_response = m_trimmedResponse = std::string();
+    emit responseChanged(QString::fromStdString(m_trimmedResponse));
 }
 
 void ChatLLM::resetContext()

@@ -645,9 +645,12 @@ void ChatLLM::resetContext()
     m_ctx = LLModel::PromptContext();
 }
 
-QString ChatLLM::response() const
+QString ChatLLM::response(bool trim) const
 {
-    return QString::fromStdString(remove_leading_whitespace(m_response));
+    std::string resp = m_response;
+    if (trim)
+        resp = remove_leading_whitespace(resp);
+    return QString::fromStdString(resp);
 }
 
 ModelInfo ChatLLM::modelInfo() const

@@ -705,7 +708,8 @@ bool ChatLLM::handleResponse(int32_t token, const std::string &response)
     // check for error
     if (token < 0) {
         m_response.append(response);
-        emit responseChanged(QString::fromStdString(remove_leading_whitespace(m_response)));
+        m_trimmedResponse = remove_leading_whitespace(m_response);
+        emit responseChanged(QString::fromStdString(m_trimmedResponse));
         return false;
     }
 
@@ -715,7 +719,8 @@ bool ChatLLM::handleResponse(int32_t token, const std::string &response)
     m_timer->inc();
     Q_ASSERT(!response.empty());
     m_response.append(response);
-    emit responseChanged(QString::fromStdString(remove_leading_whitespace(m_response)));
+    m_trimmedResponse = remove_leading_whitespace(m_response);
+    emit responseChanged(QString::fromStdString(m_trimmedResponse));
     return !m_stopGenerating;
 }
 
@@ -741,7 +746,7 @@ bool ChatLLM::prompt(const QList<QString> &collectionList, const QString &prompt
 
 bool ChatLLM::promptInternal(const QList<QString> &collectionList, const QString &prompt, const QString &promptTemplate,
     int32_t n_predict, int32_t top_k, float top_p, float min_p, float temp, int32_t n_batch, float repeat_penalty,
-    int32_t repeat_penalty_tokens)
+    int32_t repeat_penalty_tokens, std::optional<QString> fakeReply)
 {
     if (!isModelLoaded())
         return false;

@@ -751,7 +756,7 @@ bool ChatLLM::promptInternal(const QList<QString> &collectionList, const QString
 
     QList<ResultInfo> databaseResults;
     const int retrievalSize = MySettings::globalInstance()->localDocsRetrievalSize();
-    if (!collectionList.isEmpty()) {
+    if (!fakeReply && !collectionList.isEmpty()) {
         emit requestRetrieveFromDB(collectionList, prompt, retrievalSize, &databaseResults); // blocks
         emit databaseResultsChanged(databaseResults);
     }

@@ -796,8 +801,14 @@ bool ChatLLM::promptInternal(const QList<QString> &collectionList, const QString
                                               /*allowContextShift*/ true, m_ctx);
         m_ctx.n_predict = old_n_predict; // now we are ready for a response
     }
+    std::string fakeReplyStr;
+    std::string *fakeReplyP = nullptr;
+    if (fakeReply) {
+        fakeReplyStr = fakeReply->toStdString();
+        fakeReplyP = &fakeReplyStr;
+    }
     m_llModelInfo.model->prompt(prompt.toStdString(), promptTemplate.toStdString(), promptFunc, responseFunc,
-                                /*allowContextShift*/ true, m_ctx);
+                                /*allowContextShift*/ true, m_ctx, false, fakeReplyP);
 #if defined(DEBUG)
     printf("\n");
     fflush(stdout);

@@ -805,9 +816,9 @@ bool ChatLLM::promptInternal(const QList<QString> &collectionList, const QString
     m_timer->stop();
     qint64 elapsed = totalTime.elapsed();
     std::string trimmed = trim_whitespace(m_response);
-    if (trimmed != m_response) {
-        m_response = trimmed;
-        emit responseChanged(QString::fromStdString(m_response));
+    if (trimmed != m_trimmedResponse) {
+        m_trimmedResponse = trimmed;
+        emit responseChanged(QString::fromStdString(m_trimmedResponse));
     }
 
     SuggestionMode mode = MySettings::globalInstance()->suggestionMode();

@@ -1078,6 +1089,7 @@ bool ChatLLM::deserialize(QDataStream &stream, int version, bool deserializeKV,
     QString response;
     stream >> response;
     m_response = response.toStdString();
+    m_trimmedResponse = trim_whitespace(m_response);
     QString nameResponse;
     stream >> nameResponse;
     m_nameResponse = nameResponse.toStdString();
@@ -116,7 +116,7 @@ public:
     void setForceUnloadModel(bool b) { m_forceUnloadModel = b; }
     void setMarkedForDeletion(bool b) { m_markedForDeletion = b; }
 
-    QString response() const;
+    QString response(bool trim = true) const;
 
     ModelInfo modelInfo() const;
     void setModelInfo(const ModelInfo &info);

@@ -198,7 +198,7 @@ Q_SIGNALS:
 protected:
     bool promptInternal(const QList<QString> &collectionList, const QString &prompt, const QString &promptTemplate,
         int32_t n_predict, int32_t top_k, float top_p, float min_p, float temp, int32_t n_batch, float repeat_penalty,
-        int32_t repeat_penalty_tokens);
+        int32_t repeat_penalty_tokens, std::optional<QString> fakeReply = {});
     bool handlePrompt(int32_t token);
     bool handleResponse(int32_t token, const std::string &response);
     bool handleNamePrompt(int32_t token);

@@ -221,6 +221,7 @@ private:
     bool loadNewModel(const ModelInfo &modelInfo, QVariantMap &modelLoadProps);
 
     std::string m_response;
+    std::string m_trimmedResponse;
     std::string m_nameResponse;
     QString m_questionResponse;
     LLModelInfo m_llModelInfo;
@@ -111,7 +111,7 @@ static inline QJsonObject modelToJson(const ModelInfo &info)
 
     QJsonArray permissions;
     QJsonObject permissionObj;
-    permissionObj.insert("id", "foobarbaz");
+    permissionObj.insert("id", "placeholder");
     permissionObj.insert("object", "model_permission");
     permissionObj.insert("created", 0);
     permissionObj.insert("allow_create_engine", false);

@@ -618,7 +618,7 @@ void Server::start()
     });
 
     connect(this, &Server::requestServerNewPromptResponsePair, m_chat,
-            &Chat::serverNewPromptResponsePair, Qt::BlockingQueuedConnection);
+            &Chat::newPromptResponsePair, Qt::BlockingQueuedConnection);
 }
 
 static auto makeError(auto &&...args) -> std::pair<QHttpServerResponse, std::optional<QJsonObject>>
@@ -643,13 +643,14 @@ auto Server::handleCompletionRequest(const CompletionRequest &request)
 
     // adds prompt/response items to GUI
     emit requestServerNewPromptResponsePair(request.prompt); // blocks
+    resetResponse();
 
     // load the new model if necessary
     setShouldBeLoaded(true);
 
     if (modelInfo.filename().isEmpty()) {
         std::cerr << "ERROR: couldn't load default model " << request.model.toStdString() << std::endl;
-        return makeError(QHttpServerResponder::StatusCode::BadRequest);
+        return makeError(QHttpServerResponder::StatusCode::InternalServerError);
     }
 
     // NB: this resets the context, regardless of whether this model is already loaded

@@ -658,10 +659,10 @@ auto Server::handleCompletionRequest(const CompletionRequest &request)
         return makeError(QHttpServerResponder::StatusCode::InternalServerError);
     }
 
-    const QString promptTemplate = modelInfo.promptTemplate();
-    const float top_k = modelInfo.topK();
+    // FIXME(jared): taking parameters from the UI inhibits reproducibility of results
+    const int top_k = modelInfo.topK();
     const int n_batch = modelInfo.promptBatchSize();
-    const float repeat_penalty = modelInfo.repeatPenalty();
+    const auto repeat_penalty = float(modelInfo.repeatPenalty());
     const int repeat_last_n = modelInfo.repeatPenaltyTokens();
 
     int promptTokens = 0;

@@ -671,7 +672,7 @@ auto Server::handleCompletionRequest(const CompletionRequest &request)
         if (!promptInternal(
                 m_collections,
                 request.prompt,
-                promptTemplate,
+                /*promptTemplate*/ u"%1"_s,
                 request.max_tokens,
                 top_k,
                 request.top_p,

@@ -684,22 +685,23 @@ auto Server::handleCompletionRequest(const CompletionRequest &request)
            std::cerr << "ERROR: couldn't prompt model " << modelInfo.name().toStdString() << std::endl;
            return makeError(QHttpServerResponder::StatusCode::InternalServerError);
         }
-        QString echoedPrompt = request.prompt;
-        if (!echoedPrompt.endsWith("\n"))
-            echoedPrompt += "\n";
-        responses.append(qMakePair((request.echo ? u"%1\n"_s.arg(request.prompt) : QString()) + response(), m_databaseResults));
+        QString resp = response(/*trim*/ false);
+        if (request.echo)
+            resp = request.prompt + resp;
+        responses.append({resp, m_databaseResults});
         if (!promptTokens)
-            promptTokens += m_promptTokens;
+            promptTokens = m_promptTokens;
         responseTokens += m_promptResponseTokens - m_promptTokens;
         if (i < request.n - 1)
            resetResponse();
     }
 
-    QJsonObject responseObject;
-    responseObject.insert("id", "foobarbaz");
-    responseObject.insert("object", "text_completion");
-    responseObject.insert("created", QDateTime::currentSecsSinceEpoch());
-    responseObject.insert("model", modelInfo.name());
+    QJsonObject responseObject {
+        { "id", "placeholder" },
+        { "object", "text_completion" },
+        { "created", QDateTime::currentSecsSinceEpoch() },
+        { "model", modelInfo.name() },
+    };
 
     QJsonArray choices;
     {

@@ -707,28 +709,28 @@ auto Server::handleCompletionRequest(const CompletionRequest &request)
         for (const auto &r : responses) {
             QString result = r.first;
             QList<ResultInfo> infos = r.second;
-            QJsonObject choice;
-            choice.insert("text", result);
-            choice.insert("index", index++);
-            choice.insert("logprobs", QJsonValue::Null); // We don't support
-            choice.insert("finish_reason", responseTokens == request.max_tokens ? "length" : "stop");
+            QJsonObject choice {
+                { "text", result },
+                { "index", index++ },
+                { "logprobs", QJsonValue::Null },
+                { "finish_reason", responseTokens == request.max_tokens ? "length" : "stop" },
+            };
             if (MySettings::globalInstance()->localDocsShowReferences()) {
                 QJsonArray references;
                 for (const auto &ref : infos)
                     references.append(resultToJson(ref));
-                choice.insert("references", references);
+                choice.insert("references", references.isEmpty() ? QJsonValue::Null : QJsonValue(references));
             }
             choices.append(choice);
         }
     }
 
     responseObject.insert("choices", choices);
-
-    QJsonObject usage;
-    usage.insert("prompt_tokens", int(promptTokens));
-    usage.insert("completion_tokens", int(responseTokens));
-    usage.insert("total_tokens", int(promptTokens + responseTokens));
-    responseObject.insert("usage", usage);
+    responseObject.insert("usage", QJsonObject {
+        { "prompt_tokens", promptTokens },
+        { "completion_tokens", responseTokens },
+        { "total_tokens", promptTokens + responseTokens },
+    });
 
     return {QHttpServerResponse(responseObject), responseObject};
 }
@@ -748,32 +750,12 @@ auto Server::handleChatRequest(const ChatRequest &request)
         }
     }
 
-    // if we're a chat completion we have messages which means we need to use these as the prompt
-    QString actualPrompt = " ";
-    {
-        QList<QString> chats;
-        for (int i = 0; i < request.messages.count(); i++) {
-            auto &m = request.messages.at(i);
-            // FIXME: Deal with system messages correctly
-            if (m.role != ChatRequest::Message::Role::User)
-                continue;
-            QString content = m.content;
-            if (!content.endsWith("\n") && i < request.messages.count() - 1)
-                content += "\n";
-            chats.append(content);
-        }
-        actualPrompt.prepend(chats.join("\n"));
-    }
-
-    // adds prompt/response items to GUI
-    emit requestServerNewPromptResponsePair(actualPrompt); // blocks
-
     // load the new model if necessary
     setShouldBeLoaded(true);
 
     if (modelInfo.filename().isEmpty()) {
         std::cerr << "ERROR: couldn't load default model " << request.model.toStdString() << std::endl;
-        return makeError(QHttpServerResponder::StatusCode::BadRequest);
+        return makeError(QHttpServerResponder::StatusCode::InternalServerError);
     }
 
     // NB: this resets the context, regardless of whether this model is already loaded

@@ -783,18 +765,30 @@ auto Server::handleChatRequest(const ChatRequest &request)
     }
 
     const QString promptTemplate = modelInfo.promptTemplate();
-    const float top_k = modelInfo.topK();
+    const int top_k = modelInfo.topK();
     const int n_batch = modelInfo.promptBatchSize();
-    const float repeat_penalty = modelInfo.repeatPenalty();
+    const auto repeat_penalty = float(modelInfo.repeatPenalty());
     const int repeat_last_n = modelInfo.repeatPenaltyTokens();
 
     int promptTokens = 0;
     int responseTokens = 0;
     QList<QPair<QString, QList<ResultInfo>>> responses;
-    for (int i = 0; i < request.n; ++i) {
+    Q_ASSERT(!request.messages.isEmpty());
+    Q_ASSERT(request.messages.size() % 2 == 1);
+    for (int i = 0; i < request.messages.size() - 2; i += 2) {
+        using enum ChatRequest::Message::Role;
+        auto &user = request.messages[i];
+        auto &assistant = request.messages[i + 1];
+        Q_ASSERT(user.role == User);
+        Q_ASSERT(assistant.role == Assistant);
+
+        // adds prompt/response items to GUI
+        emit requestServerNewPromptResponsePair(user.content); // blocks
+        resetResponse();
+
         if (!promptInternal(
-                m_collections,
-                actualPrompt,
+                {},
+                user.content,
                 promptTemplate,
                 request.max_tokens,
                 top_k,

@@ -803,24 +797,52 @@ auto Server::handleChatRequest(const ChatRequest &request)
                 request.temperature,
                 n_batch,
                 repeat_penalty,
-                repeat_last_n)) {
-
+                repeat_last_n,
+                assistant.content)
+        ) {
             std::cerr << "ERROR: couldn't prompt model " << modelInfo.name().toStdString() << std::endl;
             return makeError(QHttpServerResponder::StatusCode::InternalServerError);
         }
-        responses.append(qMakePair(response(), m_databaseResults));
         if (!promptTokens)
             promptTokens += m_promptResponseTokens; // previous responses are part of current prompt
+    }
+
+    QString lastMessage = request.messages.last().content;
+    // adds prompt/response items to GUI
+    emit requestServerNewPromptResponsePair(lastMessage); // blocks
+    resetResponse();
+
+    for (int i = 0; i < request.n; ++i) {
+        if (!promptInternal(
+                m_collections,
+                lastMessage,
+                promptTemplate,
+                request.max_tokens,
+                top_k,
+                request.top_p,
+                request.min_p,
+                request.temperature,
+                n_batch,
+                repeat_penalty,
+                repeat_last_n)
+        ) {
+            std::cerr << "ERROR: couldn't prompt model " << modelInfo.name().toStdString() << std::endl;
+            return makeError(QHttpServerResponder::StatusCode::InternalServerError);
+        }
+        responses.append({response(), m_databaseResults});
+        // FIXME(jared): these are UI counts and do not include framing tokens, which they should
+        if (i == 0)
+            promptTokens += m_promptTokens;
         responseTokens += m_promptResponseTokens - m_promptTokens;
-        if (i < request.n - 1)
+        if (i != request.n - 1)
             resetResponse();
     }
 
-    QJsonObject responseObject;
-    responseObject.insert("id", "foobarbaz");
-    responseObject.insert("object", "text_completion");
-    responseObject.insert("created", QDateTime::currentSecsSinceEpoch());
-    responseObject.insert("model", modelInfo.name());
+    QJsonObject responseObject {
+        { "id", "placeholder" },
+        { "object", "chat.completion" },
+        { "created", QDateTime::currentSecsSinceEpoch() },
+        { "model", modelInfo.name() },
+    };
 
     QJsonArray choices;
     {

@@ -828,30 +850,32 @@ auto Server::handleChatRequest(const ChatRequest &request)
         for (const auto &r : responses) {
             QString result = r.first;
             QList<ResultInfo> infos = r.second;
-            QJsonObject choice;
-            choice.insert("index", index++);
-            choice.insert("finish_reason", responseTokens == request.max_tokens ? "length" : "stop");
-            QJsonObject message;
-            message.insert("role", "assistant");
-            message.insert("content", result);
-            choice.insert("message", message);
+            QJsonObject message {
+                { "role", "assistant" },
+                { "content", result },
+            };
+            QJsonObject choice {
+                { "index", index++ },
+                { "message", message },
+                { "finish_reason", responseTokens == request.max_tokens ? "length" : "stop" },
+                { "logprobs", QJsonValue::Null },
+            };
             if (MySettings::globalInstance()->localDocsShowReferences()) {
                 QJsonArray references;
                 for (const auto &ref : infos)
                     references.append(resultToJson(ref));
-                choice.insert("references", references);
+                choice.insert("references", references.isEmpty() ? QJsonValue::Null : QJsonValue(references));
             }
             choices.append(choice);
         }
     }
 
     responseObject.insert("choices", choices);
-
-    QJsonObject usage;
-    usage.insert("prompt_tokens", int(promptTokens));
-    usage.insert("completion_tokens", int(responseTokens));
-    usage.insert("total_tokens", int(promptTokens + responseTokens));
-    responseObject.insert("usage", usage);
+    responseObject.insert("usage", QJsonObject {
+        { "prompt_tokens", promptTokens },
+        { "completion_tokens", responseTokens },
+        { "total_tokens", promptTokens + responseTokens },
+    });
 
     return {QHttpServerResponse(responseObject), responseObject};
 }
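Both handlers now assemble their JSON bodies with brace-initialized QJsonObjects and report usage as prompt_tokens / completion_tokens / total_tokens, and the chat endpoint reports its object type as "chat.completion". The standalone snippet below sketches the approximate shape a chat reply takes after this change; the field values are placeholders, and it only assumes Qt's JSON classes, not the server code itself:

#include <QDateTime>
#include <QJsonArray>
#include <QJsonDocument>
#include <QJsonObject>
#include <QTextStream>

int main()
{
    // Placeholder counts standing in for the handler's bookkeeping.
    int promptTokens = 12, responseTokens = 34, maxTokens = 64;

    QJsonObject message {
        { "role",    "assistant" },
        { "content", "Hello! How can I help?" },
    };
    QJsonObject choice {
        { "index",         0 },
        { "message",       message },
        { "finish_reason", responseTokens == maxTokens ? "length" : "stop" },
        { "logprobs",      QJsonValue::Null },
    };
    QJsonObject response {
        { "id",      "placeholder" },
        { "object",  "chat.completion" },
        { "created", QDateTime::currentSecsSinceEpoch() },
        { "model",   "example-model" },
        { "choices", QJsonArray { choice } },
        { "usage",   QJsonObject {
            { "prompt_tokens",     promptTokens },
            { "completion_tokens", responseTokens },
            { "total_tokens",      promptTokens + responseTokens },
        } },
    };

    // Print the JSON document, roughly what an API client would receive.
    QTextStream(stdout) << QJsonDocument(response).toJson();
    return 0;
}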