#include "llamacpp_model.h"

#include "chat.h"
#include "chatapi.h"
#include "localdocs.h"
#include "mysettings.h"
#include "network.h"

#include "../gpt4all-backend/llamacpp_backend_manager.h"

#include <QDataStream>
#include <QDebug>
#include <QFile>
#include <QGlobalStatic>
#include <QIODevice>
#include <QJsonDocument>
#include <QJsonObject>
#include <QMutex>
#include <QMutexLocker>
#include <QSet>
#include <QStringList>
#include <QWaitCondition>
#include <Qt>
#include <QtLogging>

#include <algorithm>
#include <cctype>
#include <cmath>
#include <cstddef>
#include <functional>
#include <limits>
#include <optional>
#include <string_view>
#include <utility>
#include <vector>

using namespace Qt::Literals::StringLiterals;

//#define DEBUG
//#define DEBUG_MODEL_LOADING

#define GPTJ_INTERNAL_STATE_VERSION 0 // GPT-J is gone but old chats still use this
#define LLAMA_INTERNAL_STATE_VERSION 0
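
// LLModelStore holds the single loaded-model slot that every chat shares: at most one chat owns
// the backend model at a time. acquireModel() blocks the calling LLM thread until the slot is
// filled, and releaseModel() hands the (possibly still loaded) model back so another chat can
// take it.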

class LLModelStore {
public:
    static LLModelStore *globalInstance();

    LLModelInfo acquireModel(); // will block until llmodel is ready
    void releaseModel(LLModelInfo &&info); // must be called when you are done
    void destroy();

private:
    LLModelStore()
    {
        // seed with empty model
        m_availableModel = LLModelInfo();
    }
    ~LLModelStore() {}
    std::optional<LLModelInfo> m_availableModel;
    QMutex m_mutex;
    QWaitCondition m_condition;
    friend class MyLLModelStore;
};

class MyLLModelStore : public LLModelStore { };
Q_GLOBAL_STATIC(MyLLModelStore, storeInstance)
LLModelStore *LLModelStore::globalInstance()
{
    return storeInstance();
}

LLModelInfo LLModelStore::acquireModel()
{
    QMutexLocker locker(&m_mutex);
    while (!m_availableModel)
        m_condition.wait(locker.mutex());
    auto first = std::move(*m_availableModel);
    m_availableModel.reset();
    return first;
}

void LLModelStore::releaseModel(LLModelInfo &&info)
{
    QMutexLocker locker(&m_mutex);
    Q_ASSERT(!m_availableModel);
    m_availableModel = std::move(info);
    m_condition.wakeAll();
}

void LLModelStore::destroy()
{
    QMutexLocker locker(&m_mutex);
    m_availableModel.reset();
}

void LLModelInfo::resetModel(LlamaCppModel *cllm, ModelBackend *model)
{
    this->model.reset(model);
    fallbackReason.reset();
    emit cllm->loadedModelInfoChanged();
}
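
// Each LlamaCppModel lives on its own QThread (m_llmThread). The constructor moves the object to
// that thread and wires all model work through queued signal/slot connections, so the GUI thread
// never blocks on loading, prompting, or LocalDocs retrieval.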

LlamaCppModel::LlamaCppModel(Chat *parent, bool isServer)
    : m_promptResponseTokens(0)
    , m_promptTokens(0)
    , m_restoringFromText(false)
    , m_shouldBeLoaded(false)
    , m_markedForDeletion(false)
    , m_stopGenerating(false)
    , m_timer(nullptr)
    , m_isServer(isServer)
    , m_forceMetal(MySettings::globalInstance()->forceMetal())
    , m_reloadingToChangeVariant(false)
    , m_processedSystemPrompt(false)
    , m_restoreStateFromText(false)
{
    moveToThread(&m_llmThread);
    connect<void(LlamaCppModel::*)(bool), void(LlamaCppModel::*)(bool)>(
        this, &LlamaCppModel::requestLoadModel, this, &LlamaCppModel::loadModel
    );
    connect(this, &LlamaCppModel::requestReleaseModel, this, &LlamaCppModel::releaseModel);
    connect(this, &LlamaCppModel::trySwitchContextRequested, this, &LlamaCppModel::trySwitchContextOfLoadedModel,
            Qt::QueuedConnection); // explicitly queued
    connect(parent, &Chat::idChanged, this, &LlamaCppModel::handleChatIdChanged);
    connect(&m_llmThread, &QThread::started, this, &LlamaCppModel::handleThreadStarted);
    connect(MySettings::globalInstance(), &MySettings::forceMetalChanged, this, &LlamaCppModel::handleForceMetalChanged);
    connect(MySettings::globalInstance(), &MySettings::deviceChanged, this, &LlamaCppModel::handleDeviceChanged);

    // The following are blocking operations and will block the llm thread
    connect(this, &LlamaCppModel::requestRetrieveFromDB, LocalDocs::globalInstance()->database(), &Database::retrieveFromDB,
            Qt::BlockingQueuedConnection);

    m_llmThread.setObjectName(parent->id());
    m_llmThread.start();
}

LlamaCppModel::~LlamaCppModel()
{
    destroy();
}

void LlamaCppModel::destroy()
{
    m_stopGenerating = true;
    m_llmThread.quit();
    m_llmThread.wait();

    // The only time we should have a model loaded here is on shutdown
    // as we explicitly unload the model in all other circumstances
    if (isModelLoaded()) {
        m_llModelInfo.resetModel(this);
    }
}

void LlamaCppModel::destroyStore()
{
    LLModelStore::globalInstance()->destroy();
}

void LlamaCppModel::handleThreadStarted()
{
    m_timer = new TokenTimer(this);
    connect(m_timer, &TokenTimer::report, this, &LlamaCppModel::reportSpeed);
    emit threadStarted();
}

void LlamaCppModel::handleForceMetalChanged(bool forceMetal)
{
#if defined(Q_OS_MAC) && defined(__aarch64__)
    m_forceMetal = forceMetal;
    if (isModelLoaded() && m_shouldBeLoaded) {
        m_reloadingToChangeVariant = true;
        loadModel(/*reload*/ true);
        m_reloadingToChangeVariant = false;
    }
#endif
}

void LlamaCppModel::handleDeviceChanged()
{
    if (isModelLoaded() && m_shouldBeLoaded) {
        m_reloadingToChangeVariant = true;
        loadModel(/*reload*/ true);
        m_reloadingToChangeVariant = false;
    }
}

void LlamaCppModel::trySwitchContextOfLoadedModel(const ModelInfo &modelInfo)
{
    // We're trying to see if the store already has the model we wish to use fully loaded. If so,
    // we simply acquire it from the store, switch the context, and signal success. If the store
    // doesn't have it, or we're already loaded, or in any other case, we signal failure.

    // If we're already loaded, or a server, or we're reloading to change the variant/device, or the
    // modelInfo is empty, then this should fail
    if (
        isModelLoaded() || m_isServer || m_reloadingToChangeVariant || modelInfo.name().isEmpty() || !m_shouldBeLoaded
    ) {
        emit trySwitchContextOfLoadedModelCompleted(0);
        return;
    }

    QString filePath = modelInfo.dirpath + modelInfo.filename();
    QFileInfo fileInfo(filePath);

    acquireModel();
#if defined(DEBUG_MODEL_LOADING)
    qDebug() << "acquired model from store" << m_llmThread.objectName() << m_llModelInfo.model.get();
#endif

    // The store gave us no already-loaded model, or the wrong kind of model; give it back to the
    // store and fail
    if (!m_llModelInfo.model || m_llModelInfo.fileInfo != fileInfo || !m_shouldBeLoaded) {
        LLModelStore::globalInstance()->releaseModel(std::move(m_llModelInfo));
        emit trySwitchContextOfLoadedModelCompleted(0);
        return;
    }

#if defined(DEBUG_MODEL_LOADING)
    qDebug() << "store had our model" << m_llmThread.objectName() << m_llModelInfo.model.get();
#endif

    emit trySwitchContextOfLoadedModelCompleted(2);

    // Restore, signal and process
    restoreState();
    emit modelLoadingPercentageChanged(1.0f);
    emit trySwitchContextOfLoadedModelCompleted(0);
    processSystemPrompt();
}

bool LlamaCppModel::loadModel(const ModelInfo &modelInfo)
{
    // This is a complicated method because N different possible threads are interested in the outcome
    // of this method. Why? Because we have a main/gui thread trying to monitor the state of N different
    // possible chat threads all vying for a single resource - the currently loaded model - as the user
    // switches back and forth between chats. It is important for our main/gui thread to never block
    // but simultaneously always have up-to-date information with regards to which chat has the model
    // loaded and what the type and name of that model is. I've tried to comment extensively in this
    // method to provide an overview of what we're doing here.

    // We're already loaded with this model
    if (isModelLoaded() && this->modelInfo() == modelInfo)
        return true;

    // reset status
    emit modelLoadingPercentageChanged(std::numeric_limits<float>::min()); // small non-zero positive value
    emit modelLoadingError("");
    m_pristineLoadedState = false;

    QString filePath = modelInfo.dirpath + modelInfo.filename();
    QFileInfo fileInfo(filePath);

    // We have a live model, but it isn't the one we want
    bool alreadyAcquired = isModelLoaded();
    if (alreadyAcquired) {
        resetContext();
#if defined(DEBUG_MODEL_LOADING)
        qDebug() << "already acquired model deleted" << m_llmThread.objectName() << m_llModelInfo.model.get();
#endif
        m_llModelInfo.resetModel(this);
    } else if (!m_isServer) {
        // This is a blocking call that tries to retrieve the model we need from the model store.
        // If it succeeds, then we just have to restore state. If the store has never had a model
        // returned to it, then the modelInfo.model pointer should be null which will happen on startup
        acquireModel();
#if defined(DEBUG_MODEL_LOADING)
        qDebug() << "acquired model from store" << m_llmThread.objectName() << m_llModelInfo.model.get();
#endif
        // At this point it is possible that while we were blocked waiting to acquire the model from
        // the store, our state was changed to not be loaded. If this is the case, release the model
        // back into the store and quit loading
        if (!m_shouldBeLoaded) {
#if defined(DEBUG_MODEL_LOADING)
            qDebug() << "no longer need model" << m_llmThread.objectName() << m_llModelInfo.model.get();
#endif
            LLModelStore::globalInstance()->releaseModel(std::move(m_llModelInfo));
            emit modelLoadingPercentageChanged(0.0f);
            return false;
        }

        // Check if the store just gave us exactly the model we were looking for
        if (m_llModelInfo.model && m_llModelInfo.fileInfo == fileInfo && !m_reloadingToChangeVariant) {
#if defined(DEBUG_MODEL_LOADING)
            qDebug() << "store had our model" << m_llmThread.objectName() << m_llModelInfo.model.get();
#endif
            restoreState();
            emit modelLoadingPercentageChanged(1.0f);
            setModelInfo(modelInfo);
            Q_ASSERT(!m_modelInfo.filename().isEmpty());
            if (m_modelInfo.filename().isEmpty())
                emit modelLoadingError(u"Modelinfo is left null for %1"_s.arg(modelInfo.filename()));
            else
                processSystemPrompt();
            return true;
        } else {
            // Release the memory since we have to switch to a different model.
#if defined(DEBUG_MODEL_LOADING)
            qDebug() << "deleting model" << m_llmThread.objectName() << m_llModelInfo.model.get();
#endif
            m_llModelInfo.resetModel(this);
        }
    }

    // Guarantee we've released the previous model's memory
    Q_ASSERT(!m_llModelInfo.model);

    // Store the file info in m_llModelInfo in case we have an error loading
    m_llModelInfo.fileInfo = fileInfo;

    if (fileInfo.exists()) {
        QVariantMap modelLoadProps;
        if (modelInfo.isOnline) {
            QString apiKey;
            QString requestUrl;
            QString modelName;
            {
                QFile file(filePath);
                bool success = file.open(QIODeviceBase::ReadOnly);
                (void)success;
                Q_ASSERT(success);
                QJsonDocument doc = QJsonDocument::fromJson(file.readAll());
                QJsonObject obj = doc.object();
                apiKey = obj["apiKey"].toString();
                modelName = obj["modelName"].toString();
                if (modelInfo.isCompatibleApi) {
                    QString baseUrl(obj["baseUrl"].toString());
                    QUrl apiUrl(QUrl::fromUserInput(baseUrl));
                    if (!Network::isHttpUrlValid(apiUrl)) {
                        return false;
                    }
                    QString currentPath(apiUrl.path());
                    QString suffixPath("%1/chat/completions");
                    apiUrl.setPath(suffixPath.arg(currentPath));
                    requestUrl = apiUrl.toString();
                } else {
                    requestUrl = modelInfo.url();
                }
            }
            m_llModelType = LLModelType::API_;
            ChatAPI *model = new ChatAPI();
            model->setModelName(modelName);
            model->setRequestURL(requestUrl);
            model->setAPIKey(apiKey);
            m_llModelInfo.resetModel(this, model);
        } else if (!loadNewModel(modelInfo, modelLoadProps)) {
            return false; // m_shouldBeLoaded became false
        }
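
        // For reference, the JSON config read above for an online (API) model provides the fields
        // used in this branch. A minimal sketch (illustrative values only; any extra fields in the
        // file are ignored here):
        //   {
        //     "apiKey":    "sk-...",
        //     "modelName": "my-model",
        //     "baseUrl":   "https://api.example.com/v1"   // consulted only when isCompatibleApi
        //   }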
#if defined(DEBUG_MODEL_LOADING)
        qDebug() << "new model" << m_llmThread.objectName() << m_llModelInfo.model.get();
#endif
        restoreState();
#if defined(DEBUG)
        qDebug() << "modelLoadedChanged" << m_llmThread.objectName();
        fflush(stdout);
#endif
        emit modelLoadingPercentageChanged(isModelLoaded() ? 1.0f : 0.0f);
        emit loadedModelInfoChanged();

        modelLoadProps.insert("requestedDevice", MySettings::globalInstance()->device());
        modelLoadProps.insert("model", modelInfo.filename());
        Network::globalInstance()->trackChatEvent("model_load", modelLoadProps);
    } else {
        if (!m_isServer)
            LLModelStore::globalInstance()->releaseModel(std::move(m_llModelInfo)); // release back into the store
        resetModel();
        emit modelLoadingError(u"Could not find file for model %1"_s.arg(modelInfo.filename()));
    }

    if (m_llModelInfo.model) {
        setModelInfo(modelInfo);
        processSystemPrompt();
    }
    return bool(m_llModelInfo.model);
}

/* Returns false if the model should no longer be loaded (!m_shouldBeLoaded).
 * Otherwise returns true, even on error. */
bool LlamaCppModel::loadNewModel(const ModelInfo &modelInfo, QVariantMap &modelLoadProps)
{
    QElapsedTimer modelLoadTimer;
    modelLoadTimer.start();

    QString requestedDevice = MySettings::globalInstance()->device();
    int n_ctx = MySettings::globalInstance()->modelContextLength(modelInfo);
    m_ctx.n_ctx = n_ctx;
    int ngl = MySettings::globalInstance()->modelGpuLayers(modelInfo);

    std::string backend = "auto";
#ifdef Q_OS_MAC
    if (requestedDevice == "CPU") {
        backend = "cpu";
    } else if (m_forceMetal) {
#ifdef __aarch64__
        backend = "metal";
#endif
    }
#else // !defined(Q_OS_MAC)
    if (requestedDevice.startsWith("CUDA: "))
        backend = "cuda";
#endif

    QString filePath = modelInfo.dirpath + modelInfo.filename();

    auto construct = [this, &filePath, &modelInfo, &modelLoadProps, n_ctx](std::string const &backend) -> LlamaCppBackend * {
        LlamaCppBackend *lcppmodel;
        QString constructError;
        m_llModelInfo.resetModel(this);
        try {
            lcppmodel = LlamaCppBackendManager::construct(filePath.toStdString(), backend, n_ctx);
            m_llModelInfo.resetModel(this, lcppmodel);
        } catch (const LlamaCppBackendManager::MissingImplementationError &e) {
            modelLoadProps.insert("error", "missing_model_impl");
            constructError = e.what();
        } catch (const LlamaCppBackendManager::UnsupportedModelError &e) {
            modelLoadProps.insert("error", "unsupported_model_file");
            constructError = e.what();
        } catch (const LlamaCppBackendManager::BadArchError &e) {
            constructError = e.what();
            modelLoadProps.insert("error", "unsupported_model_arch");
            modelLoadProps.insert("model_arch", QString::fromStdString(e.arch()));
        }

        if (!m_llModelInfo.model) {
            if (!m_isServer)
                LLModelStore::globalInstance()->releaseModel(std::move(m_llModelInfo));
            resetModel();
            emit modelLoadingError(u"Error loading %1: %2"_s.arg(modelInfo.filename(), constructError));
            return nullptr;
        }

        lcppmodel->setProgressCallback([this](float progress) -> bool {
            progress = std::max(progress, std::numeric_limits<float>::min()); // keep progress above zero
            emit modelLoadingPercentageChanged(progress);
            return m_shouldBeLoaded;
        });
        return lcppmodel;
    };

    auto *lcppmodel = construct(backend);
    if (!lcppmodel)
        return true;

    if (lcppmodel->isModelBlacklisted(filePath.toStdString())) {
        static QSet<QString> warned;
        auto fname = modelInfo.filename();
        if (!warned.contains(fname)) {
            emit modelLoadingWarning(
                u"%1 is known to be broken. Please get a replacement via the download dialog."_s.arg(fname)
            );
            warned.insert(fname); // don't warn again until restart
        }
    }

    auto approxDeviceMemGB = [](const LlamaCppBackend::GPUDevice *dev) {
        float memGB = dev->heapSize / float(1024 * 1024 * 1024);
        return std::floor(memGB * 10.f) / 10.f; // truncate to 1 decimal place
    };

    std::vector<LlamaCppBackend::GPUDevice> availableDevices;
    const LlamaCppBackend::GPUDevice *defaultDevice = nullptr;
    {
        const size_t requiredMemory = lcppmodel->requiredMem(filePath.toStdString(), n_ctx, ngl);
        availableDevices = lcppmodel->availableGPUDevices(requiredMemory);
        // Pick the best device
        // NB: relies on the fact that Kompute devices are listed first
        if (!availableDevices.empty() && availableDevices.front().type == 2 /*a discrete gpu*/) {
            defaultDevice = &availableDevices.front();
            float memGB = defaultDevice->heapSize / float(1024 * 1024 * 1024);
            memGB = std::floor(memGB * 10.f) / 10.f; // truncate to 1 decimal place
            modelLoadProps.insert("default_device", QString::fromStdString(defaultDevice->name));
            modelLoadProps.insert("default_device_mem", approxDeviceMemGB(defaultDevice));
            modelLoadProps.insert("default_device_backend", QString::fromStdString(defaultDevice->backendName()));
        }
    }

    bool actualDeviceIsCPU = true;

#if defined(Q_OS_MAC) && defined(__aarch64__)
    if (lcppmodel->manager().buildVariant() == "metal")
        actualDeviceIsCPU = false;
#else
    if (requestedDevice != "CPU") {
        const auto *device = defaultDevice;
        if (requestedDevice != "Auto") {
            // Use the selected device
            for (const auto &d : availableDevices) {
                if (QString::fromStdString(d.selectionName()) == requestedDevice) {
                    device = &d;
                    break;
                }
            }
        }

        std::string unavail_reason;
        if (!device) {
            // GPU not available
        } else if (!lcppmodel->initializeGPUDevice(device->index, &unavail_reason)) {
            m_llModelInfo.fallbackReason = QString::fromStdString(unavail_reason);
        } else {
            actualDeviceIsCPU = false;
            modelLoadProps.insert("requested_device_mem", approxDeviceMemGB(device));
        }
    }
#endif

    bool success = lcppmodel->loadModel(filePath.toStdString(), n_ctx, ngl);

    if (!m_shouldBeLoaded) {
        m_llModelInfo.resetModel(this);
        if (!m_isServer)
            LLModelStore::globalInstance()->releaseModel(std::move(m_llModelInfo));
        resetModel();
        emit modelLoadingPercentageChanged(0.0f);
        return false;
    }
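
    // What follows is the CPU fallback ladder: if we asked for a GPU but loadModel() failed
    // (typically out of VRAM), record a fallback reason and retry on the CPU; for CUDA that means
    // reconstructing the backend with "auto", since ngl=0 alone would still offload matmuls.
    // If the load succeeded but the backend stayed on the CPU anyway, the model or quant has no
    // GPU support and we only record the reason.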

    if (actualDeviceIsCPU) {
        // we asked llama.cpp to use the CPU
    } else if (!success) {
        // llama_init_from_file returned nullptr
        m_llModelInfo.fallbackReason = "GPU loading failed (out of VRAM?)";
        modelLoadProps.insert("cpu_fallback_reason", "gpu_load_failed");

        // For CUDA, make sure we don't use the GPU at all - ngl=0 still offloads matmuls
        if (backend == "cuda") {
            lcppmodel = construct("auto");
            if (!lcppmodel)
                return true;
        }

        success = lcppmodel->loadModel(filePath.toStdString(), n_ctx, 0);

        if (!m_shouldBeLoaded) {
            m_llModelInfo.resetModel(this);
            if (!m_isServer)
                LLModelStore::globalInstance()->releaseModel(std::move(m_llModelInfo));
            resetModel();
            emit modelLoadingPercentageChanged(0.0f);
            return false;
        }
    } else if (!lcppmodel->usingGPUDevice()) {
        // ggml_vk_init was not called in llama.cpp
        // We might have had to fallback to CPU after load if the model is not possible to accelerate
        // for instance if the quantization method is not supported on Vulkan yet
        m_llModelInfo.fallbackReason = "model or quant has no GPU support";
        modelLoadProps.insert("cpu_fallback_reason", "gpu_unsupported_model");
    }

    if (!success) {
        m_llModelInfo.resetModel(this);
        if (!m_isServer)
            LLModelStore::globalInstance()->releaseModel(std::move(m_llModelInfo));
        resetModel();
        emit modelLoadingError(u"Could not load model due to invalid model file for %1"_s.arg(modelInfo.filename()));
        modelLoadProps.insert("error", "loadmodel_failed");
        return true;
    }

    switch (lcppmodel->manager().modelType()[0]) {
    case 'L': m_llModelType = LLModelType::LLAMA_; break;
    default:
        {
            m_llModelInfo.resetModel(this);
            if (!m_isServer)
                LLModelStore::globalInstance()->releaseModel(std::move(m_llModelInfo));
            resetModel();
            emit modelLoadingError(u"Could not determine model type for %1"_s.arg(modelInfo.filename()));
        }
    }

    modelLoadProps.insert("$duration", modelLoadTimer.elapsed() / 1000.);
    return true;
}

bool LlamaCppModel::isModelLoaded() const
{
    return m_llModelInfo.model && m_llModelInfo.model->isModelLoaded();
}

// FIXME(jared): we don't actually have to re-decode the prompt to generate a new response
void LlamaCppModel::regenerateResponse()
{
    // ChatGPT uses a different semantic meaning for n_past than local models. For ChatGPT, n_past
    // counts prompt/response pairs rather than total tokens.
    if (m_llModelType == LLModelType::API_)
        m_ctx.n_past -= 1;
    else
        m_ctx.n_past -= m_promptResponseTokens;
    m_ctx.n_past = std::max(0, m_ctx.n_past);
    m_ctx.tokens.erase(m_ctx.tokens.end() - m_promptResponseTokens, m_ctx.tokens.end());
    m_promptResponseTokens = 0;
    m_promptTokens = 0;
    m_response = std::string();
    emit responseChanged(QString::fromStdString(m_response));
}

void LlamaCppModel::resetResponse()
{
    m_promptTokens = 0;
    m_promptResponseTokens = 0;
    m_response = std::string();
    emit responseChanged(QString::fromStdString(m_response));
}

void LlamaCppModel::resetContext()
{
    resetResponse();
    m_processedSystemPrompt = false;
    m_ctx = ModelBackend::PromptContext();
}

QString LlamaCppModel::response() const
{
    return QString::fromStdString(remove_leading_whitespace(m_response));
}

void LlamaCppModel::setModelInfo(const ModelInfo &modelInfo)
{
    m_modelInfo = modelInfo;
    emit modelInfoChanged(modelInfo);
}

void LlamaCppModel::acquireModel()
{
    m_llModelInfo = LLModelStore::globalInstance()->acquireModel();
    emit loadedModelInfoChanged();
}

void LlamaCppModel::resetModel()
{
    m_llModelInfo = {};
    emit loadedModelInfoChanged();
}

void LlamaCppModel::modelChangeRequested(const ModelInfo &modelInfo)
{
    m_shouldBeLoaded = true;
    loadModel(modelInfo);
}
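
// The token callbacks below are handed to ModelBackend::prompt() as the prompt-processing and
// response hooks; returning false from either tells the backend to stop early, which is how
// m_stopGenerating (the Stop button) interrupts a generation in progress.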

bool LlamaCppModel::handlePrompt(int32_t token)
{
    // m_promptResponseTokens counts only the last prompt/response, not the entire context window,
    // so we can reset it when regenerating a response
#if defined(DEBUG)
    qDebug() << "prompt process" << m_llmThread.objectName() << token;
#endif
    ++m_promptTokens;
    ++m_promptResponseTokens;
    m_timer->start();
    return !m_stopGenerating;
}

bool LlamaCppModel::handleResponse(int32_t token, const std::string &response)
{
#if defined(DEBUG)
    printf("%s", response.c_str());
    fflush(stdout);
#endif

    // check for error
    if (token < 0) {
        m_response.append(response);
        emit responseChanged(QString::fromStdString(remove_leading_whitespace(m_response)));
        return false;
    }

    // m_promptResponseTokens counts only the last prompt/response, not the entire context window,
    // so we can reset it when regenerating a response
    ++m_promptResponseTokens;
    m_timer->inc();
    Q_ASSERT(!response.empty());
    m_response.append(response);
    emit responseChanged(QString::fromStdString(remove_leading_whitespace(m_response)));
    return !m_stopGenerating;
}

bool LlamaCppModel::prompt(const QList<QString> &collectionList, const QString &prompt)
{
    if (m_restoreStateFromText) {
        Q_ASSERT(m_state.isEmpty());
        processRestoreStateFromText();
    }

    if (!m_processedSystemPrompt)
        processSystemPrompt();
    const QString promptTemplate = MySettings::globalInstance()->modelPromptTemplate(m_modelInfo);
    const int32_t n_predict = MySettings::globalInstance()->modelMaxLength(m_modelInfo);
    const int32_t top_k = MySettings::globalInstance()->modelTopK(m_modelInfo);
    const float top_p = MySettings::globalInstance()->modelTopP(m_modelInfo);
    const float min_p = MySettings::globalInstance()->modelMinP(m_modelInfo);
    const float temp = MySettings::globalInstance()->modelTemperature(m_modelInfo);
    const int32_t n_batch = MySettings::globalInstance()->modelPromptBatchSize(m_modelInfo);
    const float repeat_penalty = MySettings::globalInstance()->modelRepeatPenalty(m_modelInfo);
    const int32_t repeat_penalty_tokens = MySettings::globalInstance()->modelRepeatPenaltyTokens(m_modelInfo);
    return promptInternal(collectionList, prompt, promptTemplate, n_predict, top_k, top_p, min_p, temp, n_batch,
        repeat_penalty, repeat_penalty_tokens);
}

bool LlamaCppModel::promptInternal(const QList<QString> &collectionList, const QString &prompt, const QString &promptTemplate,
    int32_t n_predict, int32_t top_k, float top_p, float min_p, float temp, int32_t n_batch, float repeat_penalty,
    int32_t repeat_penalty_tokens)
{
    if (!isModelLoaded())
        return false;

    QList<ResultInfo> databaseResults;
    const int retrievalSize = MySettings::globalInstance()->localDocsRetrievalSize();
    if (!collectionList.isEmpty()) {
        emit requestRetrieveFromDB(collectionList, prompt, retrievalSize, &databaseResults); // blocks
        emit databaseResultsChanged(databaseResults);
    }

    // Augment the prompt template with the results if any
    QString docsContext;
    if (!databaseResults.isEmpty()) {
        QStringList results;
        for (const ResultInfo &info : databaseResults)
            results << u"Collection: %1\nPath: %2\nExcerpt: %3"_s.arg(info.collection, info.path, info.text);

        // FIXME(jared): use a Jinja prompt template instead of hardcoded Alpaca-style localdocs template
        docsContext = u"### Context:\n%1\n\n"_s.arg(results.join("\n\n"));
    }

    int n_threads = MySettings::globalInstance()->threadCount();

    m_stopGenerating = false;
    auto promptFunc = std::bind(&LlamaCppModel::handlePrompt, this, std::placeholders::_1);
    auto responseFunc = std::bind(&LlamaCppModel::handleResponse, this, std::placeholders::_1,
        std::placeholders::_2);
    emit promptProcessing();
    m_ctx.n_predict = n_predict;
    m_ctx.top_k = top_k;
    m_ctx.top_p = top_p;
    m_ctx.min_p = min_p;
    m_ctx.temp = temp;
    m_ctx.n_batch = n_batch;
    m_ctx.repeat_penalty = repeat_penalty;
    m_ctx.repeat_last_n = repeat_penalty_tokens;

    if (auto *lcppmodel = dynamic_cast<LlamaCppBackend *>(m_llModelInfo.model.get()))
        lcppmodel->setThreadCount(n_threads);

#if defined(DEBUG)
    printf("%s", qPrintable(prompt));
    fflush(stdout);
#endif

    QElapsedTimer totalTime;
    totalTime.start();
    m_timer->start();
    if (!docsContext.isEmpty()) {
        auto old_n_predict = std::exchange(m_ctx.n_predict, 0); // decode localdocs context without a response
        m_llModelInfo.model->prompt(docsContext.toStdString(), "%1", promptFunc, responseFunc,
            /*allowContextShift*/ true, m_ctx);
        m_ctx.n_predict = old_n_predict; // now we are ready for a response
    }
    m_llModelInfo.model->prompt(prompt.toStdString(), promptTemplate.toStdString(), promptFunc, responseFunc,
        /*allowContextShift*/ true, m_ctx);
#if defined(DEBUG)
    printf("\n");
    fflush(stdout);
#endif
    m_timer->stop();
    qint64 elapsed = totalTime.elapsed();
    std::string trimmed = trim_whitespace(m_response);
    if (trimmed != m_response) {
        m_response = trimmed;
        emit responseChanged(QString::fromStdString(m_response));
    }

    SuggestionMode mode = MySettings::globalInstance()->suggestionMode();
    if (mode == SuggestionMode::On || (!databaseResults.isEmpty() && mode == SuggestionMode::LocalDocsOnly))
        generateQuestions(elapsed);
    else
        emit responseStopped(elapsed);

    m_pristineLoadedState = false;
    return true;
}
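
// The *Async methods below are the entry points used from other threads (e.g. the GUI): they only
// flip the atomic m_shouldBeLoaded flag and emit a signal, so the actual load/unload work runs on
// this object's own LLM thread via the connections set up in the constructor.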

void LlamaCppModel::loadModelAsync(bool reload)
{
    m_shouldBeLoaded = true; // atomic
    emit requestLoadModel(reload);
}

void LlamaCppModel::releaseModelAsync(bool unload)
{
    m_shouldBeLoaded = false; // atomic
    emit requestReleaseModel(unload);
}

void LlamaCppModel::requestTrySwitchContext()
{
    m_shouldBeLoaded = true; // atomic
    emit trySwitchContextRequested(modelInfo());
}

void LlamaCppModel::loadModel(bool reload)
{
    Q_ASSERT(m_shouldBeLoaded);
    if (m_isServer)
        return; // the server manages models directly

    if (reload)
        releaseModel(/*unload*/ true);
    else if (isModelLoaded())
        return; // already loaded

#if defined(DEBUG_MODEL_LOADING)
    qDebug() << "loadModel" << m_llmThread.objectName() << m_llModelInfo.model.get();
#endif
    ModelInfo m = modelInfo();
    if (m.name().isEmpty()) {
        ModelInfo defaultModel = ModelList::globalInstance()->defaultModelInfo();
        if (defaultModel.filename().isEmpty()) {
            emit modelLoadingError(u"Could not find any model to load"_s);
            return;
        }
        m = defaultModel;
    }
    loadModel(m);
}

void LlamaCppModel::releaseModel(bool unload)
{
    if (!isModelLoaded() || m_isServer)
        return;

    if (unload && m_shouldBeLoaded) {
        // reloading the model, don't show unloaded status
        emit modelLoadingPercentageChanged(std::numeric_limits<float>::min()); // small positive value
    } else {
        emit modelLoadingPercentageChanged(0.0f);
    }

    if (!m_markedForDeletion)
        saveState();

#if defined(DEBUG_MODEL_LOADING)
    qDebug() << "unloadModel" << m_llmThread.objectName() << m_llModelInfo.model.get();
#endif

    if (unload) {
        m_llModelInfo.resetModel(this);
    }

    LLModelStore::globalInstance()->releaseModel(std::move(m_llModelInfo));
    m_pristineLoadedState = false;
}
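
// Chat naming: generateName() runs the chat-name prompt against a copy of the current
// PromptContext so the real conversation context is not advanced, and handleNameResponse()
// stops generation once more than three words have been produced.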

void LlamaCppModel::generateName()
{
    Q_ASSERT(isModelLoaded());
    if (!isModelLoaded())
        return;

    const QString chatNamePrompt = MySettings::globalInstance()->modelChatNamePrompt(m_modelInfo);
    if (chatNamePrompt.trimmed().isEmpty()) {
        qWarning() << "LlamaCppModel: not generating chat name because prompt is empty";
        return;
    }

    auto promptTemplate = MySettings::globalInstance()->modelPromptTemplate(m_modelInfo);
    auto promptFunc = std::bind(&LlamaCppModel::handleNamePrompt, this, std::placeholders::_1);
    auto responseFunc = std::bind(&LlamaCppModel::handleNameResponse, this, std::placeholders::_1, std::placeholders::_2);
    ModelBackend::PromptContext ctx = m_ctx;
    m_llModelInfo.model->prompt(chatNamePrompt.toStdString(), promptTemplate.toStdString(),
        promptFunc, responseFunc, /*allowContextShift*/ false, ctx);
    std::string trimmed = trim_whitespace(m_nameResponse);
    if (trimmed != m_nameResponse) {
        m_nameResponse = trimmed;
        emit generatedNameChanged(QString::fromStdString(m_nameResponse));
    }
    m_pristineLoadedState = false;
}

void LlamaCppModel::handleChatIdChanged(const QString &id)
{
    m_llmThread.setObjectName(id);
}

bool LlamaCppModel::handleNamePrompt(int32_t token)
{
#if defined(DEBUG)
    qDebug() << "name prompt" << m_llmThread.objectName() << token;
#endif
    Q_UNUSED(token);
    return !m_stopGenerating;
}

bool LlamaCppModel::handleNameResponse(int32_t token, const std::string &response)
{
#if defined(DEBUG)
    qDebug() << "name response" << m_llmThread.objectName() << token << response;
#endif
    Q_UNUSED(token);

    m_nameResponse.append(response);
    emit generatedNameChanged(QString::fromStdString(m_nameResponse));
    QString gen = QString::fromStdString(m_nameResponse).simplified();
    QStringList words = gen.split(' ', Qt::SkipEmptyParts);
    return words.size() <= 3;
}

bool LlamaCppModel::handleQuestionPrompt(int32_t token)
{
#if defined(DEBUG)
    qDebug() << "question prompt" << m_llmThread.objectName() << token;
#endif
    Q_UNUSED(token);
    return !m_stopGenerating;
}

bool LlamaCppModel::handleQuestionResponse(int32_t token, const std::string &response)
{
#if defined(DEBUG)
    qDebug() << "question response" << m_llmThread.objectName() << token << response;
#endif
    Q_UNUSED(token);

    // add token to buffer
    m_questionResponse.append(response);

    // match whole question sentences
    // FIXME: This only works when the model responds in English, which is not ideal for a
    // multilingual model.
    static const QRegularExpression reQuestion(R"(\b(What|Where|How|Why|When|Who|Which|Whose|Whom)\b[^?]*\?)");

    // extract all questions from response
    int lastMatchEnd = -1;
    for (const auto &match : reQuestion.globalMatch(m_questionResponse)) {
        lastMatchEnd = match.capturedEnd();
        emit generatedQuestionFinished(match.captured());
    }

    // remove processed input from buffer
    if (lastMatchEnd != -1)
        m_questionResponse.erase(m_questionResponse.cbegin(), m_questionResponse.cbegin() + lastMatchEnd);

    return true;
}

void LlamaCppModel::generateQuestions(qint64 elapsed)
{
    Q_ASSERT(isModelLoaded());
    if (!isModelLoaded()) {
        emit responseStopped(elapsed);
        return;
    }

    const std::string suggestedFollowUpPrompt = MySettings::globalInstance()->modelSuggestedFollowUpPrompt(m_modelInfo).toStdString();
    if (QString::fromStdString(suggestedFollowUpPrompt).trimmed().isEmpty()) {
        emit responseStopped(elapsed);
        return;
    }

    emit generatingQuestions();
    m_questionResponse.clear();
    auto promptTemplate = MySettings::globalInstance()->modelPromptTemplate(m_modelInfo);
    auto promptFunc = std::bind(&LlamaCppModel::handleQuestionPrompt, this, std::placeholders::_1);
    auto responseFunc = std::bind(&LlamaCppModel::handleQuestionResponse, this, std::placeholders::_1, std::placeholders::_2);
    ModelBackend::PromptContext ctx = m_ctx;
    QElapsedTimer totalTime;
    totalTime.start();
    m_llModelInfo.model->prompt(suggestedFollowUpPrompt, promptTemplate.toStdString(), promptFunc, responseFunc,
        /*allowContextShift*/ false, ctx);
    elapsed += totalTime.elapsed();
    emit responseStopped(elapsed);
}

bool LlamaCppModel::handleSystemPrompt(int32_t token)
{
#if defined(DEBUG)
    qDebug() << "system prompt" << m_llmThread.objectName() << token << m_stopGenerating;
#endif
    Q_UNUSED(token);
    return !m_stopGenerating;
}

bool LlamaCppModel::handleRestoreStateFromTextPrompt(int32_t token)
{
#if defined(DEBUG)
    qDebug() << "restore state from text prompt" << m_llmThread.objectName() << token << m_stopGenerating;
#endif
    Q_UNUSED(token);
    return !m_stopGenerating;
}

// this function serializes the cached model state to disk.
// we want to also serialize n_ctx, and read it at load time.
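// For orientation, the stream layout written below (and read back by deserialize()) is:
//   version > 1 : m_llModelType, then a per-type internal state version
//   always      : response text, generated chat name, m_promptResponseTokens
//   if KV saved : (version <= 3: legacy responseLogits int), n_past,
//                 (version >= 7: n_ctx), token count plus raw token buffer,
//                 and the qCompress()'d backend state blob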
bool LlamaCppModel::serialize(QDataStream &stream, int version, bool serializeKV)
{
    if (version > 1) {
        stream << m_llModelType;
        switch (m_llModelType) {
        case GPTJ_: stream << GPTJ_INTERNAL_STATE_VERSION; break;
        case LLAMA_: stream << LLAMA_INTERNAL_STATE_VERSION; break;
        default: Q_UNREACHABLE();
        }
    }
    stream << response();
    stream << generatedName();
    stream << m_promptResponseTokens;

    if (!serializeKV) {
#if defined(DEBUG)
        qDebug() << "serialize" << m_llmThread.objectName() << m_state.size();
#endif
        return stream.status() == QDataStream::Ok;
    }

    if (version <= 3) {
        int responseLogits = 0;
        stream << responseLogits;
    }
    stream << m_ctx.n_past;
    if (version >= 7) {
        stream << m_ctx.n_ctx;
    }
    stream << quint64(m_ctx.tokens.size());
    stream.writeRawData(reinterpret_cast<const char*>(m_ctx.tokens.data()), m_ctx.tokens.size() * sizeof(int));
    saveState();
    QByteArray compressed = qCompress(m_state);
    stream << compressed;
#if defined(DEBUG)
    qDebug() << "serialize" << m_llmThread.objectName() << m_state.size();
#endif
    return stream.status() == QDataStream::Ok;
}

bool LlamaCppModel::deserialize(QDataStream &stream, int version, bool deserializeKV, bool discardKV)
{
    if (version > 1) {
        int internalStateVersion;
        stream >> m_llModelType;
        stream >> internalStateVersion; // for future use
    }
    QString response;
    stream >> response;
    m_response = response.toStdString();
    QString nameResponse;
    stream >> nameResponse;
    m_nameResponse = nameResponse.toStdString();
    stream >> m_promptResponseTokens;

    // If we do not deserialize the KV or it is discarded, then we need to restore the state from the
    // text only. This will be a costly operation, but the chat has to be restored from the text archive
    // alone.
    if (!deserializeKV || discardKV) {
        m_restoreStateFromText = true;
        m_pristineLoadedState = true;
    }

    if (!deserializeKV) {
#if defined(DEBUG)
        qDebug() << "deserialize" << m_llmThread.objectName();
#endif
        return stream.status() == QDataStream::Ok;
    }

    if (version <= 3) {
        int responseLogits;
        stream >> responseLogits;
    }

    int32_t n_past;
    stream >> n_past;
    if (!discardKV) m_ctx.n_past = n_past;

    if (version >= 7) {
        uint32_t n_ctx;
        stream >> n_ctx;
        if (!discardKV) m_ctx.n_ctx = n_ctx;
    }

    if (version < 9) {
        quint64 logitsSize;
        stream >> logitsSize;
        stream.skipRawData(logitsSize * sizeof(float));
    }

    quint64 tokensSize;
    stream >> tokensSize;
    if (!discardKV) {
        m_ctx.tokens.resize(tokensSize);
        stream.readRawData(reinterpret_cast<char*>(m_ctx.tokens.data()), tokensSize * sizeof(int));
    } else {
        stream.skipRawData(tokensSize * sizeof(int));
    }

    if (version > 0) {
        QByteArray compressed;
        stream >> compressed;
        if (!discardKV)
            m_state = qUncompress(compressed);
    } else {
        if (!discardKV) {
            stream >> m_state;
        } else {
            QByteArray state;
            stream >> state;
        }
    }

#if defined(DEBUG)
    qDebug() << "deserialize" << m_llmThread.objectName();
#endif
    return stream.status() == QDataStream::Ok;
}
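
// m_pristineLoadedState, as used below, marks that the in-memory backend state still matches what
// was last restored or deserialized, so saveState() can skip re-serializing it; it is cleared
// whenever new generation happens.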

void LlamaCppModel::saveState()
{
    if (!isModelLoaded() || m_pristineLoadedState)
        return;

    if (m_llModelType == LLModelType::API_) {
        m_state.clear();
        QDataStream stream(&m_state, QIODeviceBase::WriteOnly);
        stream.setVersion(QDataStream::Qt_6_4);
        ChatAPI *chatAPI = static_cast<ChatAPI*>(m_llModelInfo.model.get());
        stream << chatAPI->context();
        return;
    }

    const size_t stateSize = m_llModelInfo.model->stateSize();
    m_state.resize(stateSize);
#if defined(DEBUG)
    qDebug() << "saveState" << m_llmThread.objectName() << "size:" << m_state.size();
#endif
    m_llModelInfo.model->saveState(static_cast<uint8_t*>(reinterpret_cast<void*>(m_state.data())));
}

void LlamaCppModel::restoreState()
{
    if (!isModelLoaded())
        return;

    if (m_llModelType == LLModelType::API_) {
        QDataStream stream(&m_state, QIODeviceBase::ReadOnly);
        stream.setVersion(QDataStream::Qt_6_4);
        ChatAPI *chatAPI = static_cast<ChatAPI*>(m_llModelInfo.model.get());
        QList<QString> context;
        stream >> context;
        chatAPI->setContext(context);
        m_state.clear();
        m_state.squeeze();
        return;
    }

#if defined(DEBUG)
    qDebug() << "restoreState" << m_llmThread.objectName() << "size:" << m_state.size();
#endif

    if (m_state.isEmpty())
        return;

    if (m_llModelInfo.model->stateSize() == m_state.size()) {
        m_llModelInfo.model->restoreState(static_cast<const uint8_t*>(reinterpret_cast<void*>(m_state.data())));
        m_processedSystemPrompt = true;
        m_pristineLoadedState = true;
    } else {
        qWarning() << "restoring state from text because" << m_llModelInfo.model->stateSize() << "!=" << m_state.size();
        m_restoreStateFromText = true;
    }

    // free local state copy unless unload is pending
    if (m_shouldBeLoaded) {
        m_state.clear();
        m_state.squeeze();
        m_pristineLoadedState = false;
    }
}

void LlamaCppModel::processSystemPrompt()
{
    Q_ASSERT(isModelLoaded());
    if (!isModelLoaded() || m_processedSystemPrompt || m_restoreStateFromText || m_isServer)
        return;

    const std::string systemPrompt = MySettings::globalInstance()->modelSystemPrompt(m_modelInfo).toStdString();
    if (QString::fromStdString(systemPrompt).trimmed().isEmpty()) {
        m_processedSystemPrompt = true;
        return;
    }

    // Start with a whole new context
    m_stopGenerating = false;
    m_ctx = ModelBackend::PromptContext();

    auto promptFunc = std::bind(&LlamaCppModel::handleSystemPrompt, this, std::placeholders::_1);

    const int32_t n_predict = MySettings::globalInstance()->modelMaxLength(m_modelInfo);
    const int32_t top_k = MySettings::globalInstance()->modelTopK(m_modelInfo);
    const float top_p = MySettings::globalInstance()->modelTopP(m_modelInfo);
    const float min_p = MySettings::globalInstance()->modelMinP(m_modelInfo);
    const float temp = MySettings::globalInstance()->modelTemperature(m_modelInfo);
    const int32_t n_batch = MySettings::globalInstance()->modelPromptBatchSize(m_modelInfo);
    const float repeat_penalty = MySettings::globalInstance()->modelRepeatPenalty(m_modelInfo);
    const int32_t repeat_penalty_tokens = MySettings::globalInstance()->modelRepeatPenaltyTokens(m_modelInfo);
    int n_threads = MySettings::globalInstance()->threadCount();
    m_ctx.n_predict = n_predict;
    m_ctx.top_k = top_k;
    m_ctx.top_p = top_p;
    m_ctx.min_p = min_p;
    m_ctx.temp = temp;
    m_ctx.n_batch = n_batch;
    m_ctx.repeat_penalty = repeat_penalty;
    m_ctx.repeat_last_n = repeat_penalty_tokens;

    if (auto *lcppmodel = dynamic_cast<LlamaCppBackend *>(m_llModelInfo.model.get()))
        lcppmodel->setThreadCount(n_threads);

#if defined(DEBUG)
    printf("%s", qPrintable(QString::fromStdString(systemPrompt)));
    fflush(stdout);
#endif

    auto old_n_predict = std::exchange(m_ctx.n_predict, 0); // decode system prompt without a response
    // use "%1%2" and not "%1" to avoid implicit whitespace
    m_llModelInfo.model->prompt(systemPrompt, "%1%2", promptFunc, nullptr, /*allowContextShift*/ true, m_ctx, true);
    m_ctx.n_predict = old_n_predict;
#if defined(DEBUG)
    printf("\n");
    fflush(stdout);
#endif

    m_processedSystemPrompt = m_stopGenerating == false;
    m_pristineLoadedState = false;
}

void LlamaCppModel::processRestoreStateFromText()
{
    Q_ASSERT(isModelLoaded());
    if (!isModelLoaded() || !m_restoreStateFromText || m_isServer)
        return;

    m_restoringFromText = true;
    emit restoringFromTextChanged();

    m_stopGenerating = false;
    m_ctx = ModelBackend::PromptContext();

    auto promptFunc = std::bind(&LlamaCppModel::handleRestoreStateFromTextPrompt, this, std::placeholders::_1);

    const QString promptTemplate = MySettings::globalInstance()->modelPromptTemplate(m_modelInfo);
    const int32_t n_predict = MySettings::globalInstance()->modelMaxLength(m_modelInfo);
    const int32_t top_k = MySettings::globalInstance()->modelTopK(m_modelInfo);
    const float top_p = MySettings::globalInstance()->modelTopP(m_modelInfo);
    const float min_p = MySettings::globalInstance()->modelMinP(m_modelInfo);
    const float temp = MySettings::globalInstance()->modelTemperature(m_modelInfo);
    const int32_t n_batch = MySettings::globalInstance()->modelPromptBatchSize(m_modelInfo);
    const float repeat_penalty = MySettings::globalInstance()->modelRepeatPenalty(m_modelInfo);
    const int32_t repeat_penalty_tokens = MySettings::globalInstance()->modelRepeatPenaltyTokens(m_modelInfo);
    int n_threads = MySettings::globalInstance()->threadCount();
    m_ctx.n_predict = n_predict;
    m_ctx.top_k = top_k;
    m_ctx.top_p = top_p;
    m_ctx.min_p = min_p;
    m_ctx.temp = temp;
    m_ctx.n_batch = n_batch;
    m_ctx.repeat_penalty = repeat_penalty;
    m_ctx.repeat_last_n = repeat_penalty_tokens;

    if (auto *lcppmodel = dynamic_cast<LlamaCppBackend *>(m_llModelInfo.model.get()))
        lcppmodel->setThreadCount(n_threads);
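
    // Replay each stored prompt/response pair to rebuild the KV cache from the text archive.
    // The saved response text is passed as the final argument to prompt() so the backend can
    // ingest it as the recorded reply instead of generating a new one (see ModelBackend::prompt
    // for the exact semantics of that parameter).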

    auto it = m_stateFromText.begin();
    while (it < m_stateFromText.end()) {
        auto &prompt = *it++;
        Q_ASSERT(prompt.first == "Prompt: ");
        Q_ASSERT(it < m_stateFromText.end());

        auto &response = *it++;
        Q_ASSERT(response.first != "Prompt: ");
        auto responseText = response.second.toStdString();

        m_llModelInfo.model->prompt(prompt.second.toStdString(), promptTemplate.toStdString(), promptFunc, nullptr,
            /*allowContextShift*/ true, m_ctx, false, &responseText);
    }

    if (!m_stopGenerating) {
        m_restoreStateFromText = false;
        m_stateFromText.clear();
    }

    m_restoringFromText = false;
    emit restoringFromTextChanged();

    m_pristineLoadedState = false;
}