gpt4all/gpt4all-chat/src/chatllm.h

#ifndef CHATLLM_H
#define CHATLLM_H

#include "database.h" // IWYU pragma: keep
#include "modellist.h"

#include <gpt4all-backend/llmodel.h>

#include <QByteArray>
#include <QElapsedTimer>
#include <QFileInfo>
#include <QList>
#include <QObject>
#include <QPair>
#include <QString>
#include <QThread>
#include <QVariantMap>
#include <QVector>
#include <QtGlobal>

#include <atomic>
#include <cstdint>
#include <memory>
#include <optional>
#include <string>

using namespace Qt::Literals::StringLiterals;

class QDataStream;

// Numeric tag identifying the kind of model.
// NOTE: these values are serialized to disk -- never renumber an entry and
// never hand a retired value to a new entry.
enum LLModelType {
    // retired (no longer used); the value stays reserved
    GPTJ_ = 0,
    LLAMA_ = 1,
    API_ = 2,
    // retired (no longer used); the value stays reserved
    BERT_ = 3,
};

class ChatLLM;

// Bundles a model instance with the file information and fallback status that belong to it.
struct LLModelInfo {
    // Owning handle to the model; empty when nothing is held.
    std::unique_ptr<LLModel> model;
    // File information kept alongside the model (presumably the model file on disk -- confirm in chatllm.cpp).
    QFileInfo fileInfo;
    // Empty optional when there is no fallback reason; ChatLLM::fallbackReason() maps that case to "".
    std::optional<QString> fallbackReason;

    // NOTE: This does not store the model type or name on purpose as this is left for ChatLLM which
    // must be able to serialize the information even if it is in the unloaded state

    // Defined out of line. Judging by the signature it replaces `model` with the given pointer
    // (nullptr by default) on behalf of `cllm` -- NOTE(review): confirm ownership transfer in chatllm.cpp.
    void resetModel(ChatLLM *cllm, LLModel *model = nullptr);
};

// Counts generated tokens and publishes the throughput as a display string.
//
// Call start() to reset the counters, inc() once per token, and stop() to force a final
// report. While tokens keep arriving, report() is also emitted whenever more than 999 ms
// have passed since the previous report.
class TokenTimer : public QObject {
    Q_OBJECT
public:
    explicit TokenTimer(QObject *parent)
        : QObject(parent) {}

    // Returns the rounded mean after folding newNumber in as the n-th sample.
    //   oldAvg    - mean of the previous n - 1 samples
    //   newNumber - the sample being added
    //   n         - 1-based count of samples including newNumber
    static int rollingAverage(int oldAvg, int newNumber, int n)
    {
        // Without a positive sample count there is no previous mean to weigh, and
        // dividing by n would feed inf/NaN into qRound(); the new sample is the mean.
        if (n <= 0)
            return newNumber;
        // i.e. to calculate the new average after the nth number,
        // you multiply the old average by n−1, add the new number, and divide the total by n.
        return qRound(((float(oldAvg) * (n - 1)) + newNumber) / float(n));
    }

    // Resets the token count and accumulated time; the clock starts on the next inc().
    void start() { m_tokens = 0; m_elapsed = 0; m_time.invalidate(); }
    // Emits a final report covering everything counted so far.
    void stop() { handleTimeout(); }
    // Records one token and emits a report once more than 999 ms have elapsed.
    void inc() {
        if (!m_time.isValid())
            m_time.start();
        ++m_tokens;
        if (m_time.elapsed() > 999)
            handleTimeout();
    }

Q_SIGNALS:
    // Carries the formatted speed, e.g. "12 tokens/sec".
    void report(const QString &speed);

private Q_SLOTS:
    void handleTimeout()
    {
        // stop() may arrive before any inc(), leaving the timer invalid; calling
        // restart() on an invalid QElapsedTimer is undefined behavior.
        if (m_time.isValid())
            m_elapsed += m_time.restart();
        // No measured time yet: report zero rather than dividing by zero (inf/NaN).
        const float speed = m_elapsed > 0 ? m_tokens / float(m_elapsed / 1000.0f) : 0.0f;
        emit report(u"%1 tokens/sec"_s.arg(speed, 0, 'g', 2));
    }

private:
    QElapsedTimer m_time;
    // Milliseconds accumulated since start().
    qint64 m_elapsed = 0;
    // Tokens counted since start(); initialized so inc() is safe even without start().
    quint32 m_tokens = 0;
};

class Chat;
// QObject that holds one model (m_llModelInfo), its prompt context and the response text
// generated so far, together with a QThread member (m_llmThread).
// It exposes restoringFromText, deviceBackend, device and fallbackReason as read-only
// Qt properties. All non-inline members are defined outside this header.
class ChatLLM : public QObject
{
    Q_OBJECT
    Q_PROPERTY(bool restoringFromText READ restoringFromText NOTIFY restoringFromTextChanged)
    Q_PROPERTY(QString deviceBackend READ deviceBackend NOTIFY loadedModelInfoChanged)
    Q_PROPERTY(QString device READ device NOTIFY loadedModelInfoChanged)
    Q_PROPERTY(QString fallbackReason READ fallbackReason NOTIFY loadedModelInfoChanged)
public:
    ChatLLM(Chat *parent, bool isServer = false);
    virtual ~ChatLLM();

    void destroy();
    static void destroyStore();
    bool isModelLoaded() const;
    void regenerateResponse();
    void resetResponse();
    void resetContext();

    // Raises the atomic stop flag; the code that reads it lives outside this header.
    void stopGenerating() { m_stopGenerating = true; }

    bool shouldBeLoaded() const { return m_shouldBeLoaded; }
    void setShouldBeLoaded(bool b);
    void requestTrySwitchContext();
    // The two setters below only store their flag; the code acting on them is not in this header.
    void setForceUnloadModel(bool b) { m_forceUnloadModel = b; }
    void setMarkedForDeletion(bool b) { m_markedForDeletion = b; }

    // NOTE(review): presumably `trim` selects between m_trimmedResponse and m_response -- confirm in chatllm.cpp.
    QString response(bool trim = true) const;

    ModelInfo modelInfo() const;
    void setModelInfo(const ModelInfo &info);

    bool restoringFromText() const { return m_restoringFromText; }

    void acquireModel();
    void resetModel();

    // Name of the loaded model's backend, translated through LLModel::GPUDevice::backendIdToName().
    // Returns a null QString when no model is loaded.
    QString deviceBackend() const
    {
        if (!isModelLoaded()) return QString();
        std::string name = LLModel::GPUDevice::backendIdToName(m_llModelInfo.model->backendName());
        return QString::fromStdString(name);
    }

    // Device name reported by the loaded model, or "CPU" when the model reports none (null pointer).
    // Returns a null QString when no model is loaded.
    QString device() const
    {
        if (!isModelLoaded()) return QString();
        const char *name = m_llModelInfo.model->gpuDeviceName();
        return name ? QString(name) : u"CPU"_s;
    }

    // not loaded -> QString(), no fallback -> QString("")
    QString fallbackReason() const
    {
        if (!isModelLoaded()) return QString();
        return m_llModelInfo.fallbackReason.value_or(u""_s);
    }

    // Current contents of m_nameResponse converted to a QString.
    QString generatedName() const { return QString::fromStdString(m_nameResponse); }

    bool serialize(QDataStream &stream, int version, bool serializeKV);
    bool deserialize(QDataStream &stream, int version, bool deserializeKV, bool discardKV);
    // Stores a copy of the given string pairs in m_stateFromText.
    // NOTE(review): the meaning of each pair (likely prompt/response) is not visible here -- confirm at the caller.
    void setStateFromText(const QVector<QPair<QString, QString>> &stateFromText) { m_stateFromText = stateFromText; }

public Q_SLOTS:
    bool prompt(const QList<QString> &collectionList, const QString &prompt);
    bool loadDefaultModel();
    void trySwitchContextOfLoadedModel(const ModelInfo &modelInfo);
    bool loadModel(const ModelInfo &modelInfo);
    void modelChangeRequested(const ModelInfo &modelInfo);
    void unloadModel();
    void reloadModel();
    void generateName();
    void generateQuestions(qint64 elapsed);
    void handleChatIdChanged(const QString &id);
    void handleShouldBeLoadedChanged();
    void handleThreadStarted();
    void handleForceMetalChanged(bool forceMetal);
    void handleDeviceChanged();
    void processSystemPrompt();
    void processRestoreStateFromText();

Q_SIGNALS:
    void restoringFromTextChanged();
    // NOTIFY signal shared by the deviceBackend, device and fallbackReason properties.
    void loadedModelInfoChanged();
    void modelLoadingPercentageChanged(float);
    void modelLoadingError(const QString &error);
    void modelLoadingWarning(const QString &warning);
    void responseChanged(const QString &response);
    void promptProcessing();
    void generatingQuestions();
    void responseStopped(qint64 promptResponseMs);
    void generatedNameChanged(const QString &name);
    void generatedQuestionFinished(const QString &generatedQuestion);
    void stateChanged();
    void threadStarted();
    void shouldBeLoadedChanged();
    void trySwitchContextRequested(const ModelInfo &modelInfo);
    void trySwitchContextOfLoadedModelCompleted(int value);
    // `results` is a raw pointer argument, presumably an output list filled by the receiver --
    // NOTE(review): confirm the connection type, since that only works if the receiver runs before emit returns.
    void requestRetrieveFromDB(const QList<QString> &collections, const QString &text, int retrievalSize, QList<ResultInfo> *results);
    void reportSpeed(const QString &speed);
    void reportDevice(const QString &device);
    void reportFallbackReason(const QString &fallbackReason);
    void databaseResultsChanged(const QList<ResultInfo>&);
    void modelInfoChanged(const ModelInfo &modelInfo);

protected:
    // Variant of prompt() that takes the template and sampling parameters explicitly.
    // NOTE(review): the effect of `fakeReply` is not visible in this header -- see chatllm.cpp.
    bool promptInternal(const QList<QString> &collectionList, const QString &prompt, const QString &promptTemplate,
        int32_t n_predict, int32_t top_k, float top_p, float min_p, float temp, int32_t n_batch, float repeat_penalty,
        int32_t repeat_penalty_tokens, std::optional<QString> fakeReply = {});
    // Paired per-token handlers, one prompt/response pair per activity (chat, name, system prompt,
    // restore-from-text, questions). NOTE(review): the bool result presumably tells the backend
    // whether to continue -- confirm against LLModel's callback contract.
    bool handlePrompt(int32_t token);
    bool handleResponse(int32_t token, const std::string &response);
    bool handleNamePrompt(int32_t token);
    bool handleNameResponse(int32_t token, const std::string &response);
    bool handleSystemPrompt(int32_t token);
    bool handleSystemResponse(int32_t token, const std::string &response);
    bool handleRestoreStateFromTextPrompt(int32_t token);
    bool handleRestoreStateFromTextResponse(int32_t token, const std::string &response);
    bool handleQuestionPrompt(int32_t token);
    bool handleQuestionResponse(int32_t token, const std::string &response);
    void saveState();
    void restoreState();

protected:
    LLModel::PromptContext m_ctx;
    // These two counters have no in-class initializer; they must be set by the constructor.
    quint32 m_promptTokens;
    quint32 m_promptResponseTokens;

private:
    bool loadNewModel(const ModelInfo &modelInfo, QVariantMap &modelLoadProps);

    std::string m_response;
    std::string m_trimmedResponse;
    std::string m_nameResponse; // surfaced through generatedName()
    QString m_questionResponse;
    LLModelInfo m_llModelInfo;
    LLModelType m_llModelType;
    ModelInfo m_modelInfo;
    TokenTimer *m_timer;
    QByteArray m_state;
    QThread m_llmThread;
    // Atomic flags, each read or written by the inline accessors above.
    std::atomic<bool> m_stopGenerating;
    std::atomic<bool> m_shouldBeLoaded;
    std::atomic<bool> m_restoringFromText; // status indication
    std::atomic<bool> m_forceUnloadModel;
    std::atomic<bool> m_markedForDeletion;
    // Plain (non-atomic) flags; none has an in-class initializer.
    bool m_isServer;
    bool m_forceMetal;
    bool m_reloadingToChangeVariant;
    bool m_processedSystemPrompt;
    bool m_restoreStateFromText;
    // m_pristineLoadedState is set if saveState is unnecessary, either because:
    // - an unload was queued during LLModel::restoreState()
    // - the chat will be restored from text and hasn't been interacted with yet
    bool m_pristineLoadedState = false;
    QVector<QPair<QString, QString>> m_stateFromText;
};

#endif // CHATLLM_H
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 09:10:05 -04:00
+								#ifndef CHATLLM_H
 								#define CHATLLM_H
-												chat: don't use incomplete types with signals/slots/Q_INVOKABLE (#2408)

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
											
										
										
											2024-06-06 11:59:28 -04:00
+								#include "database.h" // IWYU pragma: keep
-												chat: fix #includes with include-what-you-use (#2401)

Also use qGuiApp instead of qApp.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
											
										
										
											2024-06-04 14:47:11 -04:00
+								#include "modellist.h"
-												repo: organize sources, headers, and deps into subdirectories (#2917)

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
											
										
										
											2024-08-27 17:22:40 -04:00
+								#include <gpt4all-backend/llmodel.h>
-												chat: fix #includes with include-what-you-use (#2401)

Also use qGuiApp instead of qApp.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
											
										
										
											2024-06-04 14:47:11 -04:00
 								#include <QByteArray>
 								#include <QElapsedTimer>
 								#include <QFileInfo>
-												chat: don't use incomplete types with signals/slots/Q_INVOKABLE (#2408)

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
											
										
										
											2024-06-06 11:59:28 -04:00
+								#include <QList>
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 09:10:05 -04:00
+								#include <QObject>
-												chat: fix #includes with include-what-you-use (#2401)

Also use qGuiApp instead of qApp.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
											
										
										
											2024-06-04 14:47:11 -04:00
+								#include <QPair>
 								#include <QString>
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 09:10:05 -04:00
+								#include <QThread>
-												UI and embedding device changes for GPT4All v3.0.0-rc3 (#2477)

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
											
										
										
											2024-06-28 12:57:57 -04:00
+								#include <QVariantMap>
-												chat: fix #includes with include-what-you-use (#2401)

Also use qGuiApp instead of qApp.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
											
										
										
											2024-06-04 14:47:11 -04:00
+								#include <QVector>
 								#include <QtGlobal>
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 09:10:05 -04:00
-												chat: fix #includes with include-what-you-use (#2401)

Also use qGuiApp instead of qApp.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
											
										
										
											2024-06-04 14:47:11 -04:00
+								#include <atomic>
 								#include <cstdint>
-												chat: fix issues with quickly switching between multiple chats (#2343)

* prevent load progress from getting out of sync with the current chat
* fix memory leak on exit if the LLModelStore contains a model
* do not report cancellation as a failure in console/Mixpanel
* show "waiting for model" separately from "switching context" in UI
* do not show lower "reload" button on error
* skip context switch if unload is pending
* skip unnecessary calls to LLModel::saveState

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
											
										
										
											2024-05-15 14:07:03 -04:00
+								#include <memory>
-												chat: fix blank device in UI and improve Mixpanel reporting (#2409)

Also remove LLModel::hasGPUDevice.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
											
										
										
											2024-06-26 15:26:27 -04:00
+								#include <optional>
-												chat: fix #includes with include-what-you-use (#2401)

Also use qGuiApp instead of qApp.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
											
										
										
											2024-06-04 14:47:11 -04:00
+								#include <string>
-												chat: fix issues with quickly switching between multiple chats (#2343)

* prevent load progress from getting out of sync with the current chat
* fix memory leak on exit if the LLModelStore contains a model
* do not report cancellation as a failure in console/Mixpanel
* show "waiting for model" separately from "switching context" in UI
* do not show lower "reload" button on error
* skip context switch if unload is pending
* skip unnecessary calls to LLModel::saveState

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
											
										
										
											2024-05-15 14:07:03 -04:00
-												chat: major UI redesign for v3.0.0 (#2396)

Signed-off-by: Adam Treat <treat.adam@gmail.com>
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
											
										
										
											2024-06-24 18:49:23 -04:00
+								using namespace Qt::Literals::StringLiterals;
-												chat: fix #includes with include-what-you-use (#2401)

Also use qGuiApp instead of qApp.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
											
										
										
											2024-06-04 14:47:11 -04:00
+								class QDataStream;
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 09:10:05 -04:00
-												chatllm: fix loading of chats after #2676 (#2693)

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
											
										
										
											2024-07-18 21:03:18 -04:00
+								// NOTE: values serialized to disk, do not change or reuse
-												Much better memory mgmt for multi-threaded model loading/unloading.

											
										
										
											2023-05-13 19:05:35 -04:00
+								enum LLModelType {
-												chatllm: fix loading of chats after #2676 (#2693)

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
											
										
										
											2024-07-18 21:03:18 -04:00
+								    GPTJ_  = 0, // no longer used
 								    LLAMA_ = 1,
 								    API_   = 2,
 								    BERT_  = 3, // no longer used
-												Much better memory mgmt for multi-threaded model loading/unloading.

											
										
										
											2023-05-13 19:05:35 -04:00
+								};
-												chat: fix blank device in UI and improve Mixpanel reporting (#2409)

Also remove LLModel::hasGPUDevice.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
											
										
										
											2024-06-26 15:26:27 -04:00
+								class ChatLLM;
-												Much better memory mgmt for multi-threaded model loading/unloading.

											
										
										
											2023-05-13 19:05:35 -04:00
+								struct LLModelInfo {
-												chat: fix issues with quickly switching between multiple chats (#2343)

* prevent load progress from getting out of sync with the current chat
* fix memory leak on exit if the LLModelStore contains a model
* do not report cancellation as a failure in console/Mixpanel
* show "waiting for model" separately from "switching context" in UI
* do not show lower "reload" button on error
* skip context switch if unload is pending
* skip unnecessary calls to LLModel::saveState

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
											
										
										
											2024-05-15 14:07:03 -04:00
+								    std::unique_ptr<LLModel> model;
-												Much better memory mgmt for multi-threaded model loading/unloading.

											
										
										
											2023-05-13 19:05:35 -04:00
+								    QFileInfo fileInfo;
-												chat: fix blank device in UI and improve Mixpanel reporting (#2409)

Also remove LLModel::hasGPUDevice.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
											
										
										
											2024-06-26 15:26:27 -04:00
+								    std::optional<QString> fallbackReason;
-												Much better memory mgmt for multi-threaded model loading/unloading.

											
										
										
											2023-05-13 19:05:35 -04:00
+								    // NOTE: This does not store the model type or name on purpose as this is left for ChatLLM which
 								    // must be able to serialize the information even if it is in the unloaded state
-												chat: fix blank device in UI and improve Mixpanel reporting (#2409)

Also remove LLModel::hasGPUDevice.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
											
										
										
											2024-06-26 15:26:27 -04:00
 								    void resetModel(ChatLLM *cllm, LLModel *model = nullptr);
-												Much better memory mgmt for multi-threaded model loading/unloading.

											
										
										
											2023-05-13 19:05:35 -04:00
+								};
-												Show token generation speed in gui. (#1020)


											
										
										
											2023-06-19 14:34:53 -04:00
+								class TokenTimer : public QObject {
 								    Q_OBJECT
 								public:
 								    explicit TokenTimer(QObject *parent)
 								        : QObject(parent)
 								        , m_elapsed(0) {}
 								    static int rollingAverage(int oldAvg, int newNumber, int n)
 								    {
 								        // i.e. to calculate the new average after then nth number,
 								        // you multiply the old average by n−1, add the new number, and divide the total by n.
 								        return qRound(((float(oldAvg) * (n - 1)) + newNumber) / float(n));
 								    }
 								    void start() { m_tokens = 0; m_elapsed = 0; m_time.invalidate(); }
 								    void stop() { handleTimeout(); }
 								    void inc() {
 								        if (!m_time.isValid())
 								            m_time.start();
 								        ++m_tokens;
 								        if (m_time.elapsed() > 999)
 								            handleTimeout();
 								    }
 								Q_SIGNALS:
 								    void report(const QString &speed);
 								private Q_SLOTS:
 								    void handleTimeout()
 								    {
 								        m_elapsed += m_time.restart();
-												chat: major UI redesign for v3.0.0 (#2396)

Signed-off-by: Adam Treat <treat.adam@gmail.com>
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
											
										
										
											2024-06-24 18:49:23 -04:00
+								        emit report(u"%1 tokens/sec"_s.arg(m_tokens / float(m_elapsed / 1000.0f), 0, 'g', 2));
-												Show token generation speed in gui. (#1020)


											
										
										
											2023-06-19 14:34:53 -04:00
+								    }
 								private:
 								    QElapsedTimer m_time;
 								    qint64 m_elapsed;
 								    quint32 m_tokens;
 								};
-												First attempt at providing a persistent chat list experience.

Limitations:

1) Context is not restored for gpt-j models
2) When you switch between different model types in an existing chat
   the context and all the conversation is lost
3) The settings are not chat or conversation specific
4) The sizes of the chat persisted files are very large due to how much
   data the llama.cpp backend tries to persist. Need to investigate how
   we can shrink this.

											
										
										
											2023-05-04 15:31:41 -04:00
+								class Chat;
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 09:10:05 -04:00
+								class ChatLLM : public QObject
 								{
 								    Q_OBJECT
-												chat: faster KV shift, continue generating, fix stop sequences (#2781)

* Don't stop generating at end of context
* Use llama_kv_cache ops to shift context
* Fix and improve reverse prompt detection
* Replace prompt recalc callback with a flag to disallow context shift
											
										
										
											2024-08-07 11:25:24 -04:00
+								    Q_PROPERTY(bool restoringFromText READ restoringFromText NOTIFY restoringFromTextChanged)
-												chat: fix blank device in UI and improve Mixpanel reporting (#2409)

Also remove LLModel::hasGPUDevice.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
											
										
										
											2024-06-26 15:26:27 -04:00
+								    Q_PROPERTY(QString deviceBackend READ deviceBackend NOTIFY loadedModelInfoChanged)
 								    Q_PROPERTY(QString device READ device NOTIFY loadedModelInfoChanged)
 								    Q_PROPERTY(QString fallbackReason READ fallbackReason NOTIFY loadedModelInfoChanged)
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 09:10:05 -04:00
+								public:
-												The server has different lifetime mgmt than the other chats.

											
										
										
											2023-05-13 19:33:19 -04:00
+								    ChatLLM(Chat *parent, bool isServer = false);
-												Cleanup the chatllm properly.

											
										
										
											2023-05-12 14:06:03 -04:00
+								    virtual ~ChatLLM();
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 09:10:05 -04:00
-												chat: join ChatLLM threads without calling destructors (#2043)

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
											
										
										
											2024-03-06 16:42:59 -05:00
+								    void destroy();
-												chat: fix issues with quickly switching between multiple chats (#2343)

* prevent load progress from getting out of sync with the current chat
* fix memory leak on exit if the LLModelStore contains a model
* do not report cancellation as a failure in console/Mixpanel
* show "waiting for model" separately from "switching context" in UI
* do not show lower "reload" button on error
* skip context switch if unload is pending
* skip unnecessary calls to LLModel::saveState

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
											
										
										
											2024-05-15 14:07:03 -04:00
+								    static void destroyStore();
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 09:10:05 -04:00
+								    bool isModelLoaded() const;
 								    void regenerateResponse();
 								    void resetResponse();
 								    void resetContext();
 								    void stopGenerating() { m_stopGenerating = true; }
-												Much better memory mgmt for multi-threaded model loading/unloading.

											
										
										
											2023-05-13 19:05:35 -04:00
+								    bool shouldBeLoaded() const { return m_shouldBeLoaded; }
 								    void setShouldBeLoaded(bool b);
-												chat: fix issues with quickly switching between multiple chats (#2343)

* prevent load progress from getting out of sync with the current chat
* fix memory leak on exit if the LLModelStore contains a model
* do not report cancellation as a failure in console/Mixpanel
* show "waiting for model" separately from "switching context" in UI
* do not show lower "reload" button on error
* skip context switch if unload is pending
* skip unnecessary calls to LLModel::saveState

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
											
										
										
											2024-05-15 14:07:03 -04:00
+								    void requestTrySwitchContext();
-												Complete revamp of model loading to allow for more discreet control by
the user of the models loading behavior.

Signed-off-by: Adam Treat <treat.adam@gmail.com>

											
										
										
											2024-02-07 09:37:59 -05:00
+								    void setForceUnloadModel(bool b) { m_forceUnloadModel = b; }
-												Fix for issue #2080 where the GUI appears to hang when a chat with a large
model is deleted. There is no reason to save the context for a chat that
is being deleted.

Signed-off-by: Adam Treat <treat.adam@gmail.com>

											
										
										
											2024-03-06 12:59:34 -05:00
+								    void setMarkedForDeletion(bool b) { m_markedForDeletion = b; }
-												Much better memory mgmt for multi-threaded model loading/unloading.

											
										
										
											2023-05-13 19:05:35 -04:00
-												server: improve correctness of request parsing and responses (#2929)

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
											
										
										
											2024-09-09 10:48:57 -04:00
+								    QString response(bool trim = true) const;
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 09:10:05 -04:00
-												Modellist temp

											
										
										
											2023-06-22 15:44:49 -04:00
+								    ModelInfo modelInfo() const;
 								    void setModelInfo(const ModelInfo &info);
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 09:10:05 -04:00
-												chat: faster KV shift, continue generating, fix stop sequences (#2781)

* Don't stop generating at end of context
* Use llama_kv_cache ops to shift context
* Fix and improve reverse prompt detection
* Replace prompt recalc callback with a flag to disallow context shift
											
										
										
											2024-08-07 11:25:24 -04:00
+								    bool restoringFromText() const { return m_restoringFromText; }
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 09:10:05 -04:00
-												chat: fix blank device in UI and improve Mixpanel reporting (#2409)

Also remove LLModel::hasGPUDevice.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
											
										
										
											2024-06-26 15:26:27 -04:00
+								    void acquireModel();
 								    void resetModel();
 								    QString deviceBackend() const
 								    {
 								        if (!isModelLoaded()) return QString();
 								        std::string name = LLModel::GPUDevice::backendIdToName(m_llModelInfo.model->backendName());
 								        return QString::fromStdString(name);
 								    }
 								    QString device() const
 								    {
 								        if (!isModelLoaded()) return QString();
 								        const char *name = m_llModelInfo.model->gpuDeviceName();
 								        return name ? QString(name) : u"CPU"_s;
 								    }
 								    // not loaded -> QString(), no fallback -> QString("")
 								    QString fallbackReason() const
 								    {
 								        if (!isModelLoaded()) return QString();
 								        return m_llModelInfo.fallbackReason.value_or(u""_s);
 								    }
-												Generate names via llm.

											
										
										
											2023-05-02 11:19:17 -04:00
+								    QString generatedName() const { return QString::fromStdString(m_nameResponse); }
-												Restore state from text if necessary.

											
										
										
											2023-10-10 16:43:02 -04:00
+								    bool serialize(QDataStream &stream, int version, bool serializeKV);
 								    bool deserialize(QDataStream &stream, int version, bool deserializeKV, bool discardKV);
 								    void setStateFromText(const QVector<QPair<QString, QString>> &stateFromText) { m_stateFromText = stateFromText; }
-												First attempt at providing a persistent chat list experience.

Limitations:

1) Context is not restored for gpt-j models
2) When you switch between different model types in an existing chat
   the context and all the conversation is lost
3) The settings are not chat or conversation specific
4) The sizes of the chat persisted files are very large due to how much
   data the llama.cpp backend tries to persist. Need to investigate how
   we can shrink this.

											
										
										
											2023-05-04 15:31:41 -04:00
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 09:10:05 -04:00
+								public Q_SLOTS:
-												Huge change that completely revamps the settings dialog and implements
per model settings as well as the ability to clone a model into a "character."
This also implements system prompts as well as quite a few bugfixes for
instance this fixes chatgpt.

											
										
										
											2023-07-01 11:34:21 -04:00
+								    bool prompt(const QList<QString> &collectionList, const QString &prompt);
-												First attempt at providing a persistent chat list experience.

Limitations:

1) Context is not restored for gpt-j models
2) When you switch between different model types in an existing chat
   the context and all the conversation is lost
3) The settings are not chat or conversation specific
4) The sizes of the chat persisted files are very large due to how much
   data the llama.cpp backend tries to persist. Need to investigate how
   we can shrink this.

											
										
										
											2023-05-04 15:31:41 -04:00
+								    bool loadDefaultModel();
-												chat: fix issues with quickly switching between multiple chats (#2343)

* prevent load progress from getting out of sync with the current chat
* fix memory leak on exit if the LLModelStore contains a model
* do not report cancellation as a failure in console/Mixpanel
* show "waiting for model" separately from "switching context" in UI
* do not show lower "reload" button on error
* skip context switch if unload is pending
* skip unnecessary calls to LLModel::saveState

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
											
										
										
											2024-05-15 14:07:03 -04:00
+								    void trySwitchContextOfLoadedModel(const ModelInfo &modelInfo);
-												Modellist temp

											
										
										
											2023-06-22 15:44:49 -04:00
+								    bool loadModel(const ModelInfo &modelInfo);
 								    void modelChangeRequested(const ModelInfo &modelInfo);
-												First attempt at providing a persistent chat list experience.

Limitations:

1) Context is not restored for gpt-j models
2) When you switch between different model types in an existing chat
   the context and all the conversation is lost
3) The settings are not chat or conversation specific
4) The sizes of the chat persisted files are very large due to how much
   data the llama.cpp backend tries to persist. Need to investigate how
   we can shrink this.

											
										
										
											2023-05-04 15:31:41 -04:00
+								    void unloadModel();
-												Much better memory mgmt for multi-threaded model loading/unloading.

											
										
										
											2023-05-13 19:05:35 -04:00
+								    void reloadModel();
-												Generate names via llm.

											
										
										
											2023-05-02 11:19:17 -04:00
+								    void generateName();
-												chat: generate follow-up questions after response (#2634)

* user can configure the prompt and when they appear
* also make the name generation prompt configurable

Signed-off-by: Adam Treat <treat.adam@gmail.com>
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
											
										
										
											2024-07-10 15:45:20 -04:00
+								    void generateQuestions(qint64 elapsed);
-												Start working on more thread safety and model load error handling.

											
										
										
											2023-06-19 19:51:28 -04:00
+								    void handleChatIdChanged(const QString &id);
-												Much better memory mgmt for multi-threaded model loading/unloading.

											
										
										
											2023-05-13 19:05:35 -04:00
+								    void handleShouldBeLoadedChanged();
-												Show token generation speed in gui. (#1020)


											
										
										
											2023-06-19 14:34:53 -04:00
+								    void handleThreadStarted();
-												Enable the force metal setting.

											
										
										
											2023-06-27 11:54:34 -04:00
+								    void handleForceMetalChanged(bool forceMetal);
-												Bring the vulkan backend to the GUI.

											
										
										
											2023-09-13 10:32:08 -04:00
+								    void handleDeviceChanged();
-												Huge change that completely revamps the settings dialog and implements
per model settings as well as the ability to clone a model into a "character."
This also implements system prompts as well as quite a few bugfixes; for
instance, this fixes chatgpt.

											
										
										
											2023-07-01 11:34:21 -04:00
+								    void processSystemPrompt();
-												Restore state from text if necessary.

											
										
										
											2023-10-10 16:43:02 -04:00
+								    void processRestoreStateFromText();
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 09:10:05 -04:00
 								Q_SIGNALS:
-												chat: faster KV shift, continue generating, fix stop sequences (#2781)

* Don't stop generating at end of context
* Use llama_kv_cache ops to shift context
* Fix and improve reverse prompt detection
* Replace prompt recalc callback with a flag to disallow context shift
											
										
										
											2024-08-07 11:25:24 -04:00
+								    void restoringFromTextChanged();
-												chat: fix blank device in UI and improve Mixpanel reporting (#2409)

Also remove LLModel::hasGPUDevice.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
											
										
										
											2024-06-26 15:26:27 -04:00
+								    void loadedModelInfoChanged();
-												Complete revamp of model loading to allow for more discrete control by
the user of the model's loading behavior.

Signed-off-by: Adam Treat <treat.adam@gmail.com>

											
										
										
											2024-02-07 09:37:59 -05:00
+								    void modelLoadingPercentageChanged(float);
-												Gracefully handle when we have a previous chat where the model that it used has gone away.

											
										
										
											2023-05-08 20:51:03 -04:00
+								    void modelLoadingError(const QString &error);
-												chat: implement display of model loading warnings (#2034)

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
											
										
										
											2024-03-06 17:14:54 -05:00
+								    void modelLoadingWarning(const QString &warning);
-												Get rid of last blocking operations and make the chat/llm thread safe.

											
										
										
											2023-06-20 16:14:30 -04:00
+								    void responseChanged(const QString &response);
-												Add prompt processing and localdocs to the busy indicator in UI.

											
										
										
											2023-05-20 20:04:36 -04:00
+								    void promptProcessing();
-												chat: generate follow-up questions after response (#2634)

* user can configure the prompt and when they appear
* also make the name generation prompt configurable

Signed-off-by: Adam Treat <treat.adam@gmail.com>
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
											
										
										
											2024-07-10 15:45:20 -04:00
+								    void generatingQuestions();
-												improve mixpanel usage statistics (#2238)

Other changes:
- Always display first start dialog if privacy options are unset (e.g. if the user closed GPT4All without selecting them)
- LocalDocs scanQueue is now always deferred
- Fix a potential crash in magic_match
- LocalDocs indexing is now started after the first start dialog is dismissed so usage stats are included

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
											
										
										
											2024-04-25 13:16:52 -04:00
+								    void responseStopped(qint64 promptResponseMs);
-												Get rid of last blocking operations and make the chat/llm thread safe.

											
										
										
											2023-06-20 16:14:30 -04:00
+								    void generatedNameChanged(const QString &name);
-												chat: generate follow-up questions after response (#2634)

* user can configure the prompt and when they appear
* also make the name generation prompt configurable

Signed-off-by: Adam Treat <treat.adam@gmail.com>
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
											
										
										
											2024-07-10 15:45:20 -04:00
+								    void generatedQuestionFinished(const QString &generatedQuestion);
-												First attempt at providing a persistent chat list experience.

Limitations:

1) Context is not restored for gpt-j models
2) When you switch between different model types in an existing chat
   the context and all the conversation is lost
3) The settings are not chat or conversation specific
4) The sizes of the chat persisted files are very large due to how much
   data the llama.cpp backend tries to persist. Need to investigate how
   we can shrink this.

											
										
										
											2023-05-04 15:31:41 -04:00
+								    void stateChanged();
-												httpserver

											
										
										
											2023-05-11 16:46:25 -04:00
+								    void threadStarted();
-												Much better memory mgmt for multi-threaded model loading/unloading.

											
										
										
											2023-05-13 19:05:35 -04:00
+								    void shouldBeLoadedChanged();
-												chat: fix issues with quickly switching between multiple chats (#2343)

* prevent load progress from getting out of sync with the current chat
* fix memory leak on exit if the LLModelStore contains a model
* do not report cancellation as a failure in console/Mixpanel
* show "waiting for model" separately from "switching context" in UI
* do not show lower "reload" button on error
* skip context switch if unload is pending
* skip unnecessary calls to LLModel::saveState

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
											
										
										
											2024-05-15 14:07:03 -04:00
+								    void trySwitchContextRequested(const ModelInfo &modelInfo);
 								    void trySwitchContextOfLoadedModelCompleted(int value);
-												Make localdocs work with server mode.

											
										
										
											2023-06-01 14:13:12 -04:00
+								    void requestRetrieveFromDB(const QList<QString> &collections, const QString &text, int retrievalSize, QList<ResultInfo> *results);
-												Show token generation speed in gui. (#1020)


											
										
										
											2023-06-19 14:34:53 -04:00
+								    void reportSpeed(const QString &speed);
-												Report the actual device we're using.

											
										
										
											2023-09-14 08:25:37 -04:00
+								    void reportDevice(const QString &device);
-												chat: report reason for fallback to CPU

											
										
										
											2023-09-29 14:25:37 -04:00
+								    void reportFallbackReason(const QString &fallbackReason);
-												Don't store db results in ChatLLM.

											
										
										
											2023-06-19 18:23:54 -04:00
+								    void databaseResultsChanged(const QList<ResultInfo>&);
-												Modellist temp

											
										
										
											2023-06-22 15:44:49 -04:00
+								    void modelInfoChanged(const ModelInfo &modelInfo);
-												httpserver

											
										
										
											2023-05-11 16:46:25 -04:00
 								protected:
-												Huge change that completely revamps the settings dialog and implements
per model settings as well as the ability to clone a model into a "character."
This also implements system prompts as well as quite a few bugfixes; for
instance, this fixes chatgpt.

											
										
										
											2023-07-01 11:34:21 -04:00
+								    bool promptInternal(const QList<QString> &collectionList, const QString &prompt, const QString &promptTemplate,
-												add min_p sampling parameter (#2014)

Signed-off-by: Christopher Barrera <cb@arda.tx.rr.com>
Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>
											
										
										
											2024-02-24 17:51:34 -05:00
+								        int32_t n_predict, int32_t top_k, float top_p, float min_p, float temp, int32_t n_batch, float repeat_penalty,
-												server: improve correctness of request parsing and responses (#2929)

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
											
										
										
											2024-09-09 10:48:57 -04:00
+								        int32_t repeat_penalty_tokens, std::optional<QString> fakeReply = {});
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 09:10:05 -04:00
+								    bool handlePrompt(int32_t token);
 								    bool handleResponse(int32_t token, const std::string &response);
-												Generate names via llm.

											
										
										
											2023-05-02 11:19:17 -04:00
+								    bool handleNamePrompt(int32_t token);
 								    bool handleNameResponse(int32_t token, const std::string &response);
-												Huge change that completely revamps the settings dialog and implements
per model settings as well as the ability to clone a model into a "character."
This also implements system prompts as well as quite a few bugfixes; for
instance, this fixes chatgpt.

											
										
										
											2023-07-01 11:34:21 -04:00
+								    bool handleSystemPrompt(int32_t token);
 								    bool handleSystemResponse(int32_t token, const std::string &response);
-												Restore state from text if necessary.

											
										
										
											2023-10-10 16:43:02 -04:00
+								    bool handleRestoreStateFromTextPrompt(int32_t token);
 								    bool handleRestoreStateFromTextResponse(int32_t token, const std::string &response);
-												chat: generate follow-up questions after response (#2634)

* user can configure the prompt and when they appear
* also make the name generation prompt configurable

Signed-off-by: Adam Treat <treat.adam@gmail.com>
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
											
										
										
											2024-07-10 15:45:20 -04:00
+								    bool handleQuestionPrompt(int32_t token);
 								    bool handleQuestionResponse(int32_t token, const std::string &response);
-												First attempt at providing a persistent chat list experience.

Limitations:

1) Context is not restored for gpt-j models
2) When you switch between different model types in an existing chat
   the context and all the conversation is lost
3) The settings are not chat or conversation specific
4) The sizes of the chat persisted files are very large due to how much
   data the llama.cpp backend tries to persist. Need to investigate how
   we can shrink this.

											
										
										
											2023-05-04 15:31:41 -04:00
+								    void saveState();
 								    void restoreState();
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 09:10:05 -04:00
-												The server has different lifetime mgmt than the other chats.

											
										
										
											2023-05-13 19:33:19 -04:00
+								protected:
 								    LLModel::PromptContext m_ctx;
 								    quint32 m_promptTokens;
 								    quint32 m_promptResponseTokens;
-												Start working on more thread safety and model load error handling.

											
										
										
											2023-06-19 19:51:28 -04:00
 								private:
-												UI and embedding device changes for GPT4All v3.0.0-rc3 (#2477)

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
											
										
										
											2024-06-28 12:57:57 -04:00
+								    bool loadNewModel(const ModelInfo &modelInfo, QVariantMap &modelLoadProps);
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 09:10:05 -04:00
+								    std::string m_response;
-												server: improve correctness of request parsing and responses (#2929)

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
											
										
										
											2024-09-09 10:48:57 -04:00
+								    std::string m_trimmedResponse;
-												Generate names via llm.

											
										
										
											2023-05-02 11:19:17 -04:00
+								    std::string m_nameResponse;
-												chat: generate follow-up questions after response (#2634)

* user can configure the prompt and when they appear
* also make the name generation prompt configurable

Signed-off-by: Adam Treat <treat.adam@gmail.com>
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
											
										
										
											2024-07-10 15:45:20 -04:00
+								    QString m_questionResponse;
-												Modellist temp

											
										
										
											2023-06-22 15:44:49 -04:00
+								    LLModelInfo m_llModelInfo;
 								    LLModelType m_llModelType;
 								    ModelInfo m_modelInfo;
-												Show token generation speed in gui. (#1020)


											
										
										
											2023-06-19 14:34:53 -04:00
+								    TokenTimer *m_timer;
-												First attempt at providing a persistent chat list experience.

Limitations:

1) Context is not restored for gpt-j models
2) When you switch between different model types in an existing chat
   the context and all the conversation is lost
3) The settings are not chat or conversation specific
4) The sizes of the chat persisted files are very large due to how much
   data the llama.cpp backend tries to persist. Need to investigate how
   we can shrink this.

											
										
										
											2023-05-04 15:31:41 -04:00
+								    QByteArray m_state;
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 09:10:05 -04:00
+								    QThread m_llmThread;
 								    std::atomic<bool> m_stopGenerating;
-												Much better memory mgmt for multi-threaded model loading/unloading.

											
										
										
											2023-05-13 19:05:35 -04:00
+								    std::atomic<bool> m_shouldBeLoaded;
-												chat: faster KV shift, continue generating, fix stop sequences (#2781)

* Don't stop generating at end of context
* Use llama_kv_cache ops to shift context
* Fix and improve reverse prompt detection
* Replace prompt recalc callback with a flag to disallow context shift
											
										
										
											2024-08-07 11:25:24 -04:00
+								    std::atomic<bool> m_restoringFromText; // status indication
-												Complete revamp of model loading to allow for more discrete control by
the user of the model's loading behavior.

Signed-off-by: Adam Treat <treat.adam@gmail.com>

											
										
										
											2024-02-07 09:37:59 -05:00
+								    std::atomic<bool> m_forceUnloadModel;
-												Fix for issue #2080 where the GUI appears to hang when a chat with a large
model is deleted. There is no reason to save the context for a chat that
is being deleted.

Signed-off-by: Adam Treat <treat.adam@gmail.com>

											
										
										
											2024-03-06 12:59:34 -05:00
+								    std::atomic<bool> m_markedForDeletion;
-												The server has different lifetime mgmt than the other chats.

											
										
										
											2023-05-13 19:33:19 -04:00
+								    bool m_isServer;
-												Enable the force metal setting.

											
										
										
											2023-06-27 11:54:34 -04:00
+								    bool m_forceMetal;
 								    bool m_reloadingToChangeVariant;
-												Huge change that completely revamps the settings dialog and implements
per model settings as well as the ability to clone a model into a "character."
This also implements system prompts as well as quite a few bugfixes; for
instance, this fixes chatgpt.

											
										
										
											2023-07-01 11:34:21 -04:00
+								    bool m_processedSystemPrompt;
-												Restore state from text if necessary.

											
										
										
											2023-10-10 16:43:02 -04:00
+								    bool m_restoreStateFromText;
-												chat: fix issues with quickly switching between multiple chats (#2343)

* prevent load progress from getting out of sync with the current chat
* fix memory leak on exit if the LLModelStore contains a model
* do not report cancellation as a failure in console/Mixpanel
* show "waiting for model" separately from "switching context" in UI
* do not show lower "reload" button on error
* skip context switch if unload is pending
* skip unnecessary calls to LLModel::saveState

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
											
										
										
											2024-05-15 14:07:03 -04:00
+								    // m_pristineLoadedState is set if saveState is unnecessary, either because:
 								    // - an unload was queued during LLModel::restoreState()
 								    // - the chat will be restored from text and hasn't been interacted with yet
 								    bool m_pristineLoadedState = false;
-												Restore state from text if necessary.

											
										
										
											2023-10-10 16:43:02 -04:00
+								    QVector<QPair<QString, QString>> m_stateFromText;
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 09:10:05 -04:00
+								};
 								#endif // CHATLLM_H