gpt4all/gpt4all-chat/chatllm.h

#ifndef CHATLLM_H
#define CHATLLM_H

#include <QObject>
#include <QThread>
#include <QFileInfo>

#include "localdocs.h"
#include "modellist.h"
#include "../gpt4all-backend/llmodel.h"

// Tag for the kind of model in use; ChatLLM keeps one of these in m_llModelType.
// The numeric values are written out explicitly and are the same ones the compiler
// would assign implicitly (0..4).
// NOTE(review): if these values are persisted anywhere, they must not be renumbered — confirm
// against the serialization code.
enum LLModelType {
    MPT_     = 0,
    GPTJ_    = 1,
    LLAMA_   = 2,
    CHATGPT_ = 3,
    BERT_    = 4,
};

// Bundles a model pointer with a QFileInfo.
// NOTE(review): fileInfo presumably refers to the model file on disk — confirm at the call sites.
struct LLModelInfo {
    // Intentionally missing from this struct: the model's type and name. ChatLLM owns those,
    // because it has to be able to serialize them even while the model is unloaded.
    LLModel *model{nullptr}; // null by default
    QFileInfo fileInfo;
};

// Counts tokens and periodically reports throughput as a "<rate> tokens/sec" string.
//
// Usage as visible here: start() resets the counters, inc() is called once per token and
// emits a report whenever more than 999 ms have passed since the last one, and stop()
// flushes a final report.
class TokenTimer : public QObject {
    Q_OBJECT
public:
    // Both counters start at zero so that inc()/stop() are well-defined even if start()
    // was never called.
    explicit TokenTimer(QObject *parent)
        : QObject(parent)
        , m_elapsed(0)
        , m_tokens(0) {}

    // Returns the rounded running mean after folding the n-th sample into oldAvg.
    //   oldAvg    - average of the previous n-1 samples
    //   newNumber - the n-th sample
    //   n         - 1-based count of samples including newNumber
    static int rollingAverage(int oldAvg, int newNumber, int n)
    {
        // There is no meaningful history for n < 1, and n == 0 would divide by zero and
        // feed inf/NaN to qRound; treat the new sample as the average in that case.
        if (n < 1)
            return newNumber;

        // i.e. to calculate the new average after the nth number,
        // you multiply the old average by n−1, add the new number, and divide the total by n.
        return qRound(((float(oldAvg) * (n - 1)) + newNumber) / float(n));
    }

    // Resets the counters and invalidates the clock; the clock is started by the next inc().
    void start() { m_tokens = 0; m_elapsed = 0; m_time.invalidate(); }

    // Flushes a final report (a no-op if nothing has been measured, see handleTimeout()).
    void stop() { handleTimeout(); }

    // Records one token and reports once more than 999 ms have accumulated on the clock.
    void inc() {
        if (!m_time.isValid())
            m_time.start();
        ++m_tokens;
        if (m_time.elapsed() > 999)
            handleTimeout();
    }

Q_SIGNALS:
    // Emitted with a formatted rate, e.g. "12 tokens/sec".
    void report(const QString &speed);

private Q_SLOTS:
    // Adds the time since the last report to the running total and emits the overall rate.
    void handleTimeout()
    {
        // The clock is only valid after the first inc() following start(). Calling
        // restart() on an invalid QElapsedTimer is undefined, so there is nothing to report.
        if (!m_time.isValid())
            return;

        m_elapsed += m_time.restart();

        // Less than a millisecond measured so far: the rate would be a division by zero
        // ("inf"/"nan tokens/sec"), so wait for a later call instead.
        if (m_elapsed <= 0)
            return;

        emit report(QString("%1 tokens/sec").arg(m_tokens / float(m_elapsed / 1000.0f), 0, 'g', 2));
    }

private:
    QElapsedTimer m_time; // time since the last report; invalid until the first inc()
    qint64 m_elapsed;     // total milliseconds accumulated since start()
    quint32 m_tokens;     // tokens counted since start()
};

class Chat;
// QObject that declares the API for loading/unloading a model, prompting it, and reporting
// progress through Qt signals. Only declarations are visible in this header; the behavior
// described below is limited to what the declarations and inline bodies show.
// NOTE(review): the Chat* constructor argument and the m_llmThread member suggest one
// instance per chat running on its own thread — confirm in the implementation file.
class ChatLLM : public QObject
{
    Q_OBJECT
    // Read-only property backed by isRecalc(); its change signal is recalcChanged().
    Q_PROPERTY(bool isRecalc READ isRecalc NOTIFY recalcChanged)
public:
    // isServer defaults to false.
    // NOTE(review): presumably stored in m_isServer — confirm in the implementation.
    ChatLLM(Chat *parent, bool isServer = false);
    virtual ~ChatLLM();

    bool isModelLoaded() const;
    void regenerateResponse();
    void resetResponse();
    void resetContext();

    // Sets the atomic stop flag.
    // NOTE(review): presumably polled by the generation callbacks — confirm.
    void stopGenerating() { m_stopGenerating = true; }

    // Reads the atomic m_shouldBeLoaded flag.
    bool shouldBeLoaded() const { return m_shouldBeLoaded; }
    void setShouldBeLoaded(bool b);

    QString response() const;

    ModelInfo modelInfo() const;
    void setModelInfo(const ModelInfo &info);

    // Reads the atomic m_isRecalc flag (the READ accessor of the isRecalc property).
    bool isRecalc() const { return m_isRecalc; }

    // Returns m_nameResponse converted to a QString.
    // NOTE(review): m_nameResponse is a plain std::string; if it is written on another thread
    // this read is unsynchronized — confirm.
    QString generatedName() const { return QString::fromStdString(m_nameResponse); }

    // Both take the stream and a format version and return a bool.
    // NOTE(review): presumably true on success — confirm in the implementation.
    bool serialize(QDataStream &stream, int version);
    bool deserialize(QDataStream &stream, int version);

public Q_SLOTS:
    // Declared as slots so they can be invoked through signal/slot connections.
    bool prompt(const QList<QString> &collectionList, const QString &prompt);
    bool loadDefaultModel();
    bool loadModel(const ModelInfo &modelInfo);
    void modelChangeRequested(const ModelInfo &modelInfo);
    void forceUnloadModel();
    void unloadModel();
    void reloadModel();
    void generateName();
    void handleChatIdChanged(const QString &id);
    void handleShouldBeLoadedChanged();
    void handleThreadStarted();
    void handleForceMetalChanged(bool forceMetal);
    void handleDeviceChanged();
    void processSystemPrompt();

Q_SIGNALS:
    void recalcChanged(); // NOTIFY signal of the isRecalc property
    void isModelLoadedChanged(bool);
    void modelLoadingError(const QString &error);
    void responseChanged(const QString &response);
    void promptProcessing();
    void responseStopped();
    void sendStartup();
    void sendModelLoaded();
    void generatedNameChanged(const QString &name);
    void stateChanged();
    void threadStarted();
    void shouldBeLoadedChanged();
    // `results` is a raw pointer to a list.
    // NOTE(review): presumably an out-parameter filled by the receiver, which would require a
    // blocking connection — confirm.
    void requestRetrieveFromDB(const QList<QString> &collections, const QString &text, int retrievalSize, QList<ResultInfo> *results);
    void reportSpeed(const QString &speed);
    void reportDevice(const QString &device);
    void reportFallbackReason(const QString &fallbackReason);
    void databaseResultsChanged(const QList<ResultInfo>&);
    void modelInfoChanged(const ModelInfo &modelInfo);

protected:
    // Prompt entry point that takes the sampling parameters explicitly.
    bool promptInternal(const QList<QString> &collectionList, const QString &prompt, const QString &promptTemplate,
        int32_t n_predict, int32_t top_k, float top_p, float temp, int32_t n_batch, float repeat_penalty,
        int32_t repeat_penalty_tokens);
    // Three families of token callbacks (regular, name, system), each with a
    // prompt/response/recalculate variant.
    // NOTE(review): the bool result presumably means "keep going" — confirm against the
    // backend's callback contract.
    bool handlePrompt(int32_t token);
    bool handleResponse(int32_t token, const std::string &response);
    bool handleRecalculate(bool isRecalc);
    bool handleNamePrompt(int32_t token);
    bool handleNameResponse(int32_t token, const std::string &response);
    bool handleNameRecalculate(bool isRecalc);
    bool handleSystemPrompt(int32_t token);
    bool handleSystemResponse(int32_t token, const std::string &response);
    bool handleSystemRecalculate(bool isRecalc);
    // NOTE(review): presumably save to / restore from m_state — confirm.
    void saveState();
    void restoreState();

protected:
    // Accessible to subclasses.
    LLModel::PromptContext m_ctx;
    quint32 m_promptTokens;
    quint32 m_promptResponseTokens;

private:
    std::string m_response;
    std::string m_nameResponse;       // source of generatedName()
    LLModelInfo m_llModelInfo;
    LLModelType m_llModelType;
    ModelInfo m_modelInfo;
    TokenTimer *m_timer;              // raw pointer; NOTE(review): presumably parented to this object — confirm
    QByteArray m_state;
    QThread m_llmThread;
    std::atomic<bool> m_stopGenerating; // set by stopGenerating()
    std::atomic<bool> m_shouldBeLoaded; // read by shouldBeLoaded()
    std::atomic<bool> m_isRecalc;       // read by isRecalc()
    bool m_isServer;
    bool m_forceMetal;
    bool m_reloadingToChangeVariant;
    bool m_processedSystemPrompt;
};

#endif // CHATLLM_H
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 13:10:05 +00:00
+								#ifndef CHATLLM_H
 								#define CHATLLM_H
 								#include <QObject>
 								#include <QThread>
-												Much better memory mgmt for multi-threaded model loading/unloading.

											
										
										
											2023-05-13 23:05:35 +00:00
+								#include <QFileInfo>
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 13:10:05 +00:00
-												Make localdocs work with server mode.

											
										
										
											2023-06-01 18:13:12 +00:00
+								#include "localdocs.h"
-												Modellist temp

											
										
										
											2023-06-22 19:44:49 +00:00
+								#include "modellist.h"
-												Move the llmodel C API to new top-level directory and version it.

											
										
										
											2023-05-10 15:46:40 +00:00
+								#include "../gpt4all-backend/llmodel.h"
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 13:10:05 +00:00
-												Much better memory mgmt for multi-threaded model loading/unloading.

											
										
										
											2023-05-13 23:05:35 +00:00
+								enum LLModelType {
 								    MPT_,
 								    GPTJ_,
-												Preliminary support for chatgpt models.

											
										
										
											2023-05-15 00:12:15 +00:00
+								    LLAMA_,
 								    CHATGPT_,
-												Add starcoder support.

											
										
										
											2023-07-06 20:34:04 +00:00
+								    BERT_,
-												Much better memory mgmt for multi-threaded model loading/unloading.

											
										
										
											2023-05-13 23:05:35 +00:00
+								};
 								struct LLModelInfo {
 								    LLModel *model = nullptr;
 								    QFileInfo fileInfo;
 								    // NOTE: This does not store the model type or name on purpose as this is left for ChatLLM which
 								    // must be able to serialize the information even if it is in the unloaded state
 								};
-												Show token generation speed in gui. (#1020)


											
										
										
											2023-06-19 18:34:53 +00:00
+								class TokenTimer : public QObject {
 								    Q_OBJECT
 								public:
 								    explicit TokenTimer(QObject *parent)
 								        : QObject(parent)
 								        , m_elapsed(0) {}
 								    static int rollingAverage(int oldAvg, int newNumber, int n)
 								    {
 								        // i.e. to calculate the new average after then nth number,
 								        // you multiply the old average by n−1, add the new number, and divide the total by n.
 								        return qRound(((float(oldAvg) * (n - 1)) + newNumber) / float(n));
 								    }
 								    void start() { m_tokens = 0; m_elapsed = 0; m_time.invalidate(); }
 								    void stop() { handleTimeout(); }
 								    void inc() {
 								        if (!m_time.isValid())
 								            m_time.start();
 								        ++m_tokens;
 								        if (m_time.elapsed() > 999)
 								            handleTimeout();
 								    }
 								Q_SIGNALS:
 								    void report(const QString &speed);
 								private Q_SLOTS:
 								    void handleTimeout()
 								    {
 								        m_elapsed += m_time.restart();
 								        emit report(QString("%1 tokens/sec").arg(m_tokens / float(m_elapsed / 1000.0f), 0, 'g', 2));
 								    }
 								private:
 								    QElapsedTimer m_time;
 								    qint64 m_elapsed;
 								    quint32 m_tokens;
 								};
-												First attempt at providing a persistent chat list experience.

Limitations:

1) Context is not restored for gpt-j models
2) When you switch between different model types in an existing chat
   the context and all the conversation is lost
3) The settings are not chat or conversation specific
4) The sizes of the chat persisted files are very large due to how much
   data the llama.cpp backend tries to persist. Need to investigate how
   we can shrink this.

											
										
										
											2023-05-04 19:31:41 +00:00
+								class Chat;
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 13:10:05 +00:00
+								class ChatLLM : public QObject
 								{
 								    Q_OBJECT
 								    Q_PROPERTY(bool isRecalc READ isRecalc NOTIFY recalcChanged)
 								public:
-												The server has different lifetime mgmt than the other chats.

											
										
										
											2023-05-13 23:33:19 +00:00
+								    ChatLLM(Chat *parent, bool isServer = false);
-												Cleanup the chatllm properly.

											
										
										
											2023-05-12 18:06:03 +00:00
+								    virtual ~ChatLLM();
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 13:10:05 +00:00
 								    bool isModelLoaded() const;
 								    void regenerateResponse();
 								    void resetResponse();
 								    void resetContext();
 								    void stopGenerating() { m_stopGenerating = true; }
-												Much better memory mgmt for multi-threaded model loading/unloading.

											
										
										
											2023-05-13 23:05:35 +00:00
+								    bool shouldBeLoaded() const { return m_shouldBeLoaded; }
 								    void setShouldBeLoaded(bool b);
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 13:10:05 +00:00
+								    QString response() const;
-												Modellist temp

											
										
										
											2023-06-22 19:44:49 +00:00
+								    ModelInfo modelInfo() const;
 								    void setModelInfo(const ModelInfo &info);
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 13:10:05 +00:00
 								    bool isRecalc() const { return m_isRecalc; }
-												Generate names via llm.

											
										
										
											2023-05-02 15:19:17 +00:00
+								    QString generatedName() const { return QString::fromStdString(m_nameResponse); }
-												Convert the old format properly.

											
										
										
											2023-05-08 09:52:57 +00:00
+								    bool serialize(QDataStream &stream, int version);
 								    bool deserialize(QDataStream &stream, int version);
-												First attempt at providing a persistent chat list experience.

Limitations:

1) Context is not restored for gpt-j models
2) When you switch between different model types in an existing chat
   the context and all the conversation is lost
3) The settings are not chat or conversation specific
4) The sizes of the chat persisted files are very large due to how much
   data the llama.cpp backend tries to persist. Need to investigate how
   we can shrink this.

											
										
										
											2023-05-04 19:31:41 +00:00
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 13:10:05 +00:00
+								public Q_SLOTS:
-												Huge change that completely revamps the settings dialog and implements
per model settings as well as the ability to clone a model into a "character."
This also implements system prompts as well as quite a few bugfixes for
instance this fixes chatgpt.

											
										
										
											2023-07-01 15:34:21 +00:00
+								    bool prompt(const QList<QString> &collectionList, const QString &prompt);
-												First attempt at providing a persistent chat list experience.

Limitations:

1) Context is not restored for gpt-j models
2) When you switch between different model types in an existing chat
   the context and all the conversation is lost
3) The settings are not chat or conversation specific
4) The sizes of the chat persisted files are very large due to how much
   data the llama.cpp backend tries to persist. Need to investigate how
   we can shrink this.

											
										
										
											2023-05-04 19:31:41 +00:00
+								    bool loadDefaultModel();
-												Modellist temp

											
										
										
											2023-06-22 19:44:49 +00:00
+								    bool loadModel(const ModelInfo &modelInfo);
 								    void modelChangeRequested(const ModelInfo &modelInfo);
-												Much better memory mgmt for multi-threaded model loading/unloading.

											
										
										
											2023-05-13 23:05:35 +00:00
+								    void forceUnloadModel();
-												First attempt at providing a persistent chat list experience.

Limitations:

1) Context is not restored for gpt-j models
2) When you switch between different model types in an existing chat
   the context and all the conversation is lost
3) The settings are not chat or conversation specific
4) The sizes of the chat persisted files are very large due to how much
   data the llama.cpp backend tries to persist. Need to investigate how
   we can shrink this.

											
										
										
											2023-05-04 19:31:41 +00:00
+								    void unloadModel();
-												Much better memory mgmt for multi-threaded model loading/unloading.

											
										
										
											2023-05-13 23:05:35 +00:00
+								    void reloadModel();
-												Generate names via llm.

											
										
										
											2023-05-02 15:19:17 +00:00
+								    void generateName();
-												Start working on more thread safety and model load error handling.

											
										
										
											2023-06-19 23:51:28 +00:00
+								    void handleChatIdChanged(const QString &id);
-												Much better memory mgmt for multi-threaded model loading/unloading.

											
										
										
											2023-05-13 23:05:35 +00:00
+								    void handleShouldBeLoadedChanged();
-												Show token generation speed in gui. (#1020)


											
										
										
											2023-06-19 18:34:53 +00:00
+								    void handleThreadStarted();
-												Enable the force metal setting.

											
										
										
											2023-06-27 15:54:34 +00:00
+								    void handleForceMetalChanged(bool forceMetal);
-												Bring the vulkan backend to the GUI.

											
										
										
											2023-09-13 14:32:08 +00:00
+								    void handleDeviceChanged();
-												Huge change that completely revamps the settings dialog and implements
per model settings as well as the ability to clone a model into a "character."
This also implements system prompts as well as quite a few bugfixes for
instance this fixes chatgpt.

											
										
										
											2023-07-01 15:34:21 +00:00
+								    void processSystemPrompt();
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 13:10:05 +00:00
 								Q_SIGNALS:
-												Modellist temp

											
										
										
											2023-06-22 19:44:49 +00:00
+								    void recalcChanged();
-												Get rid of last blocking operations and make the chat/llm thread safe.

											
										
										
											2023-06-20 20:14:30 +00:00
+								    void isModelLoadedChanged(bool);
-												Gracefully handle when we have a previous chat where the model that it used has gone away.

											
										
										
											2023-05-09 00:51:03 +00:00
+								    void modelLoadingError(const QString &error);
-												Get rid of last blocking operations and make the chat/llm thread safe.

											
										
										
											2023-06-20 20:14:30 +00:00
+								    void responseChanged(const QString &response);
-												Add prompt processing and localdocs to the busy indicator in UI.

											
										
										
											2023-05-21 00:04:36 +00:00
+								    void promptProcessing();
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 13:10:05 +00:00
+								    void responseStopped();
 								    void sendStartup();
 								    void sendModelLoaded();
-												Get rid of last blocking operations and make the chat/llm thread safe.

											
										
										
											2023-06-20 20:14:30 +00:00
+								    void generatedNameChanged(const QString &name);
-												First attempt at providing a persistent chat list experience.

Limitations:

1) Context is not restored for gpt-j models
2) When you switch between different model types in an existing chat
   the context and all the conversation is lost
3) The settings are not chat or conversation specific
4) The sizes of the chat persisted files are very large due to how much
   data the llama.cpp backend tries to persist. Need to investigate how
   we can shrink this.

											
										
										
											2023-05-04 19:31:41 +00:00
+								    void stateChanged();
-												httpserver

											
										
										
											2023-05-11 20:46:25 +00:00
+								    void threadStarted();
-												Much better memory mgmt for multi-threaded model loading/unloading.

											
										
										
											2023-05-13 23:05:35 +00:00
+								    void shouldBeLoadedChanged();
-												Make localdocs work with server mode.

											
										
										
											2023-06-01 18:13:12 +00:00
+								    void requestRetrieveFromDB(const QList<QString> &collections, const QString &text, int retrievalSize, QList<ResultInfo> *results);
-												Show token generation speed in gui. (#1020)


											
										
										
											2023-06-19 18:34:53 +00:00
+								    void reportSpeed(const QString &speed);
-												Report the actual device we're using.

											
										
										
											2023-09-14 12:25:37 +00:00
+								    void reportDevice(const QString &device);
-												chat: report reason for fallback to CPU

											
										
										
											2023-09-29 18:25:37 +00:00
+								    void reportFallbackReason(const QString &fallbackReason);
-												Don't store db results in ChatLLM.

											
										
										
											2023-06-19 22:23:54 +00:00
+								    void databaseResultsChanged(const QList<ResultInfo>&);
-												Modellist temp

											
										
										
											2023-06-22 19:44:49 +00:00
+								    void modelInfoChanged(const ModelInfo &modelInfo);
-												httpserver

											
										
										
											2023-05-11 20:46:25 +00:00
 								protected:
-												Huge change that completely revamps the settings dialog and implements
per model settings as well as the ability to clone a model into a "character."
This also implements system prompts as well as quite a few bugfixes for
instance this fixes chatgpt.

											
										
										
											2023-07-01 15:34:21 +00:00
+								    bool promptInternal(const QList<QString> &collectionList, const QString &prompt, const QString &promptTemplate,
 								        int32_t n_predict, int32_t top_k, float top_p, float temp, int32_t n_batch, float repeat_penalty,
 								        int32_t repeat_penalty_tokens);
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 13:10:05 +00:00
+								    bool handlePrompt(int32_t token);
 								    bool handleResponse(int32_t token, const std::string &response);
 								    bool handleRecalculate(bool isRecalc);
-												Generate names via llm.

											
										
										
											2023-05-02 15:19:17 +00:00
+								    bool handleNamePrompt(int32_t token);
 								    bool handleNameResponse(int32_t token, const std::string &response);
 								    bool handleNameRecalculate(bool isRecalc);
-												Huge change that completely revamps the settings dialog and implements
per model settings as well as the ability to clone a model into a "character."
This also implements system prompts as well as quite a few bugfixes for
instance this fixes chatgpt.

											
										
										
											2023-07-01 15:34:21 +00:00
+								    bool handleSystemPrompt(int32_t token);
 								    bool handleSystemResponse(int32_t token, const std::string &response);
 								    bool handleSystemRecalculate(bool isRecalc);
-												First attempt at providing a persistent chat list experience.

Limitations:

1) Context is not restored for gpt-j models
2) When you switch between different model types in an existing chat
   the context and all the conversation is lost
3) The settings are not chat or conversation specific
4) The sizes of the chat persisted files are very large due to how much
   data the llama.cpp backend tries to persist. Need to investigate how
   we can shrink this.

											
										
										
											2023-05-04 19:31:41 +00:00
+								    void saveState();
 								    void restoreState();
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 13:10:05 +00:00
-												The server has different lifetime mgmt than the other chats.

											
										
										
											2023-05-13 23:33:19 +00:00
+								protected:
 								    LLModel::PromptContext m_ctx;
 								    quint32 m_promptTokens;
 								    quint32 m_promptResponseTokens;
-												Start working on more thread safety and model load error handling.

											
										
										
											2023-06-19 23:51:28 +00:00
 								private:
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 13:10:05 +00:00
+								    std::string m_response;
-												Generate names via llm.

											
										
										
											2023-05-02 15:19:17 +00:00
+								    std::string m_nameResponse;
-												Modellist temp

											
										
										
											2023-06-22 19:44:49 +00:00
+								    LLModelInfo m_llModelInfo;
 								    LLModelType m_llModelType;
 								    ModelInfo m_modelInfo;
-												Show token generation speed in gui. (#1020)


											
										
										
											2023-06-19 18:34:53 +00:00
+								    TokenTimer *m_timer;
-												First attempt at providing a persistent chat list experience.

Limitations:

1) Context is not restored for gpt-j models
2) When you switch between different model types in an existing chat
   the context and all the conversation is lost
3) The settings are not chat or conversation specific
4) The sizes of the chat persisted files are very large due to how much
   data the llama.cpp backend tries to persist. Need to investigate how
   we can shrink this.

											
										
										
											2023-05-04 19:31:41 +00:00
+								    QByteArray m_state;
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 13:10:05 +00:00
+								    QThread m_llmThread;
 								    std::atomic<bool> m_stopGenerating;
-												Much better memory mgmt for multi-threaded model loading/unloading.

											
										
										
											2023-05-13 23:05:35 +00:00
+								    std::atomic<bool> m_shouldBeLoaded;
-												Make this atomic.

											
										
										
											2023-06-19 22:24:11 +00:00
+								    std::atomic<bool> m_isRecalc;
-												The server has different lifetime mgmt than the other chats.

											
										
										
											2023-05-13 23:33:19 +00:00
+								    bool m_isServer;
-												Enable the force metal setting.

											
										
										
											2023-06-27 15:54:34 +00:00
+								    bool m_forceMetal;
 								    bool m_reloadingToChangeVariant;
-												Huge change that completely revamps the settings dialog and implements
per model settings as well as the ability to clone a model into a "character."
This also implements system prompts as well as quite a few bugfixes for
instance this fixes chatgpt.

											
										
										
											2023-07-01 15:34:21 +00:00
+								    bool m_processedSystemPrompt;
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 13:10:05 +00:00
+								};
 								#endif // CHATLLM_H