gpt4all/gpt4all-chat/chatllm.h

#ifndef CHATLLM_H
#define CHATLLM_H

#include <QObject>
#include <QThread>
#include <QFileInfo>

#include "localdocs.h"
#include "../gpt4all-backend/llmodel.h"

// Model families that the chat code distinguishes between.
// The numeric values are written out explicitly (they match the implicit
// numbering) so that they cannot shift silently if an enumerator is added.
enum LLModelType {
    MPT_     = 0,
    GPTJ_    = 1,
    LLAMA_   = 2,
    CHATGPT_ = 3,
    REPLIT_  = 4
};

// Bundles a model instance together with the file information that belongs to it.
struct LLModelInfo {
    // The model instance; stays null until something assigns one.
    LLModel *model{nullptr};
    // File information associated with the model.
    QFileInfo fileInfo;
    // NOTE: The model type and name are intentionally NOT kept here. They live in ChatLLM, which
    // has to be able to serialize that information even while the model is in the unloaded state.
};

// Counts tokens and reports throughput as a human-readable "<rate> tokens/sec" string.
//
// Usage as visible in this class: start() resets the counters, inc() is called once per
// token, and stop() forces a final report. While counting, inc() also triggers a report
// whenever more than 999 ms have passed since the previous one.
class TokenTimer : public QObject {
    Q_OBJECT
public:
    explicit TokenTimer(QObject *parent)
        : QObject(parent)
        , m_elapsed(0)
        , m_tokens(0) {} // zero-initialized so inc()/stop() before start() never read garbage

    // Returns the running average after folding in the n-th sample.
    // oldAvg is the average of the previous n - 1 samples, newNumber is the n-th sample.
    // n must be >= 1 because the total is divided by n.
    static int rollingAverage(int oldAvg, int newNumber, int n)
    {
        // i.e. to calculate the new average after the nth number,
        // you multiply the old average by n−1, add the new number, and divide the total by n.
        return qRound(((float(oldAvg) * (n - 1)) + newNumber) / float(n));
    }

    // Resets the token count and accumulated time; the clock starts on the next inc().
    void start() { m_tokens = 0; m_elapsed = 0; m_time.invalidate(); }

    // Forces a report of the rate measured so far (no report if nothing was measured).
    void stop() { handleTimeout(); }

    // Registers one token; lazily starts the clock and reports roughly once per second.
    void inc() {
        if (!m_time.isValid())
            m_time.start();
        ++m_tokens;
        if (m_time.elapsed() > 999)
            handleTimeout();
    }

Q_SIGNALS:
    // Emitted with the formatted rate, e.g. "12 tokens/sec".
    void report(const QString &speed);

private Q_SLOTS:
    // Accumulates the time since the last report and emits the overall rate.
    void handleTimeout()
    {
        // An invalid timer means inc() has not run since construction/start(), so there is
        // nothing to measure. Calling QElapsedTimer::restart() on an invalid timer is
        // undefined behavior, so bail out before touching it.
        if (!m_time.isValid())
            return;

        m_elapsed += m_time.restart();

        // With zero accumulated milliseconds the rate would be a division by zero and the
        // report would read "inf tokens/sec" or "nan tokens/sec"; skip it instead.
        if (m_elapsed <= 0)
            return;

        emit report(QString("%1 tokens/sec").arg(m_tokens / float(m_elapsed / 1000.0f), 0, 'g', 2));
    }

private:
    QElapsedTimer m_time; // time since the last report; invalid until the first inc()
    qint64 m_elapsed;     // milliseconds accumulated since start()
    quint32 m_tokens;     // tokens counted since start()
};

class Chat;
// QObject that fronts a language model on behalf of a Chat (the constructor takes the Chat
// as its parent argument). It declares the entry points for loading/unloading a model,
// prompting it, generating a chat name and (de)serializing itself, and it holds a QThread
// member (m_llmThread). Apart from the small inline accessors below, every member function
// is defined outside this header, so the notes here describe only what the declarations show.
class ChatLLM : public QObject
{
    Q_OBJECT
    // Properties exposed through the Qt meta-object system. All are read-only with a NOTIFY
    // signal except modelName, which is also writable through setModelName().
    Q_PROPERTY(bool isModelLoaded READ isModelLoaded NOTIFY isModelLoadedChanged)
    Q_PROPERTY(QString response READ response NOTIFY responseChanged)
    Q_PROPERTY(QString modelName READ modelName WRITE setModelName NOTIFY modelNameChanged)
    Q_PROPERTY(bool isRecalc READ isRecalc NOTIFY recalcChanged)
    Q_PROPERTY(QString generatedName READ generatedName NOTIFY generatedNameChanged)

public:
    // isServer is stored in m_isServer; NOTE(review): presumably selects server-mode lifetime
    // handling — confirm in the implementation.
    ChatLLM(Chat *parent, bool isServer = false);
    virtual ~ChatLLM();

    bool isModelLoaded() const;
    void regenerateResponse();
    void resetResponse();
    void resetContext();

    // Raises the atomic stop flag; the code that polls it is not visible in this header.
    void stopGenerating() { m_stopGenerating = true; }

    // Returns the current value of the atomic "should be loaded" flag.
    bool shouldBeLoaded() const { return m_shouldBeLoaded; }
    void setShouldBeLoaded(bool b);

    QString response() const;
    QString modelName() const;

    void setModelName(const QString &modelName);

    // Returns the current value of the atomic recalculation flag.
    bool isRecalc() const { return m_isRecalc; }

    // Returns m_nameResponse converted from std::string to QString.
    QString generatedName() const { return QString::fromStdString(m_nameResponse); }

    // Write/read this object to/from a data stream; version identifies the stream format.
    // The bool result presumably signals success — confirm in the implementation.
    bool serialize(QDataStream &stream, int version);
    bool deserialize(QDataStream &stream, int version);

public Q_SLOTS:
    // Runs a prompt with the given sampling parameters. collectionList is presumably the set
    // of localdocs collections to consult (see requestRetrieveFromDB) — confirm.
    bool prompt(const QList<QString> &collectionList, const QString &prompt, const QString &prompt_template,
        int32_t n_predict, int32_t top_k, float top_p, float temp, int32_t n_batch, float repeat_penalty,
        int32_t repeat_penalty_tokens, int32_t n_threads);
    bool loadDefaultModel();
    bool loadModel(const QString &modelName);
    void modelNameChangeRequested(const QString &modelName);
    void forceUnloadModel();
    void unloadModel();
    void reloadModel();
    void generateName();
    void handleChatIdChanged(const QString &id);
    void handleDefaultModelChanged(const QString &defaultModel);
    void handleShouldBeLoadedChanged();
    void handleThreadStarted();

Q_SIGNALS:
    void isModelLoadedChanged(bool);
    void modelLoadingError(const QString &error);
    void responseChanged(const QString &response);
    void promptProcessing();
    void responseStopped();
    void modelNameChanged();
    void recalcChanged();
    void sendStartup();
    void sendModelLoaded();
    void generatedNameChanged(const QString &name);
    void stateChanged();
    void threadStarted();
    void shouldBeLoadedChanged();
    // NOTE(review): results is a raw out-pointer, so the receiver presumably has to fill it
    // before the emitter continues (i.e. a blocking connection) — confirm where it is connected.
    void requestRetrieveFromDB(const QList<QString> &collections, const QString &text, int retrievalSize, QList<ResultInfo> *results);
    void reportSpeed(const QString &speed);
    void databaseResultsChanged(const QList<ResultInfo>&);

protected:
    // Per-token / recalculation handlers for the main response and for name generation.
    // NOTE(review): the bool result presumably tells the backend whether to continue — confirm.
    bool handlePrompt(int32_t token);
    bool handleResponse(int32_t token, const std::string &response);
    bool handleRecalculate(bool isRecalc);
    bool handleNamePrompt(int32_t token);
    bool handleNameResponse(int32_t token, const std::string &response);
    bool handleNameRecalculate(bool isRecalc);
    void saveState();
    void restoreState();

protected:
    // Protected rather than private, so subclasses can access these.
    LLModel::PromptContext m_ctx;
    quint32 m_promptTokens;
    quint32 m_promptResponseTokens;

private:
    std::string m_response;
    std::string m_nameResponse;     // exposed as a QString by generatedName()
    LLModelInfo m_modelInfo;
    LLModelType m_modelType;
    QString m_modelName;
    QString m_defaultModel;
    TokenTimer *m_timer;            // raw pointer; ownership is not visible in this header
    QByteArray m_state;             // presumably the buffer used by saveState()/restoreState() — confirm
    QThread m_llmThread;
    // Atomic flags. NOTE(review): presumably accessed from more than one thread — confirm.
    std::atomic<bool> m_stopGenerating;
    std::atomic<bool> m_shouldBeLoaded;
    std::atomic<bool> m_isRecalc;
    bool m_isServer;
};

#endif // CHATLLM_H
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 09:10:05 -04:00
+								#ifndef CHATLLM_H
 								#define CHATLLM_H
 								#include <QObject>
 								#include <QThread>
-												Much better memory mgmt for multi-threaded model loading/unloading.

											
										
										
											2023-05-13 19:05:35 -04:00
+								#include <QFileInfo>
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 09:10:05 -04:00
-												Make localdocs work with server mode.

											
										
										
											2023-06-01 14:13:12 -04:00
+								#include "localdocs.h"
-												Move the llmodel C API to new top-level directory and version it.

											
										
										
											2023-05-10 11:46:40 -04:00
+								#include "../gpt4all-backend/llmodel.h"
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 09:10:05 -04:00
-												Much better memory mgmt for multi-threaded model loading/unloading.

											
										
										
											2023-05-13 19:05:35 -04:00
+								enum LLModelType {
 								    MPT_,
 								    GPTJ_,
-												Preliminary support for chatgpt models.

											
										
										
											2023-05-14 20:12:15 -04:00
+								    LLAMA_,
 								    CHATGPT_,
-												Replit Model (#713)

* porting over replit code model to gpt4all

* replaced memory with kv_self struct

* continuing debug

* welp it built but lot of sus things

* working model loading and somewhat working generate.. need to format response?

* revert back to semi working version

* finally got rid of weird formatting

* figured out problem is with python bindings - this is good to go for testing

* addressing PR feedback

* output refactor

* fixed prompt response collection

* cleanup

* addressing PR comments

* building replit backend with new ggmlver code

* chatllm replit and clean python files

* cleanup

* updated replit to match new llmodel api

* match llmodel api and change size_t to Token

* resolve PR comments

* replit model commit comment
											
										
										
											2023-06-06 17:09:00 -04:00
+								    REPLIT_
-												Much better memory mgmt for multi-threaded model loading/unloading.

											
										
										
											2023-05-13 19:05:35 -04:00
+								};
 								struct LLModelInfo {
 								    LLModel *model = nullptr;
 								    QFileInfo fileInfo;
 								    // NOTE: This does not store the model type or name on purpose as this is left for ChatLLM which
 								    // must be able to serialize the information even if it is in the unloaded state
 								};
-												Show token generation speed in gui. (#1020)


											
										
										
											2023-06-19 14:34:53 -04:00
+								class TokenTimer : public QObject {
 								    Q_OBJECT
 								public:
 								    explicit TokenTimer(QObject *parent)
 								        : QObject(parent)
 								        , m_elapsed(0) {}
 								    static int rollingAverage(int oldAvg, int newNumber, int n)
 								    {
 								        // i.e. to calculate the new average after then nth number,
 								        // you multiply the old average by n−1, add the new number, and divide the total by n.
 								        return qRound(((float(oldAvg) * (n - 1)) + newNumber) / float(n));
 								    }
 								    void start() { m_tokens = 0; m_elapsed = 0; m_time.invalidate(); }
 								    void stop() { handleTimeout(); }
 								    void inc() {
 								        if (!m_time.isValid())
 								            m_time.start();
 								        ++m_tokens;
 								        if (m_time.elapsed() > 999)
 								            handleTimeout();
 								    }
 								Q_SIGNALS:
 								    void report(const QString &speed);
 								private Q_SLOTS:
 								    void handleTimeout()
 								    {
 								        m_elapsed += m_time.restart();
 								        emit report(QString("%1 tokens/sec").arg(m_tokens / float(m_elapsed / 1000.0f), 0, 'g', 2));
 								    }
 								private:
 								    QElapsedTimer m_time;
 								    qint64 m_elapsed;
 								    quint32 m_tokens;
 								};
-												First attempt at providing a persistent chat list experience.

Limitations:

1) Context is not restored for gpt-j models
2) When you switch between different model types in an existing chat
   the context and all the conversation is lost
3) The settings are not chat or conversation specific
4) The sizes of the chat persisted files are very large due to how much
   data the llama.cpp backend tries to persist. Need to investigate how
   we can shrink this.

											
										
										
											2023-05-04 15:31:41 -04:00
+								class Chat;
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 09:10:05 -04:00
+								class ChatLLM : public QObject
 								{
 								    Q_OBJECT
 								    Q_PROPERTY(bool isModelLoaded READ isModelLoaded NOTIFY isModelLoadedChanged)
 								    Q_PROPERTY(QString response READ response NOTIFY responseChanged)
 								    Q_PROPERTY(QString modelName READ modelName WRITE setModelName NOTIFY modelNameChanged)
 								    Q_PROPERTY(bool isRecalc READ isRecalc NOTIFY recalcChanged)
-												Generate names via llm.

											
										
										
											2023-05-02 11:19:17 -04:00
+								    Q_PROPERTY(QString generatedName READ generatedName NOTIFY generatedNameChanged)
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 09:10:05 -04:00
 								public:
-												The server has different lifetime mgmt than the other chats.

											
										
										
											2023-05-13 19:33:19 -04:00
+								    ChatLLM(Chat *parent, bool isServer = false);
-												Cleanup the chatllm properly.

											
										
										
											2023-05-12 14:06:03 -04:00
+								    virtual ~ChatLLM();
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 09:10:05 -04:00
 								    bool isModelLoaded() const;
 								    void regenerateResponse();
 								    void resetResponse();
 								    void resetContext();
 								    void stopGenerating() { m_stopGenerating = true; }
-												Much better memory mgmt for multi-threaded model loading/unloading.

											
										
										
											2023-05-13 19:05:35 -04:00
+								    bool shouldBeLoaded() const { return m_shouldBeLoaded; }
 								    void setShouldBeLoaded(bool b);
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 09:10:05 -04:00
+								    QString response() const;
 								    QString modelName() const;
 								    void setModelName(const QString &modelName);
 								    bool isRecalc() const { return m_isRecalc; }
-												Generate names via llm.

											
										
										
											2023-05-02 11:19:17 -04:00
+								    QString generatedName() const { return QString::fromStdString(m_nameResponse); }
-												Convert the old format properly.

											
										
										
											2023-05-08 05:52:57 -04:00
+								    bool serialize(QDataStream &stream, int version);
 								    bool deserialize(QDataStream &stream, int version);
-												First attempt at providing a persistent chat list experience.

Limitations:

1) Context is not restored for gpt-j models
2) When you switch between different model types in an existing chat
   the context and all the conversation is lost
3) The settings are not chat or conversation specific
4) The sizes of the chat persisted files are very large due to how much
   data the llama.cpp backend tries to persist. Need to investigate how
   we can shrink this.

											
										
										
											2023-05-04 15:31:41 -04:00
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 09:10:05 -04:00
+								public Q_SLOTS:
-												Start working on more thread safety and model load error handling.

											
										
										
											2023-06-19 19:51:28 -04:00
+								    bool prompt(const QList<QString> &collectionList, const QString &prompt, const QString &prompt_template,
 								        int32_t n_predict, int32_t top_k, float top_p, float temp, int32_t n_batch, float repeat_penalty,
 								        int32_t repeat_penalty_tokens, int32_t n_threads);
-												First attempt at providing a persistent chat list experience.

Limitations:

1) Context is not restored for gpt-j models
2) When you switch between different model types in an existing chat
   the context and all the conversation is lost
3) The settings are not chat or conversation specific
4) The sizes of the chat persisted files are very large due to how much
   data the llama.cpp backend tries to persist. Need to investigate how
   we can shrink this.

											
										
										
											2023-05-04 15:31:41 -04:00
+								    bool loadDefaultModel();
 								    bool loadModel(const QString &modelName);
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 09:10:05 -04:00
+								    void modelNameChangeRequested(const QString &modelName);
-												Much better memory mgmt for multi-threaded model loading/unloading.

											
										
										
											2023-05-13 19:05:35 -04:00
+								    void forceUnloadModel();
-												First attempt at providing a persistent chat list experience.

Limitations:

1) Context is not restored for gpt-j models
2) When you switch between different model types in an existing chat
   the context and all the conversation is lost
3) The settings are not chat or conversation specific
4) The sizes of the chat persisted files are very large due to how much
   data the llama.cpp backend tries to persist. Need to investigate how
   we can shrink this.

											
										
										
											2023-05-04 15:31:41 -04:00
+								    void unloadModel();
-												Much better memory mgmt for multi-threaded model loading/unloading.

											
										
										
											2023-05-13 19:05:35 -04:00
+								    void reloadModel();
-												Generate names via llm.

											
										
										
											2023-05-02 11:19:17 -04:00
+								    void generateName();
-												Start working on more thread safety and model load error handling.

											
										
										
											2023-06-19 19:51:28 -04:00
+								    void handleChatIdChanged(const QString &id);
 								    void handleDefaultModelChanged(const QString &defaultModel);
-												Much better memory mgmt for multi-threaded model loading/unloading.

											
										
										
											2023-05-13 19:05:35 -04:00
+								    void handleShouldBeLoadedChanged();
-												Show token generation speed in gui. (#1020)


											
										
										
											2023-06-19 14:34:53 -04:00
+								    void handleThreadStarted();
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 09:10:05 -04:00
 								Q_SIGNALS:
-												Get rid of last blocking operations and make the chat/llm thread safe.

											
										
										
											2023-06-20 16:14:30 -04:00
+								    void isModelLoadedChanged(bool);
-												Gracefully handle when we have a previous chat where the model that it used has gone away.

											
										
										
											2023-05-08 20:51:03 -04:00
+								    void modelLoadingError(const QString &error);
-												Get rid of last blocking operations and make the chat/llm thread safe.

											
										
										
											2023-06-20 16:14:30 -04:00
+								    void responseChanged(const QString &response);
-												Add prompt processing and localdocs to the busy indicator in UI.

											
										
										
											2023-05-20 20:04:36 -04:00
+								    void promptProcessing();
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 09:10:05 -04:00
+								    void responseStopped();
 								    void modelNameChanged();
 								    void recalcChanged();
 								    void sendStartup();
 								    void sendModelLoaded();
-												Get rid of last blocking operations and make the chat/llm thread safe.

											
										
										
											2023-06-20 16:14:30 -04:00
+								    void generatedNameChanged(const QString &name);
-												First attempt at providing a persistent chat list experience.

Limitations:

1) Context is not restored for gpt-j models
2) When you switch between different model types in an existing chat
   the context and all the conversation is lost
3) The settings are not chat or conversation specific
4) The sizes of the chat persisted files are very large due to how much
   data the llama.cpp backend tries to persist. Need to investigate how
   we can shrink this.

											
										
										
											2023-05-04 15:31:41 -04:00
+								    void stateChanged();
-												httpserver

											
										
										
											2023-05-11 16:46:25 -04:00
+								    void threadStarted();
-												Much better memory mgmt for multi-threaded model loading/unloading.

											
										
										
											2023-05-13 19:05:35 -04:00
+								    void shouldBeLoadedChanged();
-												Make localdocs work with server mode.

											
										
										
											2023-06-01 14:13:12 -04:00
+								    void requestRetrieveFromDB(const QList<QString> &collections, const QString &text, int retrievalSize, QList<ResultInfo> *results);
-												Show token generation speed in gui. (#1020)


											
										
										
											2023-06-19 14:34:53 -04:00
+								    void reportSpeed(const QString &speed);
-												Don't store db results in ChatLLM.

											
										
										
											2023-06-19 18:23:54 -04:00
+								    void databaseResultsChanged(const QList<ResultInfo>&);
-												httpserver

											
										
										
											2023-05-11 16:46:25 -04:00
 								protected:
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 09:10:05 -04:00
+								    bool handlePrompt(int32_t token);
 								    bool handleResponse(int32_t token, const std::string &response);
 								    bool handleRecalculate(bool isRecalc);
-												Generate names via llm.

											
										
										
											2023-05-02 11:19:17 -04:00
+								    bool handleNamePrompt(int32_t token);
 								    bool handleNameResponse(int32_t token, const std::string &response);
 								    bool handleNameRecalculate(bool isRecalc);
-												First attempt at providing a persistent chat list experience.

Limitations:

1) Context is not restored for gpt-j models
2) When you switch between different model types in an existing chat
   the context and all the conversation is lost
3) The settings are not chat or conversation specific
4) The sizes of the chat persisted files are very large due to how much
   data the llama.cpp backend tries to persist. Need to investigate how
   we can shrink this.

											
										
										
											2023-05-04 15:31:41 -04:00
+								    void saveState();
 								    void restoreState();
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 09:10:05 -04:00
-												The server has different lifetime mgmt than the other chats.

											
										
										
											2023-05-13 19:33:19 -04:00
+								protected:
 								    LLModel::PromptContext m_ctx;
 								    quint32 m_promptTokens;
 								    quint32 m_promptResponseTokens;
-												Start working on more thread safety and model load error handling.

											
										
										
											2023-06-19 19:51:28 -04:00
 								private:
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 09:10:05 -04:00
+								    std::string m_response;
-												Generate names via llm.

											
										
										
											2023-05-02 11:19:17 -04:00
+								    std::string m_nameResponse;
-												Start working on more thread safety and model load error handling.

											
										
										
											2023-06-19 19:51:28 -04:00
+								    LLModelInfo m_modelInfo;
 								    LLModelType m_modelType;
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 09:10:05 -04:00
+								    QString m_modelName;
-												Start working on more thread safety and model load error handling.

											
										
										
											2023-06-19 19:51:28 -04:00
+								    QString m_defaultModel;
-												Show token generation speed in gui. (#1020)


											
										
										
											2023-06-19 14:34:53 -04:00
+								    TokenTimer *m_timer;
-												First attempt at providing a persistent chat list experience.

Limitations:

1) Context is not restored for gpt-j models
2) When you switch between different model types in an existing chat
   the context and all the conversation is lost
3) The settings are not chat or conversation specific
4) The sizes of the chat persisted files are very large due to how much
   data the llama.cpp backend tries to persist. Need to investigate how
   we can shrink this.

											
										
										
											2023-05-04 15:31:41 -04:00
+								    QByteArray m_state;
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 09:10:05 -04:00
+								    QThread m_llmThread;
 								    std::atomic<bool> m_stopGenerating;
-												Much better memory mgmt for multi-threaded model loading/unloading.

											
										
										
											2023-05-13 19:05:35 -04:00
+								    std::atomic<bool> m_shouldBeLoaded;
-												Make this atomic.

											
										
										
											2023-06-19 18:24:11 -04:00
+								    std::atomic<bool> m_isRecalc;
-												The server has different lifetime mgmt than the other chats.

											
										
										
											2023-05-13 19:33:19 -04:00
+								    bool m_isServer;
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 09:10:05 -04:00
+								};
 								#endif // CHATLLM_H