gpt4all/gpt4all-backend/llamamodel.cpp

#define LLAMAMODEL_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
#include "llamamodel_impl.h"

#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <map>
#include <string>
#include <vector>
#include <iostream>
#if defined(_WIN32) && defined(_MSC_VER)
    #define WIN32_LEAN_AND_MEAN
    #ifndef NOMINMAX
        #define NOMINMAX
    #endif
    #include <windows.h>
    #include <io.h>
    #include <stdio.h>
#else
    #include <unistd.h>
#endif
#include <random>
#include <thread>
#include <unordered_set>

#include <llama.h>
#include <ggml.h>


namespace {
const char *modelType_ = "LLaMA";
}

struct gpt_params {
    int32_t seed          = -1;   // RNG seed
    int32_t n_keep        = 0;    // number of tokens to keep from initial prompt
#if LLAMA_DATE <= 230511
    int32_t n_parts       = -1;   // amount of model parts (-1 = determine from model dimensions)
#endif

#if LLAMA_DATE >= 230519
    // sampling parameters
    float   tfs_z         = 1.0f; // 1.0 = disabled
    float   typical_p     = 1.0f; // 1.0 = disabled
#endif

    std::string prompt = "";

    bool memory_f16        = true;  // use f16 instead of f32 for memory kv

    bool use_mmap          = true;  // use mmap for faster loads
    bool use_mlock         = false; // use mlock to keep model in memory
};

#if LLAMA_DATE >= 230519
static int llama_sample_top_p_top_k(
        llama_context *ctx,
        const llama_token *last_n_tokens_data,
        int last_n_tokens_size,
        int top_k,
        float top_p,
        float temp,
        float repeat_penalty) {
    auto logits = llama_get_logits(ctx);
    auto n_vocab = llama_n_vocab(ctx);
    // Populate initial list of all candidates
    std::vector<llama_token_data> candidates;
    candidates.reserve(n_vocab);
    for (int token_id = 0; token_id < n_vocab; token_id++) {
        candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
    }
    llama_token_data_array candidates_p = {candidates.data(), candidates.size(), false};
    // Sample repeat penalty
    llama_sample_repetition_penalty(nullptr, &candidates_p, last_n_tokens_data, last_n_tokens_size, repeat_penalty);
    // Temperature sampling
    llama_sample_top_k(ctx, &candidates_p, top_k, 1);
    llama_sample_tail_free(ctx, &candidates_p, 1.0f, 1);
    llama_sample_typical(ctx, &candidates_p, 1.0f, 1);
    llama_sample_top_p(ctx, &candidates_p, top_p, 1);
    llama_sample_temperature(ctx, &candidates_p, temp);
    return llama_sample_token(ctx, &candidates_p);
}
#endif

struct LLamaPrivate {
    const std::string modelPath;
    bool modelLoaded;
    llama_context *ctx = nullptr;
    llama_context_params params;
    int64_t n_threads = 0;
};

LLamaModel::LLamaModel()
    : d_ptr(new LLamaPrivate) {
    d_ptr->modelLoaded = false;
}

// default hparams (LLaMA 7B)
struct llama_file_hparams {
    uint32_t n_vocab = 32000;
    uint32_t n_embd  = 4096;
    uint32_t n_mult  = 256;
    uint32_t n_head  = 32;
    uint32_t n_layer = 32;
    uint32_t n_rot   = 64;
    enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
};

size_t LLamaModel::requiredMem(const std::string &modelPath) {
    auto fin = std::ifstream(modelPath, std::ios::binary);
    fin.seekg(0, std::ios_base::end);
    size_t filesize = fin.tellg();
    fin.seekg(0, std::ios_base::beg);
    uint32_t magic = 0;
    fin.read(reinterpret_cast<char*>(&magic), sizeof(magic));
    if (magic != 0x67676a74) return 0;
    uint32_t version = 0;
    fin.read(reinterpret_cast<char*>(&version), sizeof(version));
    llama_file_hparams hparams;
    fin.read(reinterpret_cast<char*>(&hparams.n_vocab), sizeof(hparams.n_vocab));
    fin.read(reinterpret_cast<char*>(&hparams.n_embd), sizeof(hparams.n_embd));
    fin.read(reinterpret_cast<char*>(&hparams.n_head), sizeof(hparams.n_head));
    fin.read(reinterpret_cast<char*>(&hparams.n_layer), sizeof(hparams.n_layer));
    fin.read(reinterpret_cast<char*>(&hparams.n_rot), sizeof(hparams.n_rot));
    fin.read(reinterpret_cast<char*>(&hparams.ftype), sizeof(hparams.ftype));
    const size_t n_ctx = 2048;
    const size_t kvcache_element_size = 2; // fp16
    const size_t est_kvcache_size = hparams.n_embd * hparams.n_layer * 2u * n_ctx * kvcache_element_size;
    return filesize + est_kvcache_size;
}

bool LLamaModel::loadModel(const std::string &modelPath)
{
    // load the model
    d_ptr->params = llama_context_default_params();

    gpt_params params;
    d_ptr->params.n_ctx      = 2048;
    d_ptr->params.seed       = params.seed;
    d_ptr->params.f16_kv     = params.memory_f16;
    d_ptr->params.use_mmap   = params.use_mmap;
#if defined (__APPLE__)
    d_ptr->params.use_mlock  = true;
#else
    d_ptr->params.use_mlock  = params.use_mlock;
#endif
#if LLAMA_DATE <= 230511
    d_ptr->params.n_parts  = params.n_parts;
#endif
#ifdef GGML_USE_METAL
    std::cerr << "llama.cpp: using Metal" << std::endl;
    // metal always runs the whole model if n_gpu_layers is not 0, at least
    // currently
    d_ptr->params.n_gpu_layers = 1;
#endif

    d_ptr->ctx = llama_init_from_file(modelPath.c_str(), d_ptr->params);
    if (!d_ptr->ctx) {
        std::cerr << "LLAMA ERROR: failed to load model from " <<  modelPath << std::endl;
        return false;
    }

    d_ptr->n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
    d_ptr->modelLoaded = true;
    fflush(stderr);
    return true;
}

void LLamaModel::setThreadCount(int32_t n_threads) {
    d_ptr->n_threads = n_threads;
}

int32_t LLamaModel::threadCount() const {
    return d_ptr->n_threads;
}

LLamaModel::~LLamaModel()
{
    if(d_ptr->ctx) {
        llama_free(d_ptr->ctx);
    }
}

bool LLamaModel::isModelLoaded() const
{
    return d_ptr->modelLoaded;
}

size_t LLamaModel::stateSize() const
{
    return llama_get_state_size(d_ptr->ctx);
}

size_t LLamaModel::saveState(uint8_t *dest) const
{
    return llama_copy_state_data(d_ptr->ctx, dest);
}

size_t LLamaModel::restoreState(const uint8_t *src)
{
    // const_cast is required, see: https://github.com/ggerganov/llama.cpp/pull/1540
    return llama_set_state_data(d_ptr->ctx, const_cast<uint8_t*>(src));
}

std::vector<LLModel::Token> LLamaModel::tokenize(PromptContext &ctx, const std::string &str) const
{
    const bool useBOS = ctx.n_past == 0 && (ctx.tokens.empty() || ctx.tokens.front() != llama_token_bos());
    std::vector<LLModel::Token> fres(str.size()+4);
    auto fres_len = llama_tokenize(d_ptr->ctx, str.c_str(), fres.data(), fres.size(), useBOS);
    fres.resize(fres_len);
    return fres;
}

std::string LLamaModel::tokenToString(Token id) const
{
    return llama_token_to_str(d_ptr->ctx, id);
}

LLModel::Token LLamaModel::sampleToken(PromptContext &promptCtx) const
{
    const size_t n_prev_toks = std::min((size_t) promptCtx.repeat_last_n, promptCtx.tokens.size());
    return llama_sample_top_p_top_k(d_ptr->ctx,
        promptCtx.tokens.data() + promptCtx.tokens.size() - n_prev_toks,
        n_prev_toks, promptCtx.top_k, promptCtx.top_p, promptCtx.temp,
        promptCtx.repeat_penalty);
}

bool LLamaModel::evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const
{
    // When we recalculate context we could have erased the original BOS token... we need to replace it
    const bool useBOS = ctx.n_past == 0 && (ctx.tokens.empty() || ctx.tokens.front() != llama_token_bos());
    if (useBOS) {
        std::vector<int32_t> myTokens;
        myTokens.push_back(llama_token_bos());
        myTokens.insert(myTokens.end(), tokens.begin(), tokens.end());
        ctx.n_past += 1;
        return llama_eval(d_ptr->ctx, myTokens.data(), myTokens.size(), ctx.n_past, d_ptr->n_threads) == 0;
    } else
        return llama_eval(d_ptr->ctx, tokens.data(), tokens.size(), ctx.n_past, d_ptr->n_threads) == 0;
}

int32_t LLamaModel::contextLength() const
{
    return llama_n_ctx(d_ptr->ctx);
}

const std::vector<LLModel::Token> &LLamaModel::endTokens() const
{
    static const std::vector<LLModel::Token> fres = {llama_token_eos()};
    return fres;
}

#if defined(_WIN32)
#define DLL_EXPORT __declspec(dllexport)
#else
#define DLL_EXPORT __attribute__ ((visibility ("default")))
#endif

extern "C" {
DLL_EXPORT bool is_g4a_backend_model_implementation() {
    return true;
}

DLL_EXPORT const char *get_model_type() {
    return modelType_;
}

DLL_EXPORT const char *get_build_variant() {
    return GGML_BUILD_VARIANT;
}

DLL_EXPORT bool magic_match(std::istream& f) {
    // Check magic
    uint32_t magic = 0;
    f.read(reinterpret_cast<char*>(&magic), sizeof(magic));
    if (magic != 0x67676a74) return false;
    // Check version
    uint32_t version = 0;
    f.read(reinterpret_cast<char*>(&version), sizeof(version));
    if (!(version LLAMA_VERSIONS)) {
        return false;
    }
    llama_file_hparams hparams;
    f.read(reinterpret_cast<char*>(&hparams), sizeof(hparams));
    if (!(hparams.n_vocab >= 32000 && hparams.n_vocab <= 32100)) {
        return false; // not a llama.
    }
#ifdef GGML_USE_METAL
    // Check quant supported on metal
    // skip fields
    switch(hparams.ftype) {
        // currently supported on Metal https://github.com/ggerganov/llama.cpp/blob/ae9663f1887513e152839e91f61c513075a19422/ggml-metal.m#L51-L55
        case LLAMA_FTYPE_MOSTLY_F16:
        case LLAMA_FTYPE_MOSTLY_Q2_K:
        case LLAMA_FTYPE_MOSTLY_Q4_0:
        case LLAMA_FTYPE_MOSTLY_Q6_K:
        case LLAMA_FTYPE_MOSTLY_Q4_K_S:
        case LLAMA_FTYPE_MOSTLY_Q4_K_M:
            return true;
        default: // unsupported quant-type for Metal
            return false;
    }
#endif
    return true;
}

DLL_EXPORT LLModel *construct() {
    return new LLamaModel;
}
}
Dlopen backend 5 (#779) Major change to the backend that allows for pluggable versions of llama.cpp/ggml. This was squashed merged from dlopen_backend_5 where the history is preserved. 2023-05-31 17:04:01 -04:00			`#define LLAMAMODEL_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE`
			`#include "llamamodel_impl.h"`
Add llama.cpp support for loading llama based models in the gui. We now support loading both gptj derived models and llama derived models. 2023-04-15 15:57:32 -04:00
			`#include <cassert>`
			`#include <cmath>`
			`#include <cstdio>`
			`#include <cstring>`
			`#include <fstream>`
			`#include <map>`
			`#include <string>`
			`#include <vector>`
			`#include <iostream>`
gpt4all-backend: Add MSVC support to backend (#595) * Add MSVC compatibility * Add _MSC_VER macro --------- Co-authored-by: kuvaus <kuvaus@users.noreply.github.com> 2023-05-16 11:35:33 -04:00			`#if defined(_WIN32) && defined(_MSC_VER)`
			`#define WIN32_LEAN_AND_MEAN`
			`#ifndef NOMINMAX`
			`#define NOMINMAX`
			`#endif`
			`#include <windows.h>`
			`#include <io.h>`
			`#include <stdio.h>`
			`#else`
			`#include <unistd.h>`
			`#endif`
Add llama.cpp support for loading llama based models in the gui. We now support loading both gptj derived models and llama derived models. 2023-04-15 15:57:32 -04:00			`#include <random>`
			`#include <thread>`
Add reverse prompts for llama models. 2023-05-03 11:58:26 -04:00			`#include <unordered_set>`
Add llama.cpp support for loading llama based models in the gui. We now support loading both gptj derived models and llama derived models. 2023-04-15 15:57:32 -04:00
Dlopen backend 5 (#779) Major change to the backend that allows for pluggable versions of llama.cpp/ggml. This was squashed merged from dlopen_backend_5 where the history is preserved. 2023-05-31 17:04:01 -04:00			`#include <llama.h>`
			`#include <ggml.h>`


			`namespace {`
			`const char *modelType_ = "LLaMA";`
			`}`

			`struct gpt_params {`
			`int32_t seed = -1; // RNG seed`
			`int32_t n_keep = 0; // number of tokens to keep from initial prompt`
			`#if LLAMA_DATE <= 230511`
			`int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions)`
			`#endif`

			`#if LLAMA_DATE >= 230519`
			`// sampling parameters`
			`float tfs_z = 1.0f; // 1.0 = disabled`
			`float typical_p = 1.0f; // 1.0 = disabled`
			`#endif`

			`std::string prompt = "";`

			`bool memory_f16 = true; // use f16 instead of f32 for memory kv`

			`bool use_mmap = true; // use mmap for faster loads`
			`bool use_mlock = false; // use mlock to keep model in memory`
			`};`

			`#if LLAMA_DATE >= 230519`
			`static int llama_sample_top_p_top_k(`
			`llama_context *ctx,`
			`const llama_token *last_n_tokens_data,`
			`int last_n_tokens_size,`
			`int top_k,`
			`float top_p,`
			`float temp,`
			`float repeat_penalty) {`
			`auto logits = llama_get_logits(ctx);`
			`auto n_vocab = llama_n_vocab(ctx);`
			`// Populate initial list of all candidates`
			`std::vector<llama_token_data> candidates;`
			`candidates.reserve(n_vocab);`
			`for (int token_id = 0; token_id < n_vocab; token_id++) {`
			`candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});`
			`}`
			`llama_token_data_array candidates_p = {candidates.data(), candidates.size(), false};`
			`// Sample repeat penalty`
			`llama_sample_repetition_penalty(nullptr, &candidates_p, last_n_tokens_data, last_n_tokens_size, repeat_penalty);`
			`// Temperature sampling`
			`llama_sample_top_k(ctx, &candidates_p, top_k, 1);`
			`llama_sample_tail_free(ctx, &candidates_p, 1.0f, 1);`
			`llama_sample_typical(ctx, &candidates_p, 1.0f, 1);`
			`llama_sample_top_p(ctx, &candidates_p, top_p, 1);`
			`llama_sample_temperature(ctx, &candidates_p, temp);`
			`return llama_sample_token(ctx, &candidates_p);`
			`}`
			`#endif`

Add llama.cpp support for loading llama based models in the gui. We now support loading both gptj derived models and llama derived models. 2023-04-15 15:57:32 -04:00			`struct LLamaPrivate {`
			`const std::string modelPath;`
			`bool modelLoaded;`
			`llama_context *ctx = nullptr;`
			`llama_context_params params;`
			`int64_t n_threads = 0;`
			`};`

			`LLamaModel::LLamaModel()`
			`: d_ptr(new LLamaPrivate) {`
			`d_ptr->modelLoaded = false;`
			`}`

add requiredMem method to llmodel impls most of these can just shortcut out of the model loading logic llama is a bit worse to deal with because we submodule it so I have to at least parse the hparams, and then I just use the size on disk as an estimate for the mem size (which seems reasonable since we mmap() the llama files anyway) 2023-06-26 15:17:34 -04:00			`// default hparams (LLaMA 7B)`
			`struct llama_file_hparams {`
			`uint32_t n_vocab = 32000;`
			`uint32_t n_embd = 4096;`
			`uint32_t n_mult = 256;`
			`uint32_t n_head = 32;`
			`uint32_t n_layer = 32;`
			`uint32_t n_rot = 64;`
			`enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;`
			`};`

			`size_t LLamaModel::requiredMem(const std::string &modelPath) {`
			`auto fin = std::ifstream(modelPath, std::ios::binary);`
			`fin.seekg(0, std::ios_base::end);`
			`size_t filesize = fin.tellg();`
			`fin.seekg(0, std::ios_base::beg);`
			`uint32_t magic = 0;`
			`fin.read(reinterpret_cast<char*>(&magic), sizeof(magic));`
			`if (magic != 0x67676a74) return 0;`
			`uint32_t version = 0;`
			`fin.read(reinterpret_cast<char*>(&version), sizeof(version));`
			`llama_file_hparams hparams;`
			`fin.read(reinterpret_cast<char*>(&hparams.n_vocab), sizeof(hparams.n_vocab));`
			`fin.read(reinterpret_cast<char*>(&hparams.n_embd), sizeof(hparams.n_embd));`
			`fin.read(reinterpret_cast<char*>(&hparams.n_head), sizeof(hparams.n_head));`
			`fin.read(reinterpret_cast<char*>(&hparams.n_layer), sizeof(hparams.n_layer));`
			`fin.read(reinterpret_cast<char*>(&hparams.n_rot), sizeof(hparams.n_rot));`
			`fin.read(reinterpret_cast<char*>(&hparams.ftype), sizeof(hparams.ftype));`
			`const size_t n_ctx = 2048;`
			`const size_t kvcache_element_size = 2; // fp16`
			`const size_t est_kvcache_size = hparams.n_embd * hparams.n_layer * 2u * n_ctx * kvcache_element_size;`
			`return filesize + est_kvcache_size;`
			`}`

Add llama.cpp support for loading llama based models in the gui. We now support loading both gptj derived models and llama derived models. 2023-04-15 15:57:32 -04:00			`bool LLamaModel::loadModel(const std::string &modelPath)`
			`{`
			`// load the model`
			`d_ptr->params = llama_context_default_params();`
Use default params unless we override them. 2023-04-20 12:07:43 -04:00
			`gpt_params params;`
Back out the prompt/response finding in gptj since it doesn't seem to help. Guard against reaching the end of the context window which we don't handle gracefully except for avoiding a crash. 2023-04-20 17:13:00 -04:00			`d_ptr->params.n_ctx = 2048;`
Use default params unless we override them. 2023-04-20 12:07:43 -04:00			`d_ptr->params.seed = params.seed;`
			`d_ptr->params.f16_kv = params.memory_f16;`
			`d_ptr->params.use_mmap = params.use_mmap;`
Only default mlock on macOS where swap seems to be a problem Repeating the change that once was done in https://github.com/nomic-ai/gpt4all/pull/663 but then was overriden by https://github.com/nomic-ai/gpt4all/commit/48275d0dcc0cbd23b7f6469e65b86112a6169dec Signed-off-by: Peter Gagarinov <pgagarinov@users.noreply.github.com> 2023-06-02 20:15:38 -04:00			`#if defined (__APPLE__)`
			`d_ptr->params.use_mlock = true;`
			`#else`
Only default mlock on macOS where swap seems to be a problem. 2023-05-21 10:13:35 -04:00			`d_ptr->params.use_mlock = params.use_mlock;`
Backend prompt dedup (#822) * Deduplicated prompt() function code 2023-06-04 08:59:24 -04:00			`#endif`
Dlopen backend 5 (#779) Major change to the backend that allows for pluggable versions of llama.cpp/ggml. This was squashed merged from dlopen_backend_5 where the history is preserved. 2023-05-31 17:04:01 -04:00			`#if LLAMA_DATE <= 230511`
			`d_ptr->params.n_parts = params.n_parts;`
Only default mlock on macOS where swap seems to be a problem. 2023-05-21 10:13:35 -04:00			`#endif`
Metal+LLama take two (#929) Support latest llama with Metal --------- Co-authored-by: Adam Treat <adam@nomic.ai> Co-authored-by: niansa/tuxifan <tuxifan@posteo.de> 2023-06-09 16:48:46 -04:00			`#ifdef GGML_USE_METAL`
			`std::cerr << "llama.cpp: using Metal" << std::endl;`
			`// metal always runs the whole model if n_gpu_layers is not 0, at least`
			`// currently`
			`d_ptr->params.n_gpu_layers = 1;`
			`#endif`
Use default params unless we override them. 2023-04-20 12:07:43 -04:00
Add llama.cpp support for loading llama based models in the gui. We now support loading both gptj derived models and llama derived models. 2023-04-15 15:57:32 -04:00			`d_ptr->ctx = llama_init_from_file(modelPath.c_str(), d_ptr->params);`
			`if (!d_ptr->ctx) {`
			`std::cerr << "LLAMA ERROR: failed to load model from " << modelPath << std::endl;`
			`return false;`
			`}`

			`d_ptr->n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());`
			`d_ptr->modelLoaded = true;`
Infinite context window through trimming. 2023-04-25 11:20:51 -04:00			`fflush(stderr);`
Add llama.cpp support for loading llama based models in the gui. We now support loading both gptj derived models and llama derived models. 2023-04-15 15:57:32 -04:00			`return true;`
			`}`

			`void LLamaModel::setThreadCount(int32_t n_threads) {`
			`d_ptr->n_threads = n_threads;`
			`}`

Dlopen backend 5 (#779) Major change to the backend that allows for pluggable versions of llama.cpp/ggml. This was squashed merged from dlopen_backend_5 where the history is preserved. 2023-05-31 17:04:01 -04:00			`int32_t LLamaModel::threadCount() const {`
Add llama.cpp support for loading llama based models in the gui. We now support loading both gptj derived models and llama derived models. 2023-04-15 15:57:32 -04:00			`return d_ptr->n_threads;`
			`}`

			`LLamaModel::~LLamaModel()`
			`{`
llmodel: skip attempting Metal if model+kvcache > 53% of system ram 2023-06-26 17:53:17 -04:00			`if(d_ptr->ctx) {`
			`llama_free(d_ptr->ctx);`
			`}`
Add llama.cpp support for loading llama based models in the gui. We now support loading both gptj derived models and llama derived models. 2023-04-15 15:57:32 -04:00			`}`

			`bool LLamaModel::isModelLoaded() const`
			`{`
			`return d_ptr->modelLoaded;`
			`}`

First attempt at providing a persistent chat list experience. Limitations: 1) Context is not restored for gpt-j models 2) When you switch between different model types in an existing chat the context and all the conversation is lost 3) The settings are not chat or conversation specific 4) The sizes of the chat persisted files are very large due to how much data the llama.cpp backend tries to persist. Need to investigate how we can shrink this. 2023-05-04 15:31:41 -04:00			`size_t LLamaModel::stateSize() const`
			`{`
			`return llama_get_state_size(d_ptr->ctx);`
			`}`

			`size_t LLamaModel::saveState(uint8_t *dest) const`
			`{`
			`return llama_copy_state_data(d_ptr->ctx, dest);`
			`}`

			`size_t LLamaModel::restoreState(const uint8_t *src)`
			`{`
Dlopen backend 5 (#779) Major change to the backend that allows for pluggable versions of llama.cpp/ggml. This was squashed merged from dlopen_backend_5 where the history is preserved. 2023-05-31 17:04:01 -04:00			`// const_cast is required, see: https://github.com/ggerganov/llama.cpp/pull/1540`
			`return llama_set_state_data(d_ptr->ctx, const_cast<uint8_t*>(src));`
First attempt at providing a persistent chat list experience. Limitations: 1) Context is not restored for gpt-j models 2) When you switch between different model types in an existing chat the context and all the conversation is lost 3) The settings are not chat or conversation specific 4) The sizes of the chat persisted files are very large due to how much data the llama.cpp backend tries to persist. Need to investigate how we can shrink this. 2023-05-04 15:31:41 -04:00			`}`

Fix up for newer models on reset context. This fixes the model from totally failing after a reset context. 2023-06-04 19:31:00 -04:00			`std::vector<LLModel::Token> LLamaModel::tokenize(PromptContext &ctx, const std::string &str) const`
Backend prompt dedup (#822) * Deduplicated prompt() function code 2023-06-04 08:59:24 -04:00			`{`
Fix up for newer models on reset context. This fixes the model from totally failing after a reset context. 2023-06-04 19:31:00 -04:00			`const bool useBOS = ctx.n_past == 0 && (ctx.tokens.empty() \|\| ctx.tokens.front() != llama_token_bos());`
Backend prompt dedup (#822) * Deduplicated prompt() function code 2023-06-04 08:59:24 -04:00			`std::vector<LLModel::Token> fres(str.size()+4);`
Fix up for newer models on reset context. This fixes the model from totally failing after a reset context. 2023-06-04 19:31:00 -04:00			`auto fres_len = llama_tokenize(d_ptr->ctx, str.c_str(), fres.data(), fres.size(), useBOS);`
Backend prompt dedup (#822) * Deduplicated prompt() function code 2023-06-04 08:59:24 -04:00			`fres.resize(fres_len);`
			`return fres;`
			`}`
Add llama.cpp support for loading llama based models in the gui. We now support loading both gptj derived models and llama derived models. 2023-04-15 15:57:32 -04:00
llmodel: change tokenToString to not use string_view (#968) fixes a definite use-after-free and likely avoids some other potential ones - std::string will convert to a std::string_view automatically but as soon as the std::string in question goes out of scope it is already freed and the string_view is pointing at freed memory - this is mostly fine if its returning a reference to the tokenizer's internal vocab table but it's, imo, too easy to return a reference to a dynamically constructed string with this as replit is doing (and unfortunately needs to do to convert the internal whitespace replacement symbol back to a space) 2023-06-13 07:14:02 -04:00			`std::string LLamaModel::tokenToString(Token id) const`
Backend prompt dedup (#822) * Deduplicated prompt() function code 2023-06-04 08:59:24 -04:00			`{`
			`return llama_token_to_str(d_ptr->ctx, id);`
			`}`
Add llama.cpp support for loading llama based models in the gui. We now support loading both gptj derived models and llama derived models. 2023-04-15 15:57:32 -04:00
Backend prompt dedup (#822) * Deduplicated prompt() function code 2023-06-04 08:59:24 -04:00			`LLModel::Token LLamaModel::sampleToken(PromptContext &promptCtx) const`
			`{`
			`const size_t n_prev_toks = std::min((size_t) promptCtx.repeat_last_n, promptCtx.tokens.size());`
			`return llama_sample_top_p_top_k(d_ptr->ctx,`
			`promptCtx.tokens.data() + promptCtx.tokens.size() - n_prev_toks,`
			`n_prev_toks, promptCtx.top_k, promptCtx.top_p, promptCtx.temp,`
			`promptCtx.repeat_penalty);`
			`}`
Add llama.cpp support for loading llama based models in the gui. We now support loading both gptj derived models and llama derived models. 2023-04-15 15:57:32 -04:00
Backend prompt dedup (#822) * Deduplicated prompt() function code 2023-06-04 08:59:24 -04:00			`bool LLamaModel::evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const`
			`{`
When recalculating context we can't erase the BOS. 2023-06-12 09:21:28 -04:00			`// When we recalculate context we could have erased the original BOS token... we need to replace it`
			`const bool useBOS = ctx.n_past == 0 && (ctx.tokens.empty() \|\| ctx.tokens.front() != llama_token_bos());`
			`if (useBOS) {`
			`std::vector<int32_t> myTokens;`
			`myTokens.push_back(llama_token_bos());`
			`myTokens.insert(myTokens.end(), tokens.begin(), tokens.end());`
			`ctx.n_past += 1;`
			`return llama_eval(d_ptr->ctx, myTokens.data(), myTokens.size(), ctx.n_past, d_ptr->n_threads) == 0;`
			`} else`
			`return llama_eval(d_ptr->ctx, tokens.data(), tokens.size(), ctx.n_past, d_ptr->n_threads) == 0;`
Backend prompt dedup (#822) * Deduplicated prompt() function code 2023-06-04 08:59:24 -04:00			`}`
Add llama.cpp support for loading llama based models in the gui. We now support loading both gptj derived models and llama derived models. 2023-04-15 15:57:32 -04:00
Backend prompt dedup (#822) * Deduplicated prompt() function code 2023-06-04 08:59:24 -04:00			`int32_t LLamaModel::contextLength() const`
			`{`
			`return llama_n_ctx(d_ptr->ctx);`
Add llama.cpp support for loading llama based models in the gui. We now support loading both gptj derived models and llama derived models. 2023-04-15 15:57:32 -04:00			`}`
Infinite context window through trimming. 2023-04-25 11:20:51 -04:00
Backend prompt dedup (#822) * Deduplicated prompt() function code 2023-06-04 08:59:24 -04:00			`const std::vector<LLModel::Token> &LLamaModel::endTokens() const`
Infinite context window through trimming. 2023-04-25 11:20:51 -04:00			`{`
Backend prompt dedup (#822) * Deduplicated prompt() function code 2023-06-04 08:59:24 -04:00			`static const std::vector<LLModel::Token> fres = {llama_token_eos()};`
			`return fres;`
Infinite context window through trimming. 2023-04-25 11:20:51 -04:00			`}`
Dlopen backend 5 (#779) Major change to the backend that allows for pluggable versions of llama.cpp/ggml. This was squashed merged from dlopen_backend_5 where the history is preserved. 2023-05-31 17:04:01 -04:00
			`#if defined(_WIN32)`
			`#define DLL_EXPORT __declspec(dllexport)`
			`#else`
			`#define DLL_EXPORT __attribute__ ((visibility ("default")))`
			`#endif`

			`extern "C" {`
			`DLL_EXPORT bool is_g4a_backend_model_implementation() {`
			`return true;`
			`}`

			`DLL_EXPORT const char *get_model_type() {`
			`return modelType_;`
			`}`

			`DLL_EXPORT const char *get_build_variant() {`
			`return GGML_BUILD_VARIANT;`
			`}`

			`DLL_EXPORT bool magic_match(std::istream& f) {`
			`// Check magic`
			`uint32_t magic = 0;`
			`f.read(reinterpret_cast<char*>(&magic), sizeof(magic));`
			`if (magic != 0x67676a74) return false;`
			`// Check version`
			`uint32_t version = 0;`
			`f.read(reinterpret_cast<char*>(&version), sizeof(version));`
Metal+LLama take two (#929) Support latest llama with Metal --------- Co-authored-by: Adam Treat <adam@nomic.ai> Co-authored-by: niansa/tuxifan <tuxifan@posteo.de> 2023-06-09 16:48:46 -04:00			`if (!(version LLAMA_VERSIONS)) {`
			`return false;`
			`}`
add Falcon 7B model Tested with https://huggingface.co/TheBloke/falcon-7b-instruct-GGML/blob/main/falcon7b-instruct.ggmlv3.q4_0.bin 2023-06-26 22:53:57 -04:00			`llama_file_hparams hparams;`
			`f.read(reinterpret_cast<char*>(&hparams), sizeof(hparams));`
			`if (!(hparams.n_vocab >= 32000 && hparams.n_vocab <= 32100)) {`
			`return false; // not a llama.`
			`}`
Metal+LLama take two (#929) Support latest llama with Metal --------- Co-authored-by: Adam Treat <adam@nomic.ai> Co-authored-by: niansa/tuxifan <tuxifan@posteo.de> 2023-06-09 16:48:46 -04:00			`#ifdef GGML_USE_METAL`
			`// Check quant supported on metal`
			`// skip fields`
add Falcon 7B model Tested with https://huggingface.co/TheBloke/falcon-7b-instruct-GGML/blob/main/falcon7b-instruct.ggmlv3.q4_0.bin 2023-06-26 22:53:57 -04:00			`switch(hparams.ftype) {`
Metal+LLama take two (#929) Support latest llama with Metal --------- Co-authored-by: Adam Treat <adam@nomic.ai> Co-authored-by: niansa/tuxifan <tuxifan@posteo.de> 2023-06-09 16:48:46 -04:00			`// currently supported on Metal https://github.com/ggerganov/llama.cpp/blob/ae9663f1887513e152839e91f61c513075a19422/ggml-metal.m#L51-L55`
			`case LLAMA_FTYPE_MOSTLY_F16:`
			`case LLAMA_FTYPE_MOSTLY_Q2_K:`
			`case LLAMA_FTYPE_MOSTLY_Q4_0:`
			`case LLAMA_FTYPE_MOSTLY_Q6_K:`
			`case LLAMA_FTYPE_MOSTLY_Q4_K_S:`
			`case LLAMA_FTYPE_MOSTLY_Q4_K_M:`
			`return true;`
			`default: // unsupported quant-type for Metal`
			`return false;`
			`}`
			`#endif`
			`return true;`
Dlopen backend 5 (#779) Major change to the backend that allows for pluggable versions of llama.cpp/ggml. This was squashed merged from dlopen_backend_5 where the history is preserved. 2023-05-31 17:04:01 -04:00			`}`

			`DLL_EXPORT LLModel *construct() {`
			`return new LLamaModel;`
			`}`
			`}`