backend: rebase llama.cpp on upstream as of Sep 26th (#2998)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
parent 8bd937eb68
commit f9d6be8afb
@@ -1 +1 @@
-Subproject commit ced74fbad4b258507f3ec06e77eec9445583511a
+Subproject commit b3b5c0571eda3065035a7f25f7b84640b159d821
@@ -7,6 +7,7 @@
 #include <cstdint>
 #include <functional>
 #include <optional>
+#include <span>
 #include <stdexcept>
 #include <string>
 #include <string_view>
@@ -149,9 +150,9 @@ public:
     virtual bool isEmbeddingModel(const std::string &modelPath) const { (void)modelPath; return false; }
     virtual bool isModelLoaded() const = 0;
     virtual size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) = 0;
-    virtual size_t stateSize() const { return 0; }
-    virtual size_t saveState(uint8_t *dest) const { (void)dest; return 0; }
-    virtual size_t restoreState(const uint8_t *src) { (void)src; return 0; }
+    virtual size_t stateSize() const = 0;
+    virtual size_t saveState(std::span<uint8_t> dest) const = 0;
+    virtual size_t restoreState(std::span<const uint8_t> src) = 0;

     // This method requires the model to return true from supportsCompletion otherwise it will throw
     // an error
@@ -215,7 +216,8 @@ protected:
     virtual std::vector<Token> tokenize(PromptContext &ctx, std::string_view str, bool special = false) = 0;
     virtual bool isSpecialToken(Token id) const = 0;
     virtual std::string tokenToString(Token id) const = 0;
-    virtual Token sampleToken(PromptContext &ctx) const = 0;
+    virtual void initSampler(PromptContext &ctx) = 0;
+    virtual Token sampleToken() const = 0;
     virtual bool evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const = 0;
     virtual void shiftContext(PromptContext &promptCtx) = 0;
     virtual int32_t contextLength() const = 0;
@@ -148,18 +148,20 @@ uint64_t llmodel_get_state_size(llmodel_model model);
  * NOTE: This state data is specific to the type of model you have created.
  * @param model A pointer to the llmodel_model instance.
  * @param dest A pointer to the destination.
- * @return the number of bytes copied
+ * @param size The size of the destination buffer.
+ * @return the number of bytes copied, or zero on error.
  */
-uint64_t llmodel_save_state_data(llmodel_model model, uint8_t *dest);
+uint64_t llmodel_save_state_data(llmodel_model model, uint8_t *dest, uint64_t size);

 /**
  * Restores the internal state of the model using data from the specified address.
  * NOTE: This state data is specific to the type of model you have created.
  * @param model A pointer to the llmodel_model instance.
- * @param src A pointer to the src.
- * @return the number of bytes read
+ * @param src A pointer to the state data.
+ * @param size The size of the source data.
+ * @return The number of bytes read, or zero on error.
  */
-uint64_t llmodel_restore_state_data(llmodel_model model, const uint8_t *src);
+uint64_t llmodel_restore_state_data(llmodel_model model, const uint8_t *src, uint64_t size);

 /**
  * Generate a response using the model.
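For callers of the C API, the intended pattern is to query the state size first and pass the buffer length through the new size argument; a minimal caller-side sketch under that assumption (the helper names and error handling are illustrative, not part of this header):

#include "llmodel_c.h"

#include <cstdint>
#include <vector>

// Illustrative helper: snapshot a loaded model's state into a byte vector.
static std::vector<uint8_t> snapshot_state(llmodel_model model)
{
    std::vector<uint8_t> buf(llmodel_get_state_size(model));
    // The size argument lets the backend bounds-check the copy; zero signals an error.
    if (llmodel_save_state_data(model, buf.data(), buf.size()) == 0)
        buf.clear();
    return buf;
}

// Illustrative helper: restore a previously captured snapshot.
static bool restore_snapshot(llmodel_model model, const std::vector<uint8_t> &buf)
{
    return llmodel_restore_state_data(model, buf.data(), buf.size()) != 0;
}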
@@ -978,10 +978,13 @@ function(include_ggml SUFFIX)

     add_library(llama${SUFFIX} STATIC
                 ${DIRECTORY}/include/llama.h
+                ${DIRECTORY}/src/llama-grammar.cpp
+                ${DIRECTORY}/src/llama-sampling.cpp
+                ${DIRECTORY}/src/llama-vocab.cpp
                 ${DIRECTORY}/src/llama.cpp
-                ${DIRECTORY}/src/unicode.h
-                ${DIRECTORY}/src/unicode.cpp
                 ${DIRECTORY}/src/unicode-data.cpp
+                ${DIRECTORY}/src/unicode.cpp
+                ${DIRECTORY}/src/unicode.h
                 )

     target_include_directories(llama${SUFFIX} PUBLIC ${DIRECTORY}/include ${DIRECTORY}/ggml/include)
@@ -2,6 +2,7 @@
 #include "llamamodel_impl.h"

+#include "llmodel.h"
 #include "utils.h"

 #include <ggml.h>
 #include <llama.h>
@@ -103,26 +104,34 @@ static bool llama_verbose()
     return var && *var;
 }

-static void llama_log_callback(enum ggml_log_level level, const char *text, void *userdata)
+static void llama_log_callback(ggml_log_level level, const char *text, void *userdata, bool warn)
 {
     (void)userdata;
-    if (llama_verbose() || level <= GGML_LOG_LEVEL_ERROR) {
-        fputs(text, stderr);
-    }
-}

-#ifdef GGML_USE_CUDA
-static void cuda_log_callback(enum ggml_log_level level, const char *text, void *userdata)
-{
-    (void)userdata;
-    if (llama_verbose() || level <= GGML_LOG_LEVEL_WARN) {
-        fputs(text, stderr);
+    static ggml_log_level lastlevel = GGML_LOG_LEVEL_NONE;
+    if (!llama_verbose()) {
+        auto efflevel = level == GGML_LOG_LEVEL_CONT ? lastlevel : level;
+        lastlevel = efflevel;
+        switch (efflevel) {
+        case GGML_LOG_LEVEL_CONT:
+            UNREACHABLE();
+            break;
+        case GGML_LOG_LEVEL_WARN:
+            if (warn) break;
+            [[fallthrough]];
+        case GGML_LOG_LEVEL_NONE: // not used?
+        case GGML_LOG_LEVEL_INFO:
+        case GGML_LOG_LEVEL_DEBUG:
+            return; // suppress
+        case GGML_LOG_LEVEL_ERROR:
+            ;
+        }
     }
+
+    fputs(text, stderr);
 }
-#endif

 struct gpt_params {
     int32_t seed = -1; // RNG seed
     int32_t n_keep = 0; // number of tokens to keep from initial prompt

     // sampling parameters
@@ -137,44 +146,6 @@ struct gpt_params {
     bool use_mlock = false; // use mlock to keep model in memory
 };

-static llama_token llama_sample_top_p_top_k(
-        llama_context *ctx,
-        const llama_token *last_n_tokens_data,
-        int last_n_tokens_size,
-        int top_k,
-        float top_p,
-        float min_p,
-        float temp,
-        float repeat_penalty) {
-    auto logits = llama_get_logits_ith(ctx, -1);
-    auto n_vocab = llama_n_vocab(llama_get_model(ctx));
-    // Populate initial list of all candidates
-    std::vector<llama_token_data> candidates;
-    candidates.reserve(n_vocab);
-    for (int token_id = 0; token_id < n_vocab; token_id++) {
-        candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
-    }
-    llama_token_data_array candidates_p = {candidates.data(), candidates.size(), false};
-    // Sample repeat penalty
-    llama_sample_repetition_penalties(nullptr, &candidates_p, last_n_tokens_data, last_n_tokens_size, repeat_penalty, 0.0f, 0.0f);
-
-    llama_token id;
-    if (temp == 0.0) {
-        // greedy sampling, no probs
-        id = llama_sample_token_greedy(ctx, &candidates_p);
-    } else {
-        // temperature sampling
-        llama_sample_top_k(ctx, &candidates_p, top_k, 1);
-        llama_sample_tail_free(ctx, &candidates_p, 1.0f, 1);
-        llama_sample_typical(ctx, &candidates_p, 1.0f, 1);
-        llama_sample_top_p(ctx, &candidates_p, top_p, 1);
-        llama_sample_min_p(ctx, &candidates_p, min_p, 1);
-        llama_sample_temp(ctx, &candidates_p, temp);
-        id = llama_sample_token(ctx, &candidates_p);
-    }
-    return id;
-}
-
 const char *get_arch_name(gguf_context *ctx_gguf)
 {
     const int kid = gguf_find_key(ctx_gguf, "general.architecture");
@@ -241,21 +212,26 @@ cleanup:
 }

 struct LLamaPrivate {
-    const std::string modelPath;
-    bool modelLoaded = false;
-    int device = -1;
-    std::string deviceName;
-    llama_model *model = nullptr;
-    llama_context *ctx = nullptr;
-    llama_model_params model_params;
-    llama_context_params ctx_params;
-    int64_t n_threads = 0;
-    std::vector<LLModel::Token> end_tokens;
-    const char *backend_name = nullptr;
+    bool modelLoaded = false;
+    int device = -1;
+    std::string deviceName;
+    int64_t n_threads = 0;
+    std::vector<LLModel::Token> end_tokens;
+    const char *backend_name = nullptr;
+
+    llama_model *model = nullptr;
+    llama_context *ctx = nullptr;
+    llama_model_params model_params;
+    llama_context_params ctx_params;
+    llama_sampler *sampler_chain;
 };

 LLamaModel::LLamaModel()
-    : d_ptr(new LLamaPrivate) {}
+    : d_ptr(std::make_unique<LLamaPrivate>())
+{
+    auto sparams = llama_sampler_chain_default_params();
+    d_ptr->sampler_chain = llama_sampler_chain_init(sparams);
+}

 // default hparams (LLaMA 7B)
 struct llama_file_hparams {
@@ -444,10 +420,9 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
         }
     }

-    d_ptr->ctx_params.n_ctx   = n_ctx;
-    d_ptr->ctx_params.seed    = params.seed;
-    d_ptr->ctx_params.type_k  = params.kv_type;
-    d_ptr->ctx_params.type_v  = params.kv_type;
+    d_ptr->ctx_params.n_ctx  = n_ctx;
+    d_ptr->ctx_params.type_k = params.kv_type;
+    d_ptr->ctx_params.type_v = params.kv_type;

     // The new batch API provides space for n_vocab*n_tokens logits. Tell llama.cpp early
     // that we want this many logits so the state serializes consistently.
@@ -513,6 +488,7 @@ LLamaModel::~LLamaModel()
         llama_free(d_ptr->ctx);
     }
     llama_free_model(d_ptr->model);
+    llama_sampler_free(d_ptr->sampler_chain);
 }

 bool LLamaModel::isModelLoaded() const
@@ -522,18 +498,17 @@ bool LLamaModel::isModelLoaded() const

 size_t LLamaModel::stateSize() const
 {
-    return llama_get_state_size(d_ptr->ctx);
+    return llama_state_get_size(d_ptr->ctx);
 }

-size_t LLamaModel::saveState(uint8_t *dest) const
+size_t LLamaModel::saveState(std::span<uint8_t> dest) const
 {
-    return llama_copy_state_data(d_ptr->ctx, dest);
+    return llama_state_get_data(d_ptr->ctx, dest.data(), dest.size());
 }

-size_t LLamaModel::restoreState(const uint8_t *src)
+size_t LLamaModel::restoreState(std::span<const uint8_t> src)
 {
-    // const_cast is required, see: https://github.com/ggerganov/llama.cpp/pull/1540
-    return llama_set_state_data(d_ptr->ctx, const_cast<uint8_t*>(src));
+    return llama_state_set_data(d_ptr->ctx, src.data(), src.size());
 }

 std::vector<LLModel::Token> LLamaModel::tokenize(PromptContext &ctx, std::string_view str, bool special)
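From the caller's side, the std::span overloads are meant to pair with stateSize(); a short sketch under that assumption (the helper function and its error handling are illustrative):

#include "llmodel.h"

#include <cstdint>
#include <vector>

// Illustrative round trip: size the buffer from stateSize(), then save and restore.
static bool roundtrip_state(LLModel &model)
{
    std::vector<uint8_t> state(model.stateSize());
    if (model.saveState(state) == 0)       // the vector converts to std::span<uint8_t>
        return false;                      // zero now signals failure instead of truncating
    return model.restoreState(state) != 0; // and to std::span<const uint8_t> here
}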
@@ -573,13 +548,50 @@ std::string LLamaModel::tokenToString(Token id) const
     return std::string(result.data(), result.size());
 }

-LLModel::Token LLamaModel::sampleToken(PromptContext &promptCtx) const
+void LLamaModel::initSampler(PromptContext &promptCtx)
 {
-    const size_t n_prev_toks = std::min((size_t) promptCtx.repeat_last_n, promptCtx.tokens.size());
-    return llama_sample_top_p_top_k(d_ptr->ctx,
-        promptCtx.tokens.data() + promptCtx.tokens.size() - n_prev_toks,
-        n_prev_toks, promptCtx.top_k, promptCtx.top_p, promptCtx.min_p, promptCtx.temp,
-        promptCtx.repeat_penalty);
+    auto *model = d_ptr->model;
+    auto *chain = d_ptr->sampler_chain;
+
+    // clear sampler chain
+    for (int i = llama_sampler_chain_n(chain) - 1; i >= 0; i--) {
+        auto *smpl = llama_sampler_chain_remove(chain, i);
+        llama_sampler_free(smpl);
+    }
+
+    // build new chain
+    llama_sampler_chain_add(chain,
+        llama_sampler_init_penalties(
+            llama_n_vocab(model),
+            llama_token_eos(model),
+            llama_token_nl(model),
+            promptCtx.repeat_last_n,
+            promptCtx.repeat_penalty,
+            // TODO(jared): consider making the below configurable
+            /*penalty_freq*/ 0.0f,
+            /*penalty_present*/ 0.0f,
+            /*penalize_nl*/ true,
+            /*ignore_eos*/ false
+        )
+    );
+    if (promptCtx.temp == 0.0f) {
+        llama_sampler_chain_add(chain, llama_sampler_init_greedy());
+    } else {
+        struct llama_sampler *samplers[] = {
+            llama_sampler_init_top_k(promptCtx.top_k),
+            llama_sampler_init_top_p(promptCtx.top_p, 1),
+            llama_sampler_init_min_p(promptCtx.min_p, 1),
+            llama_sampler_init_temp(promptCtx.temp),
+            llama_sampler_init_dist(LLAMA_DEFAULT_SEED)
+        };
+        for (auto *smpl : samplers)
+            llama_sampler_chain_add(chain, smpl);
+    }
 }
+
+LLModel::Token LLamaModel::sampleToken() const
+{
+    return llama_sampler_sample(d_ptr->sampler_chain, d_ptr->ctx, -1);
+}

 bool LLamaModel::evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const
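For reference, the upstream sampler-chain API that this hunk adopts reduces to a small standalone pattern; a sketch with illustrative parameter values (a real caller would build the chain once and reuse it across tokens):

#include <llama.h>

// Build a chain of samplers, draw one token from the context's last logits, then free it.
static llama_token sample_once(llama_context *ctx)
{
    llama_sampler *chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
    llama_sampler_chain_add(chain, llama_sampler_init_top_k(40));
    llama_sampler_chain_add(chain, llama_sampler_init_temp(0.8f));
    llama_sampler_chain_add(chain, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));

    // -1 selects the logits of the last token evaluated in the context.
    llama_token tok = llama_sampler_sample(chain, ctx, -1);

    llama_sampler_free(chain); // also frees every sampler added to the chain
    return tok;
}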
@@ -1227,9 +1239,9 @@ DLL_EXPORT bool is_arch_supported(const char *arch)

 DLL_EXPORT LLModel *construct()
 {
-    llama_log_set(llama_log_callback, nullptr);
+    llama_log_set([](auto l, auto t, auto u) { llama_log_callback(l, t, u, false); }, nullptr);
 #ifdef GGML_USE_CUDA
-    ggml_backend_cuda_log_set_callback(cuda_log_callback, nullptr);
+    ggml_backend_cuda_log_set_callback([](auto l, auto t, auto u) { llama_log_callback(l, t, u, true); }, nullptr);
 #endif
     return new LLamaModel;
 }
@@ -7,6 +7,7 @@
 #include "llmodel.h"

 #include <memory>
+#include <span>
 #include <string>
 #include <string_view>
 #include <vector>
@@ -27,8 +28,8 @@ public:
     bool isModelLoaded() const override;
     size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) override;
     size_t stateSize() const override;
-    size_t saveState(uint8_t *dest) const override;
-    size_t restoreState(const uint8_t *src) override;
+    size_t saveState(std::span<uint8_t> dest) const override;
+    size_t restoreState(std::span<const uint8_t> src) override;
     void setThreadCount(int32_t n_threads) override;
     int32_t threadCount() const override;
     std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired = 0) const override;
@@ -56,7 +57,8 @@ protected:
     std::vector<Token> tokenize(PromptContext &ctx, std::string_view str, bool special) override;
     bool isSpecialToken(Token id) const override;
     std::string tokenToString(Token id) const override;
-    Token sampleToken(PromptContext &ctx) const override;
+    void initSampler(PromptContext &ctx) override;
+    Token sampleToken() const override;
     bool evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const override;
     void shiftContext(PromptContext &promptCtx) override;
     int32_t contextLength() const override;
@@ -91,16 +91,16 @@ uint64_t llmodel_get_state_size(llmodel_model model)
     return wrapper->llModel->stateSize();
 }

-uint64_t llmodel_save_state_data(llmodel_model model, uint8_t *dest)
+uint64_t llmodel_save_state_data(llmodel_model model, uint8_t *dest, uint64_t size)
 {
     auto *wrapper = static_cast<LLModelWrapper *>(model);
-    return wrapper->llModel->saveState(dest);
+    return wrapper->llModel->saveState({dest, size_t(size)});
 }

-uint64_t llmodel_restore_state_data(llmodel_model model, const uint8_t *src)
+uint64_t llmodel_restore_state_data(llmodel_model model, const uint8_t *src, uint64_t size)
 {
     auto *wrapper = static_cast<LLModelWrapper *>(model);
-    return wrapper->llModel->restoreState(src);
+    return wrapper->llModel->restoreState({src, size_t(size)});
 }

 void llmodel_prompt(llmodel_model model, const char *prompt,
@@ -244,6 +244,8 @@ void LLModel::generateResponse(std::function<bool(int32_t, const std::string&)>
         return;
     }

+    initSampler(promptCtx);
+
     std::string cachedResponse;
     std::vector<Token> cachedTokens;
     int n_predicted = 0;
@@ -251,7 +253,7 @@
     // Predict next tokens
     for (bool stop = false; !stop;) {
         // Sample next token
-        std::optional<Token> new_tok = sampleToken(promptCtx);
+        std::optional<Token> new_tok = sampleToken();
         std::string new_piece = tokenToString(new_tok.value());
         cachedTokens.push_back(new_tok.value());
         cachedResponse += new_piece;
@@ -1,49 +0,0 @@
-#pragma once
-
-#include <ggml.h>
-
-#include <cstddef>
-#include <cstdint>
-#include <vector>
-
-struct llm_buffer {
-    uint8_t * addr = NULL;
-    size_t size = 0;
-
-    void resize(size_t size) {
-        delete[] addr;
-        addr = new uint8_t[size];
-        this->size = size;
-    }
-
-    ~llm_buffer() {
-        delete[] addr;
-    }
-};
-
-struct llm_kv_cache {
-    struct ggml_tensor * k;
-    struct ggml_tensor * v;
-
-    struct ggml_context * ctx = NULL;
-
-    llm_buffer buf;
-
-    int n; // number of tokens currently in the cache
-
-    ~llm_kv_cache() {
-        if (ctx) {
-            ggml_free(ctx);
-        }
-    }
-};
-
-inline void ggml_graph_compute_g4a(llm_buffer& buf, ggml_cgraph * graph, int n_threads)
-{
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
-    if (plan.work_size > 0) {
-        buf.resize(plan.work_size);
-        plan.work_data = buf.addr;
-    }
-    ggml_graph_compute(graph, &plan);
-}
@@ -1,339 +0,0 @@
-#include "utils.h"
-
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-#include <fstream>
-#include <iterator>
-#include <regex>
-#include <utility>
-
-void replace(std::string & str, const std::string & needle, const std::string & replacement)
-{
-    size_t pos = 0;
-    while ((pos = str.find(needle, pos)) != std::string::npos) {
-        str.replace(pos, needle.length(), replacement);
-        pos += replacement.length();
-    }
-}
-
-std::map<std::string, int32_t> json_parse(const std::string & fname)
-{
-    std::map<std::string, int32_t> result;
-
-    // read file into string
-    std::string json;
-    {
-        std::ifstream ifs(fname);
-        if (!ifs) {
-            fprintf(stderr, "Failed to open %s\n", fname.c_str());
-            exit(1);
-        }
-
-        json = std::string((std::istreambuf_iterator<char>(ifs)),
-                           (std::istreambuf_iterator<char>()));
-    }
-
-    if (json[0] != '{') {
-        return result;
-    }
-
-    // parse json
-    {
-        bool has_key  = false;
-        bool in_token = false;
-
-        std::string str_key = "";
-        std::string str_val = "";
-
-        int n = json.size();
-        for (int i = 1; i < n; ++i) {
-            if (!in_token) {
-                if (json[i] == ' ') continue;
-                if (json[i] == '"') {
-                    in_token = true;
-                    continue;
-                }
-            } else {
-                if (json[i] == '\\' && i+1 < n) {
-                    if (has_key == false) {
-                        str_key += json[i];
-                    } else {
-                        str_val += json[i];
-                    }
-                    ++i;
-                } else if (json[i] == '"') {
-                    if (has_key == false) {
-                        has_key = true;
-                        ++i;
-                        while (json[i] == ' ') ++i;
-                        ++i; // :
-                        while (json[i] == ' ') ++i;
-                        if (json[i] != '\"') {
-                            while (json[i] != ',' && json[i] != '}') {
-                                str_val += json[i++];
-                            }
-                            has_key = false;
-                        } else {
-                            in_token = true;
-                            continue;
-                        }
-                    } else {
-                        has_key = false;
-                    }
-
-                    ::replace(str_key, "\\u0120", " " ); // \u0120 -> space
-                    ::replace(str_key, "\\u010a", "\n"); // \u010a -> new line
-                    ::replace(str_key, "\\\"", "\"");    // \\\"   -> "
-
-                    try {
-                        result[str_key] = std::stoi(str_val);
-                    } catch (...) {
-                        //fprintf(stderr, "%s: ignoring key '%s' with value '%s'\n", fname.c_str(), str_key.c_str(), str_val.c_str());
-
-                    }
-                    str_key = "";
-                    str_val = "";
-                    in_token = false;
-                    continue;
-                }
-                if (has_key == false) {
-                    str_key += json[i];
-                } else {
-                    str_val += json[i];
-                }
-            }
-        }
-    }
-
-    return result;
-}
-
-std::vector<gpt_vocab::id> gpt_tokenize_inner(const gpt_vocab & vocab, const std::string & text)
-{
-    std::vector<std::string> words;
-
-    // first split the text into words
-    {
-        std::string str = text;
-        std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
-
-        std::regex re(pat);
-        std::smatch m;
-
-        while (std::regex_search(str, m, re)) {
-            for (auto x : m) {
-                words.push_back(x);
-            }
-            str = m.suffix();
-        }
-    }
-
-    // find the longest tokens that form the words:
-    std::vector<gpt_vocab::id> tokens;
-    for (const auto & word : words) {
-        if (word.size() == 0) continue;
-
-        int i = 0;
-        int n = word.size();
-        while (i < n) {
-            int j = n;
-            while (j > i) {
-                auto it = vocab.token_to_id.find(word.substr(i, j-i));
-                if (it != vocab.token_to_id.end()) {
-                    tokens.push_back(it->second);
-                    i = j;
-                    break;
-                }
-                --j;
-            }
-            if (i == n) {
-                break;
-            }
-            if (j == i) {
-                auto sub = word.substr(i, 1);
-                if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) {
-                    tokens.push_back(vocab.token_to_id.at(sub));
-                } else {
-                    fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data());
-                }
-                ++i;
-            }
-        }
-    }
-
-    return tokens;
-}
-
-std::string regex_escape(const std::string &s)
-{
-    static const std::regex metacharacters(R"([\.\^\$\-\+\(\)\[\]\{\}\|\?\*])");
-    return std::regex_replace(s, metacharacters, "\\$&");
-}
-
-std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text)
-{
-    // Generate the subpattern from the special_tokens vector if it's not empty
-    if (!vocab.special_tokens.empty()) {
-        std::vector<gpt_vocab::id> out;
-        std::vector<std::string> chunks;
-        std::string str = text;
-        std::string special_tokens_subpattern;
-        for (const auto &token : vocab.special_tokens) {
-            if (!special_tokens_subpattern.empty()) {
-                special_tokens_subpattern += "|";
-            }
-            special_tokens_subpattern += regex_escape(token);
-        }
-        std::regex re(special_tokens_subpattern);
-        std::smatch m;
-        while (std::regex_search(str, m, re)) {
-            auto tok = vocab.token_to_id.find(m.str());
-            if (tok != vocab.token_to_id.end()) {
-                auto tokid = tok->second;
-                auto pfxtoks = gpt_tokenize_inner(vocab, m.prefix());
-                out.insert(out.end(), pfxtoks.begin(), pfxtoks.end());
-                out.push_back(tokid);
-                str = m.suffix();
-            }
-        }
-        if (!str.empty()) {
-            auto tokrest = gpt_tokenize_inner(vocab, str);
-            out.insert(out.end(), tokrest.begin(), tokrest.end());
-        }
-        return out;
-    } else {
-        return gpt_tokenize_inner(vocab, text);
-    }
-}
-
-
-bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab)
-{
-    printf("%s: loading vocab from '%s'\n", __func__, fname.c_str());
-
-    vocab.token_to_id = ::json_parse(fname);
-
-    for (const auto & kv : vocab.token_to_id) {
-        vocab.id_to_token[kv.second] = kv.first;
-    }
-
-    printf("%s: vocab size = %d\n", __func__, (int) vocab.token_to_id.size());
-
-    // print the vocabulary
-    //for (auto kv : vocab.token_to_id) {
-    //    printf("'%s' -> %d\n", kv.first.data(), kv.second);
-    //}
-
-    return true;
-}
-
-gpt_vocab::id gpt_sample_top_k_top_p(
-        const size_t actualVocabSize,
-        const int32_t * last_n_tokens_data,
-        int last_n_tokens_size,
-        const std::vector<float> logits,
-        int top_k,
-        double top_p,
-        double temp,
-        float repeat_penalty,
-        std::mt19937 & rng) {
-    int n_logits = actualVocabSize;
-
-    const auto last_n_tokens = std::vector<int32_t>(last_n_tokens_data, last_n_tokens_data + last_n_tokens_size);
-    const auto * plogits = logits.data();
-
-    if (temp <= 0) {
-        // select the token with the highest logit directly
-        float max_logit = plogits[0];
-        gpt_vocab::id max_id = 0;
-
-        for (int i = 1; i < n_logits; ++i) {
-            if (plogits[i] > max_logit) {
-                max_logit = plogits[i];
-                max_id = i;
-            }
-        }
-        return max_id;
-    }
-    std::vector<std::pair<double, gpt_vocab::id>> logits_id;
-    logits_id.reserve(n_logits);
-
-    {
-        const float scale = 1.0f/temp;
-        for (int i = 0; i < n_logits; ++i) {
-            // repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858)
-            // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main
-            if (std::find(last_n_tokens.begin(), last_n_tokens.end(), i) != last_n_tokens.end()) {
-                // if score < 0 then repetition penalty has to multiplied to reduce the previous token probability
-                if (plogits[i] < 0.0f) {
-                    logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i));
-                } else {
-                    logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i));
-                }
-            } else {
-                logits_id.push_back(std::make_pair(plogits[i]*scale, i));
-            }
-        }
-    }
-
-    // find the top K tokens
-    std::partial_sort(
-            logits_id.begin(),
-            logits_id.begin() + top_k, logits_id.end(),
-            [](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
-        return a.first > b.first;
-    });
-
-    logits_id.resize(top_k);
-
-    double maxl = -INFINITY;
-    for (const auto & kv : logits_id) {
-        maxl = std::max(maxl, kv.first);
-    }
-
-    // compute probs for the top K tokens
-    std::vector<double> probs;
-    probs.reserve(logits_id.size());
-
-    double sum = 0.0;
-    for (const auto & kv : logits_id) {
-        double p = exp(kv.first - maxl);
-        probs.push_back(p);
-        sum += p;
-    }
-
-    // normalize the probs
-    for (auto & p : probs) {
-        p /= sum;
-    }
-
-    if (top_p < 1.0f) {
-        double cumsum = 0.0f;
-        for (int i = 0; i < top_k; i++) {
-            cumsum += probs[i];
-            if (cumsum >= top_p) {
-                top_k = i + 1;
-                probs.resize(top_k);
-                logits_id.resize(top_k);
-                break;
-            }
-        }
-
-        cumsum = 1.0/cumsum;
-        for (int i = 0; i < (int) probs.size(); i++) {
-            probs[i] *= cumsum;
-        }
-    }
-
-    //printf("\n");
-    //for (int i = 0; i < (int) probs.size(); i++) {
-    //    printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);
-    //}
-    //exit(0);
-
-    std::discrete_distribution<> dist(probs.begin(), probs.end());
-    int idx = dist(rng);
-
-    return logits_id[idx].second;
-}
@@ -1,101 +1,17 @@
-// Various helper functions and utilities
-
 #pragma once

-#include <algorithm>
-#include <cstddef>
-#include <cstdint>
-#include <map>
-#include <random>
-#include <string>
-#include <thread>
-#include <vector>
+#include <cassert>

-//
-// General purpose inline functions
-//
-constexpr inline unsigned long long operator ""_MiB(unsigned long long bytes)
-{
-    return bytes*1024*1024;
-}
-
-//
-// CLI argument parsing
-//
-
-struct gpt_params {
-    int32_t seed      = -1;  // RNG seed
-    int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
-    int32_t n_predict = 200; // new tokens to predict
-
-    // sampling parameters
-    int32_t top_k = 40;
-    float   top_p = 0.9f;
-    float   temp  = 0.9f;
-
-    int32_t n_batch = 8; // batch size for prompt processing
-
-    std::string model = "models/gpt-2-117M/ggml-model.bin"; // model path
-    std::string prompt;
-};
-
-bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
-
-void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
-
-std::string gpt_random_prompt(std::mt19937 & rng);
-
-//
-// Vocab utils
-//
-
-struct gpt_vocab {
-    using id    = int32_t;
-    using token = std::string;
-
-    std::map<token, id> token_to_id;
-    std::map<id, token> id_to_token;
-    std::vector<std::string> special_tokens;
-
-    void add_special_token(const std::string &token) {
-        special_tokens.push_back(token);
-    }
-};
-
-void replace(std::string & str, const std::string & needle, const std::string & replacement);
-
-// poor-man's JSON parsing
-std::map<std::string, int32_t> json_parse(const std::string & fname);
-
-// split text into tokens
-//
-// ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
-//
-// Regex (Python):
-// r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
-//
-// Regex (C++):
-// R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"
-//
-std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text);
-
-// load the tokens from encoder.json
-bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);
-
-// sample next token given probabilities for each embedding
-//
-// - consider only the top K tokens
-// - from them, consider only the top tokens with cumulative probability > P
-//
-// TODO: not sure if this implementation is correct
-//
-gpt_vocab::id gpt_sample_top_k_top_p(
-        const size_t actualVocabSize,
-        const int32_t * last_n_tokens_data,
-        int last_n_tokens_size,
-        const std::vector<float> logits,
-        int top_k,
-        double top_p,
-        double temp,
-        float repeat_penalty,
-        std::mt19937 & rng);
+
+#ifdef NDEBUG
+#   ifdef __has_builtin
+#       if __has_builtin(__builtin_unreachable)
+#           define UNREACHABLE() __builtin_unreachable()
+#       else
+#           define UNREACHABLE() do {} while (0)
+#       endif
+#   else
+#       define UNREACHABLE() do {} while (0)
+#   endif
+#else
+#   define UNREACHABLE() assert(!"Unreachable statement was reached")
+#endif
@@ -9,6 +9,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
 ### Added
 - Warn on Windows if the Microsoft Visual C++ runtime libraries are not found ([#2920](https://github.com/nomic-ai/gpt4all/pull/2920))

+### Changed
+- Rebase llama.cpp on latest upstream as of September 26th ([#2998](https://github.com/nomic-ai/gpt4all/pull/2998))
+
 ## [2.8.2] - 2024-08-14

 ### Fixed
@@ -9,6 +9,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
 ### Added
 - Add bm25 hybrid search to localdocs ([#2969](https://github.com/nomic-ai/gpt4all/pull/2969))

+### Changed
+- Rebase llama.cpp on latest upstream as of September 26th ([#2998](https://github.com/nomic-ai/gpt4all/pull/2998))
+
 ### Fixed
 - Fix a crash when attempting to continue a chat loaded from disk ([#2995](https://github.com/nomic-ai/gpt4all/pull/2995))
 - Fix the local server rejecting min\_p/top\_p less than 1 ([#2996](https://github.com/nomic-ai/gpt4all/pull/2996))
@@ -71,19 +71,19 @@ bool ChatAPI::isModelLoaded() const
 // All three of the state virtual functions are handled custom inside of chatllm save/restore
 size_t ChatAPI::stateSize() const
 {
-    return 0;
+    throw std::logic_error("not implemented");
 }

-size_t ChatAPI::saveState(uint8_t *dest) const
+size_t ChatAPI::saveState(std::span<uint8_t> dest) const
 {
     Q_UNUSED(dest);
-    return 0;
+    throw std::logic_error("not implemented");
 }

-size_t ChatAPI::restoreState(const uint8_t *src)
+size_t ChatAPI::restoreState(std::span<const uint8_t> src)
 {
     Q_UNUSED(src);
-    return 0;
+    throw std::logic_error("not implemented");
 }

 void ChatAPI::prompt(const std::string &prompt,
@@ -64,8 +64,8 @@ public:
     bool isModelLoaded() const override;
     size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) override;
     size_t stateSize() const override;
-    size_t saveState(uint8_t *dest) const override;
-    size_t restoreState(const uint8_t *src) override;
+    size_t saveState(std::span<uint8_t> dest) const override;
+    size_t restoreState(std::span<const uint8_t> src) override;
     void prompt(const std::string &prompt,
                 const std::string &promptTemplate,
                 std::function<bool(int32_t)> promptCallback,
@@ -118,12 +118,14 @@ protected:
         throw std::logic_error("not implemented");
     }

-    Token sampleToken(PromptContext &ctx) const override
+    void initSampler(PromptContext &ctx) override
     {
         (void)ctx;
         throw std::logic_error("not implemented");
     }

+    Token sampleToken() const override { throw std::logic_error("not implemented"); }
+
     bool evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const override
     {
         (void)ctx;
@@ -1174,7 +1174,13 @@ void ChatLLM::saveState()
 #if defined(DEBUG)
     qDebug() << "saveState" << m_llmThread.objectName() << "size:" << m_state.size();
 #endif
-    m_llModelInfo.model->saveState(static_cast<uint8_t*>(reinterpret_cast<void*>(m_state.data())));
+    bool ok = m_llModelInfo.model->saveState({reinterpret_cast<uint8_t *>(m_state.data()), size_t(m_state.size())});
+    if (!ok) {
+        // FIXME(jared): how badly does this situation break GPT4All?
+        qWarning() << "ChatLLM failed to save LLModel state";
+        m_state.clear();
+        m_state.squeeze();
+    }
 }

 void ChatLLM::restoreState()
@@ -1183,7 +1189,7 @@ void ChatLLM::restoreState()
         return;

     if (m_llModelType == LLModelType::API_) {
-        QDataStream stream(&m_state, QIODeviceBase::ReadOnly);
+        QDataStream stream(m_state);
         stream.setVersion(QDataStream::Qt_6_4);
         ChatAPI *chatAPI = static_cast<ChatAPI*>(m_llModelInfo.model.get());
         QList<QString> context;
@@ -1201,12 +1207,12 @@ void ChatLLM::restoreState()
     if (m_state.isEmpty())
         return;

-    if (m_llModelInfo.model->stateSize() == m_state.size()) {
-        m_llModelInfo.model->restoreState(static_cast<const uint8_t*>(reinterpret_cast<void*>(m_state.data())));
+    size_t bytesRead = m_llModelInfo.model->restoreState({reinterpret_cast<uint8_t *>(m_state.data()), size_t(m_state.size())});
+    if (bytesRead) {
         m_processedSystemPrompt = true;
         m_pristineLoadedState = true;
     } else {
-        qWarning() << "restoring state from text because" << m_llModelInfo.model->stateSize() << "!=" << m_state.size();
+        qWarning() << "restoring state from text because of error reading state (mismatch or corrupt data)";
         m_restoreStateFromText = true;
     }
