mirror of https://github.com/nomic-ai/gpt4all.git
synced 2024-10-01 01:06:10 -04:00

commit 0bc2274869 (parent 33c22be2aa)

bump llama.cpp version + needed fixes for that
@@ -100,6 +100,7 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
     add_library(replit-mainline-${BUILD_VARIANT} SHARED
         replit.cpp utils.h utils.cpp llmodel_shared.cpp llmodel_shared.h)
+    target_compile_definitions(replit-mainline-${BUILD_VARIANT} PRIVATE LLAMA_VERSIONS=>=3 LLAMA_DATE=999999)
     prepare_target(replit-mainline llama-mainline)

     if (NOT LLAMA_METAL)
@@ -120,6 +121,7 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
     add_library(falcon-${BUILD_VARIANT} SHARED
         falcon.cpp utils.h utils.cpp llmodel_shared.cpp llmodel_shared.h)
+    target_compile_definitions(falcon-${BUILD_VARIANT} PRIVATE LLAMA_VERSIONS=>=3 LLAMA_DATE=999999)
     prepare_target(falcon llama-mainline)

     add_library(mpt-${BUILD_VARIANT} SHARED
@@ -128,6 +130,7 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
     add_library(bert-${BUILD_VARIANT} SHARED
         bert.cpp utils.h utils.cpp llmodel_shared.cpp llmodel_shared.h)
+    target_compile_definitions(bert-${BUILD_VARIANT} PRIVATE LLAMA_VERSIONS=>=3 LLAMA_DATE=999999)
     prepare_target(bert llama-mainline)

     add_library(starcoder-${BUILD_VARIANT} SHARED
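
These per-variant definitions pin the mainline targets to the newest llama.cpp (LLAMA_DATE=999999) and let the shared sources pick the matching ggml API at compile time; the concrete guard appears in the llmodel_shared.h hunk below (#if LLAMA_DATE >= 230519). A minimal sketch of how such a guard reads — the printed strings are illustrative only:

    // Sketch: the defines from CMake drive compile-time API selection in the
    // shared backend sources. LLAMA_DATE=999999 marks the mainline build, so
    // dated guards resolve to the newer ggml API.
    #include <cstdio>

    int main() {
    #if defined(LLAMA_DATE) && LLAMA_DATE >= 230519
        std::printf("newer ggml: plan-based graph compute (see llmodel_shared.h)\n");
    #else
        std::printf("older ggml: gf.n_threads + ggml_graph_compute(ctx0, &gf)\n");
    #endif
        return 0;
    }
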
@@ -1,5 +1,6 @@
 #define BERT_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
 #include "bert_impl.h"
+#include "llmodel_shared.h"
 #include "ggml.h"

 #include <cassert>
@@ -91,22 +92,6 @@ struct bert_model
 };

 // Replacement for std::vector<uint8_t> that doesn't require zero-initialization.
-struct bert_buffer {
-    uint8_t * data = NULL;
-    size_t size = 0;
-
-    void resize(size_t size) {
-        delete[] data;
-        data = new uint8_t[size];
-        this->size = size;
-    }
-
-    ~bert_buffer() {
-        delete[] data;
-    }
-};
-
-
 struct bert_ctx
 {
     bert_model model;
@@ -115,7 +100,8 @@ struct bert_ctx
     size_t mem_per_token;
     int64_t mem_per_input;
     int32_t max_batch_n;
-    bert_buffer buf_compute;
+    llm_buffer buf_compute;
+    llm_buffer work_buf;
 };

 int32_t bert_n_embd(bert_ctx * ctx)
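
With this change bert_ctx drops its private bert_buffer in favour of the shared llm_buffer from llmodel_shared.h and gains a separate work buffer for graph execution. The rest of the diff touches the buffer only through .addr, .size and .resize(); a hypothetical sketch of that interface (not the actual llmodel_shared.h definition), for orientation:

    // Hypothetical sketch of the llm_buffer interface this commit relies on
    // (buf.addr / buf.size / buf.resize(n)); the real type lives in llmodel_shared.h.
    #include <cstddef>
    #include <cstdint>

    struct llm_buffer_sketch {
        uint8_t * addr = nullptr;
        size_t    size = 0;

        void resize(size_t new_size) {
            // like the removed bert_buffer: plain new[], no zero-initialization
            delete[] addr;
            addr = new uint8_t[new_size];
            size = new_size;
        }

        ~llm_buffer_sketch() { delete[] addr; }
    };
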
@@ -328,13 +314,12 @@ void bert_eval(

     struct ggml_init_params params = {
         .mem_size = buf_compute.size,
-        .mem_buffer = buf_compute.data,
+        .mem_buffer = buf_compute.addr,
         .no_alloc = false,
     };

     struct ggml_context *ctx0 = ggml_init(params);
     struct ggml_cgraph gf = {};
-    gf.n_threads = n_threads;

     // Embeddings. word_embeddings + token_type_embeddings + position_embeddings
     struct ggml_tensor *token_layer = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
@@ -466,7 +451,9 @@ void bert_eval(
     ggml_tensor *output = inpL;
     // run the computation
     ggml_build_forward_expand(&gf, output);
-    ggml_graph_compute(ctx0, &gf);
+    //ggml_graph_compute_g4a()
+    ggml_graph_compute_g4a(ctx->work_buf, &gf, n_threads);
+    //ggml_graph_compute(ctx0, &gf);


     // float *dat = ggml_get_data_f32(output);
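
The call site changes because the ggml bundled with the newer llama.cpp runs graphs through a plan: the thread count moves off the ggml_cgraph (hence the gf.n_threads removal above) onto a ggml_cplan, and the caller supplies any scratch memory the plan requests, here backed by the new ctx->work_buf. A self-contained sketch of that path, assuming this ggml revision:

    // Sketch of the plan-based compute path this commit adopts: build a graph,
    // ask ggml for a plan with the desired thread count, hand it work memory if
    // it asks for any, then execute. The gpt4all wrapper ggml_graph_compute_g4a
    // (added in llmodel_shared.h below) packages exactly these steps.
    #include <ggml.h>
    #include <cstdint>
    #include <vector>

    int main() {
        std::vector<uint8_t> mem(16u * 1024 * 1024);
        struct ggml_init_params params = { mem.size(), mem.data(), /*no_alloc=*/false };
        struct ggml_context * ctx0 = ggml_init(params);

        struct ggml_tensor * a   = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 8);
        struct ggml_tensor * b   = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 8);
        struct ggml_tensor * out = ggml_add(ctx0, a, b);

        struct ggml_cgraph gf = {};
        ggml_build_forward_expand(&gf, out);

        struct ggml_cplan plan = ggml_graph_plan(&gf, /*n_threads=*/4);  // threads live on the plan
        std::vector<uint8_t> work(plan.work_size);
        if (plan.work_size > 0) {
            plan.work_data = work.data();                                // caller-provided scratch
        }
        ggml_graph_compute(&gf, &plan);

        ggml_free(ctx0);
        return 0;
    }
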
@@ -633,7 +620,7 @@ struct bert_ctx * bert_load_from_file(const char *fname)
     model_mem_req += n_layer * (n_intermediate * ggml_type_sizef(GGML_TYPE_F32)); // ff_i_b
     model_mem_req += n_layer * (n_embd * ggml_type_sizef(GGML_TYPE_F32)); // ff_o_b

-    model_mem_req += (5 + 16 * n_layer) * 256; // object overhead
+    model_mem_req += (5 + 16 * n_layer) * ggml_tensor_overhead(); // object overhead

 #if defined(DEBUG_BERT)
     printf("%s: ggml ctx size = %6.2f MB\n", __func__, model_mem_req / (1024.0 * 1024.0));
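
Replacing the hard-coded 256 bytes with ggml_tensor_overhead() lets ggml report its own per-tensor bookkeeping cost instead of relying on a stale constant. A small sketch of the estimation pattern, with illustrative layer and embedding sizes:

    // Sketch: estimate a ggml context size as data bytes plus per-tensor
    // bookkeeping. The (5 + 16 * n_layer) tensor count comes from the hunk above;
    // the data term and the sizes are illustrative only.
    #include <ggml.h>
    #include <cstdio>

    int main() {
        const int n_layer = 12, n_embd = 768;   // illustrative sizes
        size_t model_mem_req = 0;
        model_mem_req += (size_t)(n_layer * n_embd * ggml_type_sizef(GGML_TYPE_F32)); // some weights
        model_mem_req += (5 + 16 * n_layer) * ggml_tensor_overhead();                 // object overhead
        std::printf("ggml ctx size = %6.2f MB\n", model_mem_req / (1024.0 * 1024.0));
        return 0;
    }
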
@@ -1,3 +1,4 @@
+#include "ggml.h"
 #define FALCON_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
 #include "falcon_impl.h"
 #include "llama.h"
@@ -64,6 +65,7 @@ struct falcon_model {
     std::map<std::string, struct ggml_tensor*> tensors;

     llm_buffer eval_buf;
+    llm_buffer work_buf;
     llm_buffer scr0_buf;
     llm_buffer scr1_buf;
 };
@@ -446,7 +448,7 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt_voca
 // - embd_w: the predicted logits for the next token
 //
 bool falcon_eval(
-        const falcon_model & model,
+        falcon_model & model,
         const int n_threads,
         const int n_past,
         const std::vector<gpt_vocab::id> & embd_inp,
@@ -473,7 +475,6 @@ bool falcon_eval(

     struct ggml_context * ctx0 = ggml_init(eval_ctx_params);
     struct ggml_cgraph gf = {};
-    gf.n_threads = n_threads;

     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
     memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
@@ -546,8 +547,8 @@ bool falcon_eval(
                 head_dim * (n_head + n_head_kv) * sizeof_wtype);

             // using mode = 2 for neox mode
-            Qcur = ggml_rope_inplace(ctx0, Qcur, n_past, head_dim, 2);
-            Kcur = ggml_rope_inplace(ctx0, Kcur, n_past, head_dim, 2);
+            Qcur = ggml_rope_inplace(ctx0, Qcur, n_past, head_dim, 2, n_ctx);
+            Kcur = ggml_rope_inplace(ctx0, Kcur, n_past, head_dim, 2, n_ctx);

             // store key and value to memory
             {
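
The updated ggml ROPE operator takes the context length as a trailing argument, so both calls gain n_ctx while keeping mode 2 for the NeoX-style rotation noted in the comment above. A minimal sketch of the new call shape, assuming the signature implied by this hunk:

    // Sketch of the signature change this hunk adapts to, assumed from the call sites:
    //   old: ggml_rope_inplace(ctx0, cur, n_past, head_dim, /*mode=*/2)
    //   new: ggml_rope_inplace(ctx0, cur, n_past, head_dim, /*mode=*/2, n_ctx)
    #include <ggml.h>

    static ggml_tensor * rope_neox_sketch(ggml_context * ctx0, ggml_tensor * cur,
                                          int n_past, int head_dim, int n_ctx) {
        return ggml_rope_inplace(ctx0, cur, n_past, head_dim, /*mode=*/2, n_ctx);
    }
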
@@ -678,7 +679,8 @@ bool falcon_eval(

     // run the computation
     ggml_build_forward_expand(&gf, inpL);
-    ggml_graph_compute (ctx0, &gf);
+    ggml_graph_compute_g4a(model.work_buf, &gf, n_threads);
+

     //if (n_past%100 == 0) {
     //    ggml_graph_print (&gf);
@@ -1 +1 @@
-Subproject commit da760ac3829a89ab9d60ec797df8a570b9b8419a
+Subproject commit 697966680b27d9b4f05668605b863cb9aea3e15f
@@ -1,6 +1,7 @@
 #pragma once
 #include <cstdint>
 #include <cstddef>
+#include <vector>
 #include <ggml.h>

 struct llm_buffer {
@@ -34,3 +35,14 @@ struct llm_kv_cache {
         }
     }
 };
+
+#if LLAMA_DATE >= 230519
+inline void ggml_graph_compute_g4a(llm_buffer& buf, ggml_cgraph * graph, int n_threads) {
+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+    if (plan.work_size > 0) {
+        buf.resize(plan.work_size);
+        plan.work_data = buf.addr;
+    }
+    ggml_graph_compute(graph, &plan);
+}
+#endif
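
The eval paths in bert.cpp, falcon.cpp and replit.cpp route graph execution through this wrapper with the per-model work buffer added above, which is also why falcon_eval and replit_eval now take the model by non-const reference. A sketch of the resulting call-site pattern, with a generic Model standing in for those structs:

    // Sketch of the call-site pattern this commit applies in bert.cpp, falcon.cpp
    // and replit.cpp: build the graph as before, then execute it through the
    // wrapper, which sizes the model's persistent work buffer from the plan.
    // `Model` is any struct with an `llm_buffer work_buf` member (as added above).
    #include "llmodel_shared.h"

    template <typename Model>
    static void eval_tail_sketch(Model & model, ggml_cgraph & gf, ggml_tensor * output, int n_threads) {
        ggml_build_forward_expand(&gf, output);
        ggml_graph_compute_g4a(model.work_buf, &gf, n_threads);
    }
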
@@ -196,6 +196,7 @@ struct replit_model {

     struct ggml_context * ctx;
     llm_buffer eval_buf;
+    llm_buffer work_buf;
     llm_buffer scr0_buf;
     llm_buffer scr1_buf;
 #ifdef GGML_USE_METAL
@@ -490,7 +491,7 @@ bool replit_model_load(const std::string & fname, std::istream &fin, replit_mode
     model.scr1_buf.resize(256u * 1024 * 1024);

 #ifdef GGML_USE_METAL
-    model.ctx_metal = ggml_metal_init();
+    model.ctx_metal = ggml_metal_init(1);
     void* data_ptr = ggml_get_mem_buffer(model.ctx);
     size_t data_size = ggml_get_mem_size(model.ctx);
     const size_t max_size = ggml_get_max_tensor_size(model.ctx);
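
In this llama.cpp revision the Metal backend is initialised with a command-buffer count (hence ggml_metal_init(1)), and the surrounding lines gather the ggml context's buffer so it can be handed to the Metal context. A hedged sketch of that pattern; ggml_metal_add_buffer and its arguments are assumed from the llama.cpp of this era rather than shown in the hunk:

    #ifdef GGML_USE_METAL
    // Sketch only: initialise the Metal context and register the model's weight
    // buffer with it so tensors are visible to the GPU. Assumed API for this era.
    static void init_metal_sketch(replit_model & model) {
        model.ctx_metal = ggml_metal_init(1);                    // 1 command buffer
        void * data_ptr       = ggml_get_mem_buffer(model.ctx);
        size_t data_size      = ggml_get_mem_size(model.ctx);
        const size_t max_size = ggml_get_max_tensor_size(model.ctx);
        ggml_metal_add_buffer(model.ctx_metal, "data", data_ptr, data_size, max_size);
    }
    #endif
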
@@ -534,7 +535,7 @@ bool replit_model_load(const std::string & fname, replit_model & model, replit_t
 // - embd_inp: the embeddings of the tokens in the context
 // - embd_w: the predicted logits for the next token
 //
-bool replit_eval(const replit_model & model, const int n_threads, const int n_past,
+bool replit_eval(replit_model & model, const int n_threads, const int n_past,
                  const std::vector<gpt_vocab::id> & embd_inp, std::vector<float> & embd_w, size_t & mem_per_token) {
     const int N = embd_inp.size();

@@ -552,7 +553,7 @@ bool replit_eval(const replit_model & model, const int n_threads, const int n_pa
         .no_alloc = false,
     };
     struct ggml_context * ctx0 = ggml_init(eval_ctx_params);
-    struct ggml_cgraph gf = {.n_threads = n_threads};
+    struct ggml_cgraph gf = {};

     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
     memcpy(embd->data, embd_inp.data(), N * ggml_element_size(embd));
@@ -706,10 +707,10 @@ bool replit_eval(const replit_model & model, const int n_threads, const int n_pa
         ggml_metal_get_tensor(model.ctx_metal, model.kv_self.k);
         ggml_metal_get_tensor(model.ctx_metal, model.kv_self.v);

-        ggml_graph_compute(ctx0, &gf);
+        ggml_graph_compute_g4a(model.work_buf, &gf, n_threads);
     }
 #else
-    ggml_graph_compute(ctx0, &gf);
+    ggml_graph_compute_g4a(model.work_buf, &gf, n_threads);
 #endif

     // std::cout << "Qcur" << std::endl;