mirror of
https://github.com/ravenscroftj/turbopilot.git
synced 2024-06-28 23:32:20 +00:00
add early replit build
This commit is contained in:
parent
6fb323bb11
commit
df954e45bf
|
@ -42,7 +42,7 @@ struct ModelConfig
|
|||
float repeat_penalty = 1.10f;
|
||||
int32_t seed = -1; // RNG seed
|
||||
int32_t n_ctx = 512; // context size
|
||||
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
|
||||
int32_t n_batch = 64; // batch size for prompt processing (must be >=32 to use BLAS)
|
||||
};
|
||||
|
||||
class TurbopilotModel
|
||||
|
|
|
@ -64,16 +64,16 @@ class ReplitModel : public TurbopilotModel {
|
|||
|
||||
public:
|
||||
ReplitModel(ModelConfig config, std::mt19937 &rng) : TurbopilotModel(config, rng){
|
||||
this->model = new replit_model{};
|
||||
this->tokenizer = new replit_tokenizer{};
|
||||
model = replit_model{};
|
||||
vocab = replit_tokenizer{};
|
||||
}
|
||||
virtual ~ReplitModel();
|
||||
bool load_model(std::string path);
|
||||
virtual std::stringstream predict(std::string prompt, int max_length, bool include_prompt);
|
||||
|
||||
private:
|
||||
replit_model *model = NULL;
|
||||
replit_tokenizer *tokenizer = NULL;
|
||||
replit_model model;
|
||||
replit_tokenizer vocab;
|
||||
|
||||
|
||||
};
|
||||
|
|
|
@ -29,4 +29,4 @@ target_include_directories(${TURBOPILOT_TARGET} PRIVATE
|
|||
target_link_libraries(${TURBOPILOT_TARGET} PRIVATE ggml argparse)
|
||||
|
||||
|
||||
#target_link_libraries(${TURBOPILOT_TARGET} PRIVATE spdlog::spdlog_header_only)
|
||||
#target_link_libraries(${TURBOPILOT_TARGET} PRIVATE spdlog::spdlog_header_only)
|
||||
|
|
|
@ -642,5 +642,6 @@ std::stringstream GPTJModel::predict(std::string prompt, int max_length, bool in
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
return result;
|
||||
}
|
||||
|
|
21
src/main.cpp
21
src/main.cpp
|
@ -34,6 +34,11 @@ int main(int argc, char **argv)
|
|||
.scan<'i', int>();
|
||||
|
||||
|
||||
program.add_argument("-b", "--batch-size")
|
||||
.help("The number of tokens to process per batch. Defaults to 64")
|
||||
.default_value(64)
|
||||
.scan<'i', int>();
|
||||
|
||||
program.add_argument("-p", "--port")
|
||||
.help("The tcp port that turbopilot should listen on")
|
||||
.default_value(18080)
|
||||
|
@ -44,9 +49,15 @@ int main(int argc, char **argv)
|
|||
.default_value(-1)
|
||||
.scan<'i', int>();
|
||||
|
||||
program.add_argument("-v", "--verbose")
|
||||
.help("if set then output debug messages")
|
||||
.default_value(false)
|
||||
.implicit_value(true);
|
||||
|
||||
program.add_argument("prompt").remaining();
|
||||
|
||||
|
||||
|
||||
try
|
||||
{
|
||||
program.parse_args(argc, argv);
|
||||
|
@ -58,6 +69,15 @@ int main(int argc, char **argv)
|
|||
return 1;
|
||||
}
|
||||
|
||||
|
||||
auto verbose = program.get<bool>("-v");
|
||||
|
||||
if(verbose){
|
||||
spdlog::set_level(spdlog::level::debug);
|
||||
spdlog::debug("Set DEBUG=True");
|
||||
}
|
||||
|
||||
|
||||
ggml_time_init();
|
||||
|
||||
const int64_t t_main_start_us = ggml_time_us();
|
||||
|
@ -70,6 +90,7 @@ int main(int argc, char **argv)
|
|||
ModelConfig config{};
|
||||
std::mt19937 rng(program.get<int>("--random-seed"));
|
||||
|
||||
config.n_batch = program.get<int>("-b");
|
||||
config.n_threads = program.get<int>("--threads");
|
||||
|
||||
if(model_type.compare("codegen") == 0) {
|
||||
|
|
158
src/replit.cpp
158
src/replit.cpp
|
@ -1,9 +1,23 @@
|
|||
#include <ggml/ggml.h>
|
||||
#include <spdlog/spdlog.h>
|
||||
#include "ggml/ggml.h"
|
||||
|
||||
#include <turbopilot/replit.hpp>
|
||||
#include <fstream>
|
||||
#include <cstring>
|
||||
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
#include <cinttypes>
|
||||
#include <cstddef>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <stdint.h>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
#if defined(_WIN32)
|
||||
#define NOMINMAX
|
||||
|
@ -23,6 +37,10 @@ bool is_stdin_terminal() {
|
|||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
using piece_t = std::pair<std::size_t, float>;
|
||||
using piece_map_t = std::unordered_map<std::string, piece_t>;
|
||||
|
||||
|
||||
std::pair<std::vector<std::size_t>, float> encode_word(const std::string & word, const piece_map_t & model) {
|
||||
std::vector<int> best_segmentations_starts(word.length() + 1, -1);
|
||||
best_segmentations_starts[0] = 0;
|
||||
|
@ -66,7 +84,6 @@ std::pair<std::vector<std::size_t>, float> encode_word(const std::string & word,
|
|||
return std::make_pair(tokens, score);
|
||||
}
|
||||
|
||||
|
||||
bool replit_tokenizer_load(replit_tokenizer & tokenizer, std::istream & fin, int max_vocab_size) {
|
||||
std::string word;
|
||||
std::vector<char> buf(128);
|
||||
|
@ -126,9 +143,7 @@ std::string replit_tokenizer_detokenize(replit_tokenizer & tokenizer, const std:
|
|||
|
||||
|
||||
// load the model's weights from a file
|
||||
|
||||
bool ReplitModel::load_model(const std::string fname) {
|
||||
|
||||
bool replit_model_load(const std::string & fname, replit_model & model, replit_tokenizer & vocab) {
|
||||
printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
|
||||
|
||||
auto fin = std::ifstream(fname, std::ios::binary);
|
||||
|
@ -149,7 +164,7 @@ bool ReplitModel::load_model(const std::string fname) {
|
|||
|
||||
// load hparams
|
||||
{
|
||||
auto & hparams = model->hparams;
|
||||
auto & hparams = model.hparams;
|
||||
|
||||
fin.read((char *)&hparams.d_model, sizeof(hparams.d_model));
|
||||
fin.read((char *)&hparams.max_seq_len, sizeof(hparams.max_seq_len));
|
||||
|
@ -172,24 +187,24 @@ bool ReplitModel::load_model(const std::string fname) {
|
|||
}
|
||||
|
||||
// load vocab
|
||||
replit_tokenizer_load((*tokenizer), fin, model->hparams.n_vocab);
|
||||
replit_tokenizer_load(vocab, fin, model.hparams.n_vocab);
|
||||
|
||||
// for the big tensors, we have the option to store the data in 16-bit
|
||||
// floats or quantized in order to save memory and also to speed up the
|
||||
// computation
|
||||
ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype)(model->hparams.ftype));
|
||||
ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype)(model.hparams.ftype));
|
||||
if (wtype == GGML_TYPE_COUNT) {
|
||||
fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", __func__, fname.c_str(),
|
||||
model->hparams.ftype);
|
||||
model.hparams.ftype);
|
||||
return false;
|
||||
}
|
||||
|
||||
auto & ctx = model->ctx;
|
||||
auto & ctx = model.ctx;
|
||||
|
||||
size_t ctx_size = 0;
|
||||
|
||||
{
|
||||
const auto & hparams = model->hparams;
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const int n_embd = hparams.d_model;
|
||||
const int n_layer = hparams.n_layers;
|
||||
|
@ -209,7 +224,7 @@ bool ReplitModel::load_model(const std::string fname) {
|
|||
ctx_size += n_ctx * n_layer * n_embd * ggml_type_sizef(GGML_TYPE_F16); // memory_k
|
||||
ctx_size += n_ctx * n_layer * n_embd * ggml_type_sizef(GGML_TYPE_F16); // memory_v
|
||||
|
||||
ctx_size += (1 + 6 * n_layer) * 512; // object overhead
|
||||
ctx_size += (1 + 6 * n_layer) * 1024; // object overhead
|
||||
|
||||
printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size / (1024.0 * 1024.0));
|
||||
}
|
||||
|
@ -222,8 +237,8 @@ bool ReplitModel::load_model(const std::string fname) {
|
|||
/*.no_alloc =*/ false,
|
||||
};
|
||||
|
||||
model->ctx = ggml_init(params);
|
||||
if (!model->ctx) {
|
||||
model.ctx = ggml_init(params);
|
||||
if (!model.ctx) {
|
||||
fprintf(stderr, "%s: ggml_init() failed\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
@ -231,23 +246,23 @@ bool ReplitModel::load_model(const std::string fname) {
|
|||
|
||||
// prepare memory for the weights
|
||||
{
|
||||
const auto & hparams = model->hparams;
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const size_t n_embd = hparams.d_model;
|
||||
const size_t n_layer = hparams.n_layers;
|
||||
const size_t n_vocab = hparams.n_vocab;
|
||||
|
||||
model->layers.resize(n_layer);
|
||||
model.layers.resize(n_layer);
|
||||
|
||||
model->wte_weight = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
|
||||
model->norm_f_weight = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
model.wte_weight = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
|
||||
model.norm_f_weight = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
|
||||
// map by name
|
||||
model->tensors["transformer.wte.weight"] = model->wte_weight;
|
||||
model->tensors["transformer.norm_f.weight"] = model->norm_f_weight;
|
||||
model.tensors["transformer.wte.weight"] = model.wte_weight;
|
||||
model.tensors["transformer.norm_f.weight"] = model.norm_f_weight;
|
||||
|
||||
for (int i = 0; i < (int)n_layer; ++i) {
|
||||
auto & layer = model->layers[i];
|
||||
auto & layer = model.layers[i];
|
||||
|
||||
layer.norm_1_weight = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
layer.c_attn_wqkv_weight = ggml_new_tensor_2d(ctx, wtype, n_embd, 3 * n_embd);
|
||||
|
@ -257,19 +272,19 @@ bool ReplitModel::load_model(const std::string fname) {
|
|||
layer.ffn_down_proj = ggml_new_tensor_2d(ctx, wtype, 4 * n_embd, n_embd);
|
||||
|
||||
// map by name
|
||||
model->tensors["transformer.blocks." + std::to_string(i) + ".norm_1.weight"] = layer.norm_1_weight;
|
||||
model->tensors["transformer.blocks." + std::to_string(i) + ".attn.Wqkv.weight"] = layer.c_attn_wqkv_weight;
|
||||
model->tensors["transformer.blocks." + std::to_string(i) + ".attn.out_proj.weight"] =
|
||||
model.tensors["transformer.blocks." + std::to_string(i) + ".norm_1.weight"] = layer.norm_1_weight;
|
||||
model.tensors["transformer.blocks." + std::to_string(i) + ".attn.Wqkv.weight"] = layer.c_attn_wqkv_weight;
|
||||
model.tensors["transformer.blocks." + std::to_string(i) + ".attn.out_proj.weight"] =
|
||||
layer.c_attn_out_proj_weight;
|
||||
model->tensors["transformer.blocks." + std::to_string(i) + ".norm_2.weight"] = layer.norm_2_weight;
|
||||
model->tensors["transformer.blocks." + std::to_string(i) + ".ffn.up_proj.weight"] = layer.ffn_up_proj;
|
||||
model->tensors["transformer.blocks." + std::to_string(i) + ".ffn.down_proj.weight"] = layer.ffn_down_proj;
|
||||
model.tensors["transformer.blocks." + std::to_string(i) + ".norm_2.weight"] = layer.norm_2_weight;
|
||||
model.tensors["transformer.blocks." + std::to_string(i) + ".ffn.up_proj.weight"] = layer.ffn_up_proj;
|
||||
model.tensors["transformer.blocks." + std::to_string(i) + ".ffn.down_proj.weight"] = layer.ffn_down_proj;
|
||||
}
|
||||
}
|
||||
|
||||
// key + value memory
|
||||
{
|
||||
const auto & hparams = model->hparams;
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const int n_embd = hparams.d_model;
|
||||
const int n_layer = hparams.n_layers;
|
||||
|
@ -278,10 +293,10 @@ bool ReplitModel::load_model(const std::string fname) {
|
|||
const int64_t n_mem = n_layer * n_ctx;
|
||||
const int64_t n_elements = n_embd * n_mem;
|
||||
|
||||
model->memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
|
||||
model->memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
|
||||
model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
|
||||
model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
|
||||
|
||||
const size_t memory_size = ggml_nbytes(model->memory_k) + ggml_nbytes(model->memory_v);
|
||||
const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
|
||||
|
||||
printf("%s: memory_size = %8.2f MB, n_mem = %" PRIu64 "\n", __func__, memory_size / 1024.0 / 1024.0, n_mem);
|
||||
}
|
||||
|
@ -316,12 +331,12 @@ bool ReplitModel::load_model(const std::string fname) {
|
|||
std::string name(length, 0);
|
||||
fin.read(&name[0], length);
|
||||
|
||||
if (model->tensors.find(name.data()) == model->tensors.end()) {
|
||||
if (model.tensors.find(name.data()) == model.tensors.end()) {
|
||||
fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
|
||||
return false;
|
||||
}
|
||||
|
||||
auto tensor = model->tensors[name.data()];
|
||||
auto tensor = model.tensors[name.data()];
|
||||
if (ggml_nelements(tensor) != nelements) {
|
||||
fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
|
||||
return false;
|
||||
|
@ -370,7 +385,6 @@ bool ReplitModel::load_model(const std::string fname) {
|
|||
return true;
|
||||
}
|
||||
|
||||
|
||||
// evaluate the transformer
|
||||
//
|
||||
// - model: the model
|
||||
|
@ -586,43 +600,54 @@ bool replit_eval(const replit_model & model, const int n_threads, const int n_pa
|
|||
}
|
||||
|
||||
|
||||
|
||||
ReplitModel::~ReplitModel(){
|
||||
ggml_free(model->ctx);
|
||||
free(model);
|
||||
free(tokenizer);
|
||||
ggml_free(model.ctx);
|
||||
}
|
||||
|
||||
|
||||
std::stringstream ReplitModel::predict(std::string prompt, int max_length, bool include_prompt) {
|
||||
|
||||
std::stringstream result;
|
||||
// tokenize the prompt
|
||||
std::vector<std::size_t> embd_inp = replit_tokenizer_tokenize((*tokenizer), prompt);
|
||||
|
||||
|
||||
int n_past = 0;
|
||||
|
||||
int64_t t_sample_us = 0;
|
||||
int64_t t_sample_us = 0;
|
||||
int64_t t_predict_us = 0;
|
||||
|
||||
int n_predict = std::min(max_length, model->hparams.max_seq_len - (int) embd_inp.size());
|
||||
std::vector<float> logits;
|
||||
|
||||
|
||||
spdlog::debug("{}: number of tokens in prompt = {}", __func__, embd_inp.size());
|
||||
// tokenize the prompt
|
||||
std::vector<std::size_t> embd_inp = replit_tokenizer_tokenize(vocab, prompt);
|
||||
|
||||
printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
|
||||
|
||||
// for (int i = 0; i < embd_inp.size(); i++) {
|
||||
// printf("%s: token[%d] = %6zu\n", __func__, i, embd_inp[i]);
|
||||
// // vocab.id_to_token.at(embd_inp[i]).c_str()
|
||||
// }
|
||||
printf("\n");
|
||||
|
||||
int n_predict = std::min(max_length, model.hparams.max_seq_len - (int) embd_inp.size() - 1);
|
||||
|
||||
spdlog::debug("{}: Number of characters to predict: {}", __func__, n_predict);
|
||||
|
||||
std::vector<gpt_vocab::id> embd;
|
||||
|
||||
// determine the required inference memory per token:
|
||||
size_t mem_per_token = 0;
|
||||
replit_eval(model, config.n_threads, 0, {0, 1, 2, 3}, logits, false, mem_per_token);
|
||||
|
||||
std::vector<float> logits;
|
||||
|
||||
replit_eval((*model), config.n_threads, 0, { 0, 1, 2, 3 }, logits, false, mem_per_token);
|
||||
spdlog::debug("{}: mem per token: {}", __func__, mem_per_token);
|
||||
|
||||
for (int i = embd.size(); i < embd_inp.size() + n_predict; i++) {
|
||||
// predict
|
||||
if (embd.size() > 0) {
|
||||
const int64_t t_start_us = ggml_time_us();
|
||||
|
||||
if (!replit_eval((*model), config.n_threads, n_past, embd, logits, false, mem_per_token)) {
|
||||
if (!replit_eval(model, config.n_threads, n_past, embd, logits, false, mem_per_token)) {
|
||||
throw std::runtime_error("Failed to predict");
|
||||
}
|
||||
|
||||
|
@ -634,39 +659,29 @@ std::stringstream ReplitModel::predict(std::string prompt, int max_length, bool
|
|||
|
||||
if (i >= embd_inp.size()) {
|
||||
// sample next token
|
||||
const int top_k = config.top_k;
|
||||
const int top_k = config.top_k;
|
||||
const float top_p = config.top_p;
|
||||
const float temp = config.temp;
|
||||
const float temp = config.temp;
|
||||
|
||||
const int n_vocab = model->hparams.n_vocab;
|
||||
const int n_vocab = model.hparams.n_vocab;
|
||||
|
||||
gpt_vocab::id id = 0;
|
||||
|
||||
{
|
||||
const int64_t t_start_sample_us = ggml_time_us();
|
||||
|
||||
id = gpt_sample_top_k_top_p(tokenizer->raw_vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng);
|
||||
id = gpt_sample_top_k_top_p(vocab.raw_vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p,
|
||||
temp, rng);
|
||||
|
||||
t_sample_us += ggml_time_us() - t_start_sample_us;
|
||||
}
|
||||
|
||||
// add it to the context
|
||||
embd.push_back(id);
|
||||
|
||||
// if(id != 50256){
|
||||
// result << vocab->id_to_token[id].c_str();
|
||||
// }
|
||||
|
||||
} else {
|
||||
// if here, it means we are still processing the input prompt
|
||||
for (int k = i; k < embd_inp.size(); k++) {
|
||||
embd.push_back(embd_inp[k]);
|
||||
|
||||
if(include_prompt){
|
||||
result << replit_tokenizer_detokenize((*tokenizer), {static_cast<std::size_t>(embd_inp[k])});
|
||||
//result << vocab->id_to_token[embd_inp[k]].c_str();
|
||||
}
|
||||
|
||||
if (embd.size() > config.n_batch) {
|
||||
break;
|
||||
}
|
||||
|
@ -674,11 +689,26 @@ std::stringstream ReplitModel::predict(std::string prompt, int max_length, bool
|
|||
i += embd.size() - 1;
|
||||
}
|
||||
|
||||
// display text
|
||||
for (auto id : embd) {
|
||||
//result << replit_tokenizer_detokenize(vocab, {static_cast<std::size_t>(id)}).c_str();
|
||||
//printf("%s", replit_tokenizer_detokenize(vocab, {static_cast<std::size_t>(id)}).c_str());
|
||||
}
|
||||
//fflush(stdout);
|
||||
|
||||
// end of text token
|
||||
if (embd.back() == 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// ggml_free(model.ctx);
|
||||
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// Load the Replit model from the file at `fname`.
// Thin wrapper: forwards to replit_model_load() with this object's `model`
// and `vocab` members and returns its result unchanged (true on success,
// false if loading failed).
bool ReplitModel::load_model(std::string fname){
|
||||
return replit_model_load(fname, model, vocab);
|
||||
}
|
Loading…
Reference in New Issue
Block a user