add early replit build

James Ravenscroft 2023-08-10 14:45:23 +01:00
parent 6fb323bb11
commit df954e45bf
6 changed files with 122 additions and 70 deletions


@@ -42,7 +42,7 @@ struct ModelConfig
float repeat_penalty = 1.10f;
int32_t seed = -1; // RNG seed
int32_t n_ctx = 512; // context size
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_batch = 64; // batch size for prompt processing (must be >=32 to use BLAS)
};
class TurbopilotModel
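
The comment above explains the constraint this commit works within: ggml only routes prompt processing through BLAS when a batch carries at least 32 tokens, so dropping the default from 512 to 64 trims per-batch memory while staying BLAS-eligible. A minimal sketch of a guard for that invariant, assuming a ggml build that defines GGML_USE_OPENBLAS or GGML_USE_ACCELERATE when a BLAS backend is enabled (this guard is not part of the commit):

#include <cstdint>
#include <spdlog/spdlog.h>

// hypothetical helper: warn when n_batch falls below the BLAS threshold
void validate_batch_size(int32_t n_batch) {
#if defined(GGML_USE_OPENBLAS) || defined(GGML_USE_ACCELERATE)
    if (n_batch < 32) {
        spdlog::warn("n_batch={} is below 32, prompt processing will skip BLAS", n_batch);
    }
#endif
}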


@@ -64,16 +64,16 @@ class ReplitModel : public TurbopilotModel {
public:
ReplitModel(ModelConfig config, std::mt19937 &rng) : TurbopilotModel(config, rng){
this->model = new replit_model{};
this->tokenizer = new replit_tokenizer{};
model = replit_model{};
vocab = replit_tokenizer{};
}
virtual ~ReplitModel();
bool load_model(std::string path);
virtual std::stringstream predict(std::string prompt, int max_length, bool include_prompt);
private:
replit_model *model = NULL;
replit_tokenizer *tokenizer = NULL;
replit_model model;
replit_tokenizer vocab;
};
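
For context on the hunk above: model and tokenizer change from heap pointers to plain value members, so their storage now lives and dies with the wrapper. The old pairing of new in the constructor with free() in the destructor (visible in the replit.cpp hunks below) was a new/free mismatch; value members sidestep it, leaving only the ggml context to release by hand. A simplified sketch with a stand-in type:

#include <ggml/ggml.h>

struct Inner { ggml_context *ctx = nullptr; };  // stand-in for replit_model

// old shape: heap member, released by hand in the destructor
struct ByPointer {
    Inner *model = new Inner{};
    ~ByPointer() { if (model->ctx) ggml_free(model->ctx); delete model; }
};

// new shape: value member, destroyed automatically; only the ggml
// context still needs an explicit ggml_free (see ~ReplitModel below)
struct ByValue {
    Inner model;
    ~ByValue() { if (model.ctx) ggml_free(model.ctx); }
};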


@@ -29,4 +29,4 @@ target_include_directories(${TURBOPILOT_TARGET} PRIVATE
target_link_libraries(${TURBOPILOT_TARGET} PRIVATE ggml argparse)
#target_link_libraries(${TURBOPILOT_TARGET} PRIVATE spdlog::spdlog_header_only)
#target_link_libraries(${TURBOPILOT_TARGET} PRIVATE spdlog::spdlog_header_only)


@@ -642,5 +642,6 @@ std::stringstream GPTJModel::predict(std::string prompt, int max_length, bool in
}
}
return result;
}


@@ -34,6 +34,11 @@ int main(int argc, char **argv)
.scan<'i', int>();
program.add_argument("-b", "--batch-size")
.help("The number of tokens to process per batch. Defaults to 64")
.default_value(64)
.scan<'i', int>();
program.add_argument("-p", "--port")
.help("The tcp port that turbopilot should listen on")
.default_value(18080)
@@ -44,9 +49,15 @@ int main(int argc, char **argv)
.default_value(-1)
.scan<'i', int>();
program.add_argument("-v", "--verbose")
.help("if set then output debug messages")
.default_value(false)
.implicit_value(true);
program.add_argument("prompt").remaining();
try
{
program.parse_args(argc, argv);
@@ -58,6 +69,15 @@ int main(int argc, char **argv)
return 1;
}
auto verbose = program.get<bool>("-v");
if(verbose){
spdlog::set_level(spdlog::level::debug);
spdlog::debug("Set DEBUG=True");
}
ggml_time_init();
const int64_t t_main_start_us = ggml_time_us();
@@ -70,6 +90,7 @@ int main(int argc, char **argv)
ModelConfig config{};
std::mt19937 rng(program.get<int>("--random-seed"));
config.n_batch = program.get<int>("-b");
config.n_threads = program.get<int>("--threads");
if(model_type.compare("codegen") == 0) {


@@ -1,9 +1,23 @@
#include <ggml/ggml.h>
#include <spdlog/spdlog.h>
#include "ggml/ggml.h"
#include <turbopilot/replit.hpp>
#include <fstream>
#include <cstring>
#include <cassert>
#include <cmath>
#include <cinttypes>
#include <cstddef>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>
#include <map>
#include <stdint.h>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include <spdlog/spdlog.h>
#if defined(_WIN32)
#define NOMINMAX
@@ -23,6 +37,10 @@ bool is_stdin_terminal() {
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
using piece_t = std::pair<std::size_t, float>;
using piece_map_t = std::unordered_map<std::string, piece_t>;
std::pair<std::vector<std::size_t>, float> encode_word(const std::string & word, const piece_map_t & model) {
std::vector<int> best_segmentations_starts(word.length() + 1, -1);
best_segmentations_starts[0] = 0;
@@ -66,7 +84,6 @@ std::pair<std::vector<std::size_t>, float> encode_word(const std::string & word,
return std::make_pair(tokens, score);
}
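
encode_word above is a Viterbi-style dynamic program: for each position in the word it records the best-scoring segmentation start, then backtracks to emit token ids. A hedged usage sketch with a toy piece table (the ids and scores are invented; real tables come from replit_tokenizer_load below):

// toy unigram piece table: surface string -> (token id, log score)
piece_map_t pieces = {
    {"hel",   {10, -1.0f}},
    {"lo",    {11, -0.5f}},
    {"hello", {12, -2.5f}},
};
auto [ids, score] = encode_word("hello", pieces);
// assuming the search maximizes the summed log scores, ids comes back
// as {10, 11}: -1.0 + -0.5 beats the single-piece score of -2.5
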
bool replit_tokenizer_load(replit_tokenizer & tokenizer, std::istream & fin, int max_vocab_size) {
std::string word;
std::vector<char> buf(128);
@@ -126,9 +143,7 @@ std::string replit_tokenizer_detokenize(replit_tokenizer & tokenizer, const std:
// load the model's weights from a file
bool ReplitModel::load_model(const std::string fname) {
bool replit_model_load(const std::string & fname, replit_model & model, replit_tokenizer & vocab) {
printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
auto fin = std::ifstream(fname, std::ios::binary);
@@ -149,7 +164,7 @@ bool ReplitModel::load_model(const std::string fname) {
// load hparams
{
auto & hparams = model->hparams;
auto & hparams = model.hparams;
fin.read((char *)&hparams.d_model, sizeof(hparams.d_model));
fin.read((char *)&hparams.max_seq_len, sizeof(hparams.max_seq_len));
@@ -172,24 +187,24 @@ bool ReplitModel::load_model(const std::string fname) {
}
// load vocab
replit_tokenizer_load((*tokenizer), fin, model->hparams.n_vocab);
replit_tokenizer_load(vocab, fin, model.hparams.n_vocab);
// for the big tensors, we have the option to store the data in 16-bit
// floats or quantized in order to save memory and also to speed up the
// computation
ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype)(model->hparams.ftype));
ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype)(model.hparams.ftype));
if (wtype == GGML_TYPE_COUNT) {
fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", __func__, fname.c_str(),
model->hparams.ftype);
model.hparams.ftype);
return false;
}
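
The check above maps the file-level ftype from the model header onto the ggml tensor type used for the large matrices, with GGML_TYPE_COUNT signalling an unknown value. An illustrative subset of that mapping (ggml_ftype_to_ggml_type holds the full table, and coverage varies by ggml version):

// GGML_FTYPE_ALL_F32     -> GGML_TYPE_F32   (full precision)
// GGML_FTYPE_MOSTLY_F16  -> GGML_TYPE_F16   (16-bit floats)
// GGML_FTYPE_MOSTLY_Q4_0 -> GGML_TYPE_Q4_0  (4-bit quantized)
// GGML_FTYPE_MOSTLY_Q4_1 -> GGML_TYPE_Q4_1
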
auto & ctx = model->ctx;
auto & ctx = model.ctx;
size_t ctx_size = 0;
{
const auto & hparams = model->hparams;
const auto & hparams = model.hparams;
const int n_embd = hparams.d_model;
const int n_layer = hparams.n_layers;
@@ -209,7 +224,7 @@ bool ReplitModel::load_model(const std::string fname) {
ctx_size += n_ctx * n_layer * n_embd * ggml_type_sizef(GGML_TYPE_F16); // memory_k
ctx_size += n_ctx * n_layer * n_embd * ggml_type_sizef(GGML_TYPE_F16); // memory_v
ctx_size += (1 + 6 * n_layer) * 512; // object overhead
ctx_size += (1 + 6 * n_layer) * 1024; // object overhead
printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size / (1024.0 * 1024.0));
}
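
The hunk above doubles ggml's per-object overhead allowance from 512 to 1024 bytes: ctx_size must cover every tensor's data plus metadata for each ggml object, and underestimating it makes later tensor allocations from the context fail. Illustrative arithmetic with assumed hparams (not read from any real model file):

// assumed: n_embd = 2560, n_layer = 32, n_ctx = 512
// memory_k = memory_v = n_ctx * n_layer * n_embd F16 values
//                     = 512 * 32 * 2560 * 2 bytes  ~ 80 MB each
// object overhead     = (1 + 6 * 32) * 1024 bytes  ~ 0.19 MB
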
@@ -222,8 +237,8 @@ bool ReplitModel::load_model(const std::string fname) {
/*.no_alloc =*/ false,
};
model->ctx = ggml_init(params);
if (!model->ctx) {
model.ctx = ggml_init(params);
if (!model.ctx) {
fprintf(stderr, "%s: ggml_init() failed\n", __func__);
return false;
}
@@ -231,23 +246,23 @@ bool ReplitModel::load_model(const std::string fname) {
// prepare memory for the weights
{
const auto & hparams = model->hparams;
const auto & hparams = model.hparams;
const size_t n_embd = hparams.d_model;
const size_t n_layer = hparams.n_layers;
const size_t n_vocab = hparams.n_vocab;
model->layers.resize(n_layer);
model.layers.resize(n_layer);
model->wte_weight = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
model->norm_f_weight = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
model.wte_weight = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
model.norm_f_weight = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
// map by name
model->tensors["transformer.wte.weight"] = model->wte_weight;
model->tensors["transformer.norm_f.weight"] = model->norm_f_weight;
model.tensors["transformer.wte.weight"] = model.wte_weight;
model.tensors["transformer.norm_f.weight"] = model.norm_f_weight;
for (int i = 0; i < (int)n_layer; ++i) {
auto & layer = model->layers[i];
auto & layer = model.layers[i];
layer.norm_1_weight = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
layer.c_attn_wqkv_weight = ggml_new_tensor_2d(ctx, wtype, n_embd, 3 * n_embd);
@@ -257,19 +272,19 @@ bool ReplitModel::load_model(const std::string fname) {
layer.ffn_down_proj = ggml_new_tensor_2d(ctx, wtype, 4 * n_embd, n_embd);
// map by name
model->tensors["transformer.blocks." + std::to_string(i) + ".norm_1.weight"] = layer.norm_1_weight;
model->tensors["transformer.blocks." + std::to_string(i) + ".attn.Wqkv.weight"] = layer.c_attn_wqkv_weight;
model->tensors["transformer.blocks." + std::to_string(i) + ".attn.out_proj.weight"] =
model.tensors["transformer.blocks." + std::to_string(i) + ".norm_1.weight"] = layer.norm_1_weight;
model.tensors["transformer.blocks." + std::to_string(i) + ".attn.Wqkv.weight"] = layer.c_attn_wqkv_weight;
model.tensors["transformer.blocks." + std::to_string(i) + ".attn.out_proj.weight"] =
layer.c_attn_out_proj_weight;
model->tensors["transformer.blocks." + std::to_string(i) + ".norm_2.weight"] = layer.norm_2_weight;
model->tensors["transformer.blocks." + std::to_string(i) + ".ffn.up_proj.weight"] = layer.ffn_up_proj;
model->tensors["transformer.blocks." + std::to_string(i) + ".ffn.down_proj.weight"] = layer.ffn_down_proj;
model.tensors["transformer.blocks." + std::to_string(i) + ".norm_2.weight"] = layer.norm_2_weight;
model.tensors["transformer.blocks." + std::to_string(i) + ".ffn.up_proj.weight"] = layer.ffn_up_proj;
model.tensors["transformer.blocks." + std::to_string(i) + ".ffn.down_proj.weight"] = layer.ffn_down_proj;
}
}
// key + value memory
{
const auto & hparams = model->hparams;
const auto & hparams = model.hparams;
const int n_embd = hparams.d_model;
const int n_layer = hparams.n_layers;
@@ -278,10 +293,10 @@ bool ReplitModel::load_model(const std::string fname) {
const int64_t n_mem = n_layer * n_ctx;
const int64_t n_elements = n_embd * n_mem;
model->memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
model->memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
const size_t memory_size = ggml_nbytes(model->memory_k) + ggml_nbytes(model->memory_v);
const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
printf("%s: memory_size = %8.2f MB, n_mem = %" PRIu64 "\n", __func__, memory_size / 1024.0 / 1024.0, n_mem);
}
@@ -316,12 +331,12 @@ bool ReplitModel::load_model(const std::string fname) {
std::string name(length, 0);
fin.read(&name[0], length);
if (model->tensors.find(name.data()) == model->tensors.end()) {
if (model.tensors.find(name.data()) == model.tensors.end()) {
fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
return false;
}
auto tensor = model->tensors[name.data()];
auto tensor = model.tensors[name.data()];
if (ggml_nelements(tensor) != nelements) {
fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
return false;
@@ -370,7 +385,6 @@ bool ReplitModel::load_model(const std::string fname) {
return true;
}
// evaluate the transformer
//
// - model: the model
@@ -586,43 +600,54 @@ bool replit_eval(const replit_model & model, const int n_threads, const int n_pa
}
ReplitModel::~ReplitModel(){
ggml_free(model->ctx);
free(model);
free(tokenizer);
ggml_free(model.ctx);
}
std::stringstream ReplitModel::predict(std::string prompt, int max_length, bool include_prompt) {
std::stringstream result;
// tokenize the prompt
std::vector<std::size_t> embd_inp = replit_tokenizer_tokenize((*tokenizer), prompt);
int n_past = 0;
int64_t t_sample_us = 0;
int64_t t_sample_us = 0;
int64_t t_predict_us = 0;
int n_predict = std::min(max_length, model->hparams.max_seq_len - (int) embd_inp.size());
std::vector<float> logits;
spdlog::debug("{}: number of tokens in prompt = {}", __func__, embd_inp.size());
// tokenize the prompt
std::vector<std::size_t> embd_inp = replit_tokenizer_tokenize(vocab, prompt);
printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
// for (int i = 0; i < embd_inp.size(); i++) {
// printf("%s: token[%d] = %6zu\n", __func__, i, embd_inp[i]);
// // vocab.id_to_token.at(embd_inp[i]).c_str()
// }
printf("\n");
int n_predict = std::min(max_length, model.hparams.max_seq_len - (int) embd_inp.size() - 1);
spdlog::debug("{}: Number of characters to predict: {}", __func__, n_predict);
std::vector<gpt_vocab::id> embd;
// determine the required inference memory per token:
size_t mem_per_token = 0;
replit_eval(model, config.n_threads, 0, {0, 1, 2, 3}, logits, false, mem_per_token);
std::vector<float> logits;
replit_eval((*model), config.n_threads, 0, { 0, 1, 2, 3 }, logits, false, mem_per_token);
spdlog::debug("{}: mem per token: {}", __func__, mem_per_token);
for (int i = embd.size(); i < embd_inp.size() + n_predict; i++) {
// predict
if (embd.size() > 0) {
const int64_t t_start_us = ggml_time_us();
if (!replit_eval((*model), config.n_threads, n_past, embd, logits, false, mem_per_token)) {
if (!replit_eval(model, config.n_threads, n_past, embd, logits, false, mem_per_token)) {
throw std::runtime_error("Failed to predict");
}
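
The warm-up call above is how mem_per_token is discovered: replit_eval runs once over four dummy token ids with no past context and reports how much scratch memory one token needs, so later calls can size their buffers. A condensed sketch of the pattern (the buffer bookkeeping inside replit_eval is assumed, not shown in this diff):

size_t mem_per_token = 0;
std::vector<float> logits;
// probe run: four dummy ids, n_past = 0, logits discarded
replit_eval(model, config.n_threads, 0, {0, 1, 2, 3}, logits, false, mem_per_token);
// real runs can now reserve roughly mem_per_token * n_batch bytes up front
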
@@ -634,39 +659,29 @@ std::stringstream ReplitModel::predict(std::string prompt, int max_length, bool
if (i >= embd_inp.size()) {
// sample next token
const int top_k = config.top_k;
const int top_k = config.top_k;
const float top_p = config.top_p;
const float temp = config.temp;
const float temp = config.temp;
const int n_vocab = model->hparams.n_vocab;
const int n_vocab = model.hparams.n_vocab;
gpt_vocab::id id = 0;
{
const int64_t t_start_sample_us = ggml_time_us();
id = gpt_sample_top_k_top_p(tokenizer->raw_vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng);
id = gpt_sample_top_k_top_p(vocab.raw_vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p,
temp, rng);
t_sample_us += ggml_time_us() - t_start_sample_us;
}
// add it to the context
embd.push_back(id);
// if(id != 50256){
// result << vocab->id_to_token[id].c_str();
// }
} else {
// if here, it means we are still processing the input prompt
for (int k = i; k < embd_inp.size(); k++) {
embd.push_back(embd_inp[k]);
if(include_prompt){
result << replit_tokenizer_detokenize((*tokenizer), {static_cast<std::size_t>(embd_inp[k])});
//result << vocab->id_to_token[embd_inp[k]].c_str();
}
if (embd.size() > config.n_batch) {
break;
}
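
gpt_sample_top_k_top_p above draws the next token with temperature scaling, a top-k cut, and a nucleus (top-p) cut. The function below is a self-contained sketch of that technique, not the ggml implementation; tie-breaking and numerical details differ:

#include <algorithm>
#include <cmath>
#include <random>
#include <utility>
#include <vector>

int sample_top_k_top_p(const std::vector<float> &logits, int top_k,
                       float top_p, float temp, std::mt19937 &rng) {
    // pair each token id with its temperature-scaled logit
    std::vector<std::pair<float, int>> cand(logits.size());
    for (size_t i = 0; i < logits.size(); i++) {
        cand[i] = {logits[i] / temp, (int) i};
    }
    // keep the top_k highest-scoring candidates
    top_k = std::min<int>(top_k, (int) cand.size());
    std::partial_sort(cand.begin(), cand.begin() + top_k, cand.end(),
                      std::greater<>());
    cand.resize(top_k);
    // softmax over the survivors
    float max_l = cand.front().first, sum = 0.0f;
    std::vector<float> probs(cand.size());
    for (size_t i = 0; i < cand.size(); i++) {
        probs[i] = std::exp(cand[i].first - max_l);
        sum += probs[i];
    }
    for (auto &p : probs) { p /= sum; }
    // nucleus cut: drop the low-probability tail past cumulative top_p
    float cum = 0.0f;
    size_t keep = probs.size();
    for (size_t i = 0; i < probs.size(); i++) {
        cum += probs[i];
        if (cum >= top_p) { keep = i + 1; break; }
    }
    probs.resize(keep);
    // categorical draw over the remaining mass
    std::discrete_distribution<int> dist(probs.begin(), probs.end());
    return cand[dist(rng)].second; // sampled token id
}
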
@@ -674,11 +689,26 @@ std::stringstream ReplitModel::predict(std::string prompt, int max_length, bool
i += embd.size() - 1;
}
// display text
for (auto id : embd) {
//result << replit_tokenizer_detokenize(vocab, {static_cast<std::size_t>(id)}).c_str();
//printf("%s", replit_tokenizer_detokenize(vocab, {static_cast<std::size_t>(id)}).c_str());
}
//fflush(stdout);
// end of text token
if (embd.back() == 0) {
break;
}
}
// ggml_free(model.ctx);
return result;
}
bool ReplitModel::load_model(std::string fname){
return replit_model_load(fname, model, vocab);
}
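
Putting the pieces together, a hypothetical end-to-end use of the class as declared in replit.hpp above (the model path, seed, and prompt are placeholders):

#include <iostream>
#include <random>
#include <turbopilot/replit.hpp>

int main() {
    ModelConfig config{};
    config.n_batch = 64;   // the new default from this commit
    std::mt19937 rng(1234);
    ReplitModel model(config, rng);
    if (!model.load_model("/models/replit-code-v1-3b-q4_0.bin")) {  // placeholder path
        return 1;
    }
    auto out = model.predict("def fibonacci(n):", 64, /*include_prompt=*/true);
    std::cout << out.str() << std::endl;
    return 0;
}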