Mirror of https://github.com/nomic-ai/gpt4all.git, synced 2024-10-01 01:06:10 -04:00
commit 9808be5e73 (parent 43b6f63589)

    rename LLamaModel to LlamaCppBackendImpl

    Signed-off-by: Jared Van Bortel <jared@nomic.ai>
.gitmodules (vendored): 2 lines changed

@@ -1,5 +1,5 @@
 [submodule "llama.cpp-mainline"]
-    path = gpt4all-backend/llama.cpp-mainline
+    path = gpt4all-backend/llama.cpp
     url = https://github.com/nomic-ai/llama.cpp.git
     branch = master
 [submodule "gpt4all-chat/usearch"]
gpt4all-backend/CMakeLists.txt

@@ -47,7 +47,7 @@ else()
     message(STATUS "Interprocedural optimization support detected")
 endif()
 
-set(DIRECTORY llama.cpp-mainline)
+set(DIRECTORY llama.cpp)
 include(llama.cpp.cmake)
 
 set(BUILD_VARIANTS)
@@ -108,7 +108,7 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
     endif()
 
     # Include GGML
-    include_ggml(-mainline-${BUILD_VARIANT})
+    include_ggml(-${BUILD_VARIANT})
 
     # Function for preparing individual implementations
     function(prepare_target TARGET_NAME BASE_LIB)
@@ -127,11 +127,10 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
     endfunction()
 
     # Add each individual implementations
-    add_library(llamamodel-mainline-${BUILD_VARIANT} SHARED
-        llamamodel.cpp llmodel_shared.cpp)
-    target_compile_definitions(llamamodel-mainline-${BUILD_VARIANT} PRIVATE
+    add_library(llamacpp-${BUILD_VARIANT} SHARED llamacpp_backend_impl.cpp)
+    target_compile_definitions(llamacpp-${BUILD_VARIANT} PRIVATE
         LLAMA_VERSIONS=>=3 LLAMA_DATE=999999)
-    prepare_target(llamamodel-mainline llama-mainline)
+    prepare_target(llamacpp llama)
 
     if (NOT PROJECT_IS_TOP_LEVEL AND BUILD_VARIANT STREQUAL cuda)
         set(CUDAToolkit_BIN_DIR ${CUDAToolkit_BIN_DIR} PARENT_SCOPE)
gpt4all-backend/llamamodel.cpp → gpt4all-backend/llamacpp_backend_impl.cpp (renamed)

@@ -1,5 +1,5 @@
-#define LLAMAMODEL_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
-#include "llamamodel_impl.h"
+#define LLAMACPP_BACKEND_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
+#include "llamacpp_backend_impl.h"
 
 #include "llmodel.h"
 
@@ -232,7 +232,7 @@ cleanup:
     return value;
 }
 
-struct LLamaPrivate {
+struct LlamaPrivate {
     const std::string modelPath;
     bool modelLoaded = false;
     int device = -1;
@@ -246,8 +246,8 @@ struct LLamaPrivate {
     const char *backend_name = nullptr;
 };
 
-LLamaModel::LLamaModel()
-    : d_ptr(new LLamaPrivate) {}
+LlamaCppBackendImpl::LlamaCppBackendImpl()
+    : d_ptr(new LlamaPrivate) {}
 
 // default hparams (LLaMA 7B)
 struct llama_file_hparams {
@@ -260,7 +260,7 @@ struct llama_file_hparams {
     enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
 };
 
-size_t LLamaModel::requiredMem(const std::string &modelPath, int n_ctx, int ngl)
+size_t LlamaCppBackendImpl::requiredMem(const std::string &modelPath, int n_ctx, int ngl)
 {
     // TODO(cebtenzzre): update to GGUF
     (void)ngl; // FIXME(cetenzzre): use this value
@@ -285,7 +285,7 @@ size_t LLamaModel::requiredMem(const std::string &modelPath, int n_ctx, int ngl)
     return filesize + est_kvcache_size;
 }
 
-bool LLamaModel::isModelBlacklisted(const std::string &modelPath) const
+bool LlamaCppBackendImpl::isModelBlacklisted(const std::string &modelPath) const
 {
     auto * ctx = load_gguf(modelPath.c_str());
     if (!ctx) {
@@ -322,7 +322,7 @@ bool LLamaModel::isModelBlacklisted(const std::string &modelPath) const
     return res;
 }
 
-bool LLamaModel::isEmbeddingModel(const std::string &modelPath) const
+bool LlamaCppBackendImpl::isEmbeddingModel(const std::string &modelPath) const
 {
     bool result = false;
     std::string arch;
@@ -346,7 +346,7 @@ cleanup:
     return result;
 }
 
-bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
+bool LlamaCppBackendImpl::loadModel(const std::string &modelPath, int n_ctx, int ngl)
 {
     d_ptr->modelLoaded = false;
 
@@ -488,18 +488,18 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
     return true;
 }
 
-void LLamaModel::setThreadCount(int32_t n_threads)
+void LlamaCppBackendImpl::setThreadCount(int32_t n_threads)
 {
     d_ptr->n_threads = n_threads;
     llama_set_n_threads(d_ptr->ctx, n_threads, n_threads);
 }
 
-int32_t LLamaModel::threadCount() const
+int32_t LlamaCppBackendImpl::threadCount() const
 {
     return d_ptr->n_threads;
 }
 
-LLamaModel::~LLamaModel()
+LlamaCppBackendImpl::~LlamaCppBackendImpl()
 {
     if (d_ptr->ctx) {
         llama_free(d_ptr->ctx);
@@ -507,28 +507,28 @@ LLamaModel::~LLamaModel()
     llama_free_model(d_ptr->model);
 }
 
-bool LLamaModel::isModelLoaded() const
+bool LlamaCppBackendImpl::isModelLoaded() const
 {
     return d_ptr->modelLoaded;
 }
 
-size_t LLamaModel::stateSize() const
+size_t LlamaCppBackendImpl::stateSize() const
 {
     return llama_get_state_size(d_ptr->ctx);
 }
 
-size_t LLamaModel::saveState(uint8_t *dest) const
+size_t LlamaCppBackendImpl::saveState(uint8_t *dest) const
 {
     return llama_copy_state_data(d_ptr->ctx, dest);
 }
 
-size_t LLamaModel::restoreState(const uint8_t *src)
+size_t LlamaCppBackendImpl::restoreState(const uint8_t *src)
 {
     // const_cast is required, see: https://github.com/ggerganov/llama.cpp/pull/1540
     return llama_set_state_data(d_ptr->ctx, const_cast<uint8_t*>(src));
 }
 
-std::vector<LLModel::Token> LLamaModel::tokenize(PromptContext &ctx, const std::string &str, bool special)
+std::vector<LLModel::Token> LlamaCppBackendImpl::tokenize(PromptContext &ctx, const std::string &str, bool special)
 {
     bool atStart = m_tokenize_last_token == -1;
     bool insertSpace = atStart || isSpecialToken(m_tokenize_last_token);
@@ -543,13 +543,13 @@ std::vector<LLModel::Token> LLamaModel::tokenize(PromptContext &ctx, const std::
     return fres;
 }
 
-bool LLamaModel::isSpecialToken(Token id) const
+bool LlamaCppBackendImpl::isSpecialToken(Token id) const
 {
     return llama_token_get_attr(d_ptr->model, id)
         & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_USER_DEFINED | LLAMA_TOKEN_ATTR_UNKNOWN);
 }
 
-std::string LLamaModel::tokenToString(Token id) const
+std::string LlamaCppBackendImpl::tokenToString(Token id) const
 {
     std::vector<char> result(8, 0);
     const int n_tokens = llama_token_to_piece(d_ptr->model, id, result.data(), result.size(), 0, true);
@@ -565,7 +565,7 @@ std::string LLamaModel::tokenToString(Token id) const
     return std::string(result.data(), result.size());
 }
 
-LLModel::Token LLamaModel::sampleToken(PromptContext &promptCtx) const
+LLModel::Token LlamaCppBackendImpl::sampleToken(PromptContext &promptCtx) const
 {
     const size_t n_prev_toks = std::min((size_t) promptCtx.repeat_last_n, promptCtx.tokens.size());
     return llama_sample_top_p_top_k(d_ptr->ctx,
@@ -574,7 +574,7 @@ LLModel::Token LLamaModel::sampleToken(PromptContext &promptCtx) const
         promptCtx.repeat_penalty);
 }
 
-bool LLamaModel::evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const
+bool LlamaCppBackendImpl::evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const
 {
     llama_kv_cache_seq_rm(d_ptr->ctx, 0, ctx.n_past, -1);
 
@@ -598,7 +598,7 @@ bool LLamaModel::evalTokens(PromptContext &ctx, const std::vector<int32_t> &toke
     return res == 0;
 }
 
-void LLamaModel::shiftContext(PromptContext &promptCtx)
+void LlamaCppBackendImpl::shiftContext(PromptContext &promptCtx)
 {
     // infinite text generation via context shifting
 
@@ -622,27 +622,27 @@ void LLamaModel::shiftContext(PromptContext &promptCtx)
     promptCtx.n_past = promptCtx.tokens.size();
 }
 
-int32_t LLamaModel::contextLength() const
+int32_t LlamaCppBackendImpl::contextLength() const
 {
     return llama_n_ctx(d_ptr->ctx);
 }
 
-const std::vector<LLModel::Token> &LLamaModel::endTokens() const
+const std::vector<LLModel::Token> &LlamaCppBackendImpl::endTokens() const
 {
     return d_ptr->end_tokens;
 }
 
-bool LLamaModel::shouldAddBOS() const
+bool LlamaCppBackendImpl::shouldAddBOS() const
 {
     return llama_add_bos_token(d_ptr->model);
 }
 
-int32_t LLamaModel::maxContextLength(std::string const &modelPath) const
+int32_t LlamaCppBackendImpl::maxContextLength(std::string const &modelPath) const
 {
     return get_arch_key_u32(modelPath, "context_length");
 }
 
-int32_t LLamaModel::layerCount(std::string const &modelPath) const
+int32_t LlamaCppBackendImpl::layerCount(std::string const &modelPath) const
 {
     return get_arch_key_u32(modelPath, "block_count");
 }
@@ -659,7 +659,7 @@ static const char *getVulkanVendorName(uint32_t vendorID)
 }
 #endif
 
-std::vector<LLModel::GPUDevice> LLamaModel::availableGPUDevices(size_t memoryRequired) const
+std::vector<LLModel::GPUDevice> LlamaCppBackendImpl::availableGPUDevices(size_t memoryRequired) const
 {
 #if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
     size_t count = 0;
@@ -724,7 +724,7 @@ std::vector<LLModel::GPUDevice> LLamaModel::availableGPUDevices(size_t memoryReq
     return {};
 }
 
-bool LLamaModel::initializeGPUDevice(size_t memoryRequired, const std::string &name) const
+bool LlamaCppBackendImpl::initializeGPUDevice(size_t memoryRequired, const std::string &name) const
 {
 #if defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
     auto devices = availableGPUDevices(memoryRequired);
@@ -761,7 +761,7 @@ bool LLamaModel::initializeGPUDevice(size_t memoryRequired, const std::string &n
     return false;
 }
 
-bool LLamaModel::initializeGPUDevice(int device, std::string *unavail_reason) const
+bool LlamaCppBackendImpl::initializeGPUDevice(int device, std::string *unavail_reason) const
 {
 #if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
     (void)unavail_reason;
@@ -779,7 +779,7 @@ bool LLamaModel::initializeGPUDevice(int device, std::string *unavail_reason) co
 #endif
 }
 
-bool LLamaModel::usingGPUDevice() const
+bool LlamaCppBackendImpl::usingGPUDevice() const
 {
     if (!d_ptr->model)
         return false;
@@ -791,12 +791,12 @@ bool LLamaModel::usingGPUDevice() const
     return usingGPU;
 }
 
-const char *LLamaModel::backendName() const
+const char *LlamaCppBackendImpl::backendName() const
 {
     return d_ptr->backend_name;
 }
 
-const char *LLamaModel::gpuDeviceName() const
+const char *LlamaCppBackendImpl::gpuDeviceName() const
 {
     if (usingGPUDevice()) {
 #if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
@@ -832,7 +832,7 @@ static void batch_add_seq(llama_batch &batch, const std::vector<LLModel::Token>
     }
 }
 
-size_t LLamaModel::embeddingSize() const
+size_t LlamaCppBackendImpl::embeddingSize() const
 {
     return llama_n_embd(d_ptr->model);
 }
@@ -895,7 +895,7 @@ static const EmbModelSpec *getEmbedSpec(const std::string &modelName) {
     return it < std::end(specs) ? &it->spec : nullptr;
 }
 
-void LLamaModel::embed(
+void LlamaCppBackendImpl::embed(
     const std::vector<std::string> &texts, float *embeddings, bool isRetrieval, int dimensionality, size_t *tokenCount,
     bool doMean, bool atlas
 ) {
@@ -907,7 +907,7 @@ void LLamaModel::embed(
     embed(texts, embeddings, prefix, dimensionality, tokenCount, doMean, atlas);
 }
 
-void LLamaModel::embed(
+void LlamaCppBackendImpl::embed(
     const std::vector<std::string> &texts, float *embeddings, std::optional<std::string> prefix, int dimensionality,
     size_t *tokenCount, bool doMean, bool atlas, LLModel::EmbedCancelCallback *cancelCb
 ) {
@@ -965,7 +965,7 @@ double getL2NormScale(T *start, T *end)
     return 1.0 / std::max(magnitude, 1e-12);
 }
 
-void LLamaModel::embedInternal(
+void LlamaCppBackendImpl::embedInternal(
     const std::vector<std::string> &texts, float *embeddings, std::string prefix, int dimensionality,
     size_t *tokenCount, bool doMean, bool atlas, LLModel::EmbedCancelCallback *cancelCb, const EmbModelSpec *spec
 ) {
@@ -1223,6 +1223,6 @@ DLL_EXPORT LLModel *construct()
 #ifdef GGML_USE_CUDA
     ggml_backend_cuda_log_set_callback(cuda_log_callback, nullptr);
 #endif
-    return new LLamaModel;
+    return new LlamaCppBackendImpl;
 }
 }
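Note: the final hunk above changes the library's DLL_EXPORT construct() entry point to return a LlamaCppBackendImpl. The sketch below shows how a host process might load one of the renamed llamacpp-* libraries and obtain an LLModel* through that symbol. It is illustrative only: the library filename, the POSIX dlopen usage, and the load_backend helper are assumptions, not part of this commit (the project itself resolves implementations through its Dlhandle wrapper, as the llmodel.cpp hunk further down shows).

#include <dlfcn.h>   // assumption: POSIX host; Windows would need LoadLibrary/GetProcAddress
#include <iostream>

class LLModel;                        // opaque here; the real interface lives in llmodel.h
using construct_fn = LLModel *(*)();  // shape of the DLL_EXPORT construct() entry point

// Hypothetical helper (not from this commit): load a backend library and
// construct the implementation it exports.
static LLModel *load_backend(const char *libpath)
{
    void *handle = dlopen(libpath, RTLD_NOW | RTLD_LOCAL);
    if (!handle) {
        std::cerr << "dlopen failed: " << dlerror() << "\n";
        return nullptr;
    }
    auto construct = reinterpret_cast<construct_fn>(dlsym(handle, "construct"));
    if (!construct) {
        std::cerr << "construct symbol not found\n";
        return nullptr;
    }
    return construct();  // after this commit, a LlamaCppBackendImpl rather than a LLamaModel
}

int main()
{
    // "libllamacpp-cpu.so" is an assumed filename for the renamed CPU variant.
    LLModel *model = load_backend("libllamacpp-cpu.so");
    std::cout << (model ? "backend constructed\n" : "backend unavailable\n");
    return model ? 0 : 1;
}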
gpt4all-backend/llamamodel_impl.h → gpt4all-backend/llamacpp_backend_impl.h (renamed)

@@ -1,8 +1,8 @@
-#ifndef LLAMAMODEL_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
-#error This file is NOT meant to be included outside of llamamodel.cpp. Doing so is DANGEROUS. Be sure to know what you are doing before proceeding to #define LLAMAMODEL_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
+#ifndef LLAMACPP_BACKEND_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
+#error This file is NOT meant to be included outside of llamacpp_backend_impl.cpp. Doing so is DANGEROUS. Be sure to know what you are doing before proceeding to #define LLAMACPP_BACKEND_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
 #endif
-#ifndef LLAMAMODEL_H
-#define LLAMAMODEL_H
+#ifndef LLAMACPP_BACKEND_IMPL_H
+#define LLAMACPP_BACKEND_IMPL_H
 
 #include "llmodel.h"
 
@@ -10,13 +10,13 @@
 #include <string>
 #include <vector>
 
-struct LLamaPrivate;
+struct LlamaPrivate;
 struct EmbModelSpec;
 
-class LLamaModel : public LLModel {
+class LlamaCppBackendImpl : public LLModel {
 public:
-    LLamaModel();
-    ~LLamaModel();
+    LlamaCppBackendImpl();
+    ~LlamaCppBackendImpl();
 
     bool supportsEmbedding() const override { return m_supportsEmbedding; }
     bool supportsCompletion() const override { return m_supportsCompletion; }
@@ -47,7 +47,7 @@ public:
         size_t *tokenCount = nullptr, bool doMean = true, bool atlas = false) override;
 
 private:
-    std::unique_ptr<LLamaPrivate> d_ptr;
+    std::unique_ptr<LlamaPrivate> d_ptr;
     bool m_supportsEmbedding = false;
     bool m_supportsCompletion = false;
 
@@ -69,4 +69,4 @@ protected:
         const EmbModelSpec *spec);
 };
 
-#endif // LLAMAMODEL_H
+#endif // LLAMACPP_BACKEND_IMPL_H
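The d_ptr member kept by this header (now a std::unique_ptr<LlamaPrivate>) follows the pimpl pattern: the private struct is only defined inside the implementation file. Below is a generic, self-contained sketch of that pattern using placeholder names (Backend, BackendPrivate), not the names from this commit:

#include <memory>
#include <string>

struct BackendPrivate;  // defined only in the .cpp, just like LlamaPrivate

class Backend {
public:
    Backend();
    ~Backend();  // declared out-of-line so unique_ptr sees the complete type at destruction
    bool isModelLoaded() const;

private:
    std::unique_ptr<BackendPrivate> d_ptr;
};

// Normally the remainder lives in the matching .cpp file.
struct BackendPrivate {
    std::string modelPath;
    bool modelLoaded = false;
};

Backend::Backend() : d_ptr(new BackendPrivate) {}
Backend::~Backend() = default;
bool Backend::isModelLoaded() const { return d_ptr->modelLoaded; }

int main()
{
    Backend backend;
    return backend.isModelLoaded() ? 0 : 1;  // returns 1 here: nothing has been loaded
}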
gpt4all-backend/llmodel.cpp

@@ -130,7 +130,7 @@ const std::vector<LLModel::Implementation> &LLModel::Implementation::implementat
 
         addCudaSearchPath();
 
-        std::string impl_name_re = "llamamodel-mainline-(cpu|metal|kompute|vulkan|cuda)";
+        std::string impl_name_re = "llamacpp-(cpu|metal|kompute|vulkan|cuda)";
         if (cpu_supports_avx2() == 0) {
             impl_name_re += "-avxonly";
         }
@@ -146,7 +146,10 @@ const std::vector<LLModel::Implementation> &LLModel::Implementation::implementat
             const fs::path &p = f.path();
 
             if (p.extension() != LIB_FILE_EXT) continue;
-            if (!std::regex_search(p.stem().string(), re)) continue;
+            if (!std::regex_search(p.stem().string(), re)) {
+                std::cerr << "did not match regex: " << p.stem().string() << "\n";
+                continue;
+            }
 
             // Add to list if model implementation
             Dlhandle dl;
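The impl_name_re change above is what makes the loader pick up the renamed libraries. A small standalone check of the new pattern against a few example file stems could look like the following; the stems and the hard-coded non-AVX2 assumption are illustrative, not taken from the repository:

#include <iostream>
#include <regex>
#include <string>
#include <vector>

int main()
{
    // Same base pattern as the new impl_name_re; "-avxonly" is appended for
    // CPUs without AVX2, mirroring the hunk above.
    std::string impl_name_re = "llamacpp-(cpu|metal|kompute|vulkan|cuda)";
    bool cpu_has_avx2 = false;  // assumption for the example: a non-AVX2 CPU
    if (!cpu_has_avx2)
        impl_name_re += "-avxonly";
    std::regex re(impl_name_re);

    // Example library stems (file extension already stripped); purely illustrative.
    std::vector<std::string> stems = {
        "llamacpp-cuda-avxonly",            // new naming: matches
        "llamacpp-cpu-avxonly",             // new naming: matches
        "llamamodel-mainline-cpu-avxonly",  // old naming: no longer matches
    };
    for (const auto &stem : stems) {
        std::cout << stem << ": "
                  << (std::regex_search(stem, re) ? "match" : "no match") << "\n";
    }
    return 0;
}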
gpt4all-chat/CMakeLists.txt

@@ -326,18 +326,18 @@ install(
 # to the this component's dir for the finicky qt installer to work
 if (LLMODEL_KOMPUTE)
     set(MODEL_IMPL_TARGETS
-        llamamodel-mainline-kompute
-        llamamodel-mainline-kompute-avxonly
+        llamacpp-kompute
+        llamacpp-kompute-avxonly
     )
 else()
     set(MODEL_IMPL_TARGETS
-        llamamodel-mainline-cpu
-        llamamodel-mainline-cpu-avxonly
+        llamacpp-cpu
+        llamacpp-cpu-avxonly
     )
 endif()
 
 if (APPLE)
-    list(APPEND MODEL_IMPL_TARGETS llamamodel-mainline-metal)
+    list(APPEND MODEL_IMPL_TARGETS llamacpp-metal)
 endif()
 
 install(
@@ -365,12 +365,12 @@ if(WIN32 AND GPT4ALL_SIGN_INSTALL)
 endif()
 
 if (LLMODEL_CUDA)
-    set_property(TARGET llamamodel-mainline-cuda llamamodel-mainline-cuda-avxonly
+    set_property(TARGET llamacpp-cuda llamacpp-cuda-avxonly
                  APPEND PROPERTY INSTALL_RPATH "$ORIGIN")
 
     install(
-        TARGETS llamamodel-mainline-cuda
-                llamamodel-mainline-cuda-avxonly
+        TARGETS llamacpp-cuda
+                llamacpp-cuda-avxonly
         RUNTIME_DEPENDENCY_SET llama-cuda-deps
         LIBRARY DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN} # .so/.dylib
         RUNTIME DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN} # .dll