commit 9808be5e73 (parent 43b6f63589)
Author: Jared Van Bortel <jared@nomic.ai>
Date:   2024-08-06 16:47:58 -04:00

    rename LLamaModel to LlamaCppBackendImpl

    Signed-off-by: Jared Van Bortel <jared@nomic.ai>

7 changed files with 67 additions and 65 deletions

.gitmodules

@@ -1,5 +1,5 @@
 [submodule "llama.cpp-mainline"]
-path = gpt4all-backend/llama.cpp-mainline
+path = gpt4all-backend/llama.cpp
 url = https://github.com/nomic-ai/llama.cpp.git
 branch = master
 [submodule "gpt4all-chat/usearch"]

gpt4all-backend/CMakeLists.txt

@@ -47,7 +47,7 @@ else()
 message(STATUS "Interprocedural optimization support detected")
 endif()
-set(DIRECTORY llama.cpp-mainline)
+set(DIRECTORY llama.cpp)
 include(llama.cpp.cmake)
 set(BUILD_VARIANTS)
@@ -108,7 +108,7 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
 endif()
 # Include GGML
-include_ggml(-mainline-${BUILD_VARIANT})
+include_ggml(-${BUILD_VARIANT})
 # Function for preparing individual implementations
 function(prepare_target TARGET_NAME BASE_LIB)
@@ -127,11 +127,10 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
 endfunction()
 # Add each individual implementations
-add_library(llamamodel-mainline-${BUILD_VARIANT} SHARED
-llamamodel.cpp llmodel_shared.cpp)
-target_compile_definitions(llamamodel-mainline-${BUILD_VARIANT} PRIVATE
+add_library(llamacpp-${BUILD_VARIANT} SHARED llamacpp_backend_impl.cpp)
+target_compile_definitions(llamacpp-${BUILD_VARIANT} PRIVATE
 LLAMA_VERSIONS=>=3 LLAMA_DATE=999999)
-prepare_target(llamamodel-mainline llama-mainline)
+prepare_target(llamacpp llama)
 if (NOT PROJECT_IS_TOP_LEVEL AND BUILD_VARIANT STREQUAL cuda)
 set(CUDAToolkit_BIN_DIR ${CUDAToolkit_BIN_DIR} PARENT_SCOPE)

gpt4all-backend/llamamodel.cpp → gpt4all-backend/llamacpp_backend_impl.cpp

@@ -1,5 +1,5 @@
-#define LLAMAMODEL_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
-#include "llamamodel_impl.h"
+#define LLAMACPP_BACKEND_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
+#include "llamacpp_backend_impl.h"
 #include "llmodel.h"
@@ -232,7 +232,7 @@ cleanup:
 return value;
 }
-struct LLamaPrivate {
+struct LlamaPrivate {
 const std::string modelPath;
 bool modelLoaded = false;
 int device = -1;
@@ -246,8 +246,8 @@ struct LLamaPrivate {
 const char *backend_name = nullptr;
 };
-LLamaModel::LLamaModel()
-: d_ptr(new LLamaPrivate) {}
+LlamaCppBackendImpl::LlamaCppBackendImpl()
+: d_ptr(new LlamaPrivate) {}
 // default hparams (LLaMA 7B)
 struct llama_file_hparams {
@@ -260,7 +260,7 @@ struct llama_file_hparams {
 enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
 };
-size_t LLamaModel::requiredMem(const std::string &modelPath, int n_ctx, int ngl)
+size_t LlamaCppBackendImpl::requiredMem(const std::string &modelPath, int n_ctx, int ngl)
 {
 // TODO(cebtenzzre): update to GGUF
 (void)ngl; // FIXME(cetenzzre): use this value
@@ -285,7 +285,7 @@ size_t LLamaModel::requiredMem(const std::string &modelPath, int n_ctx, int ngl)
 return filesize + est_kvcache_size;
 }
-bool LLamaModel::isModelBlacklisted(const std::string &modelPath) const
+bool LlamaCppBackendImpl::isModelBlacklisted(const std::string &modelPath) const
 {
 auto * ctx = load_gguf(modelPath.c_str());
 if (!ctx) {
@@ -322,7 +322,7 @@ bool LLamaModel::isModelBlacklisted(const std::string &modelPath) const
 return res;
 }
-bool LLamaModel::isEmbeddingModel(const std::string &modelPath) const
+bool LlamaCppBackendImpl::isEmbeddingModel(const std::string &modelPath) const
 {
 bool result = false;
 std::string arch;
@@ -346,7 +346,7 @@ cleanup:
 return result;
 }
-bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
+bool LlamaCppBackendImpl::loadModel(const std::string &modelPath, int n_ctx, int ngl)
 {
 d_ptr->modelLoaded = false;
@@ -488,18 +488,18 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
 return true;
 }
-void LLamaModel::setThreadCount(int32_t n_threads)
+void LlamaCppBackendImpl::setThreadCount(int32_t n_threads)
 {
 d_ptr->n_threads = n_threads;
 llama_set_n_threads(d_ptr->ctx, n_threads, n_threads);
 }
-int32_t LLamaModel::threadCount() const
+int32_t LlamaCppBackendImpl::threadCount() const
 {
 return d_ptr->n_threads;
 }
-LLamaModel::~LLamaModel()
+LlamaCppBackendImpl::~LlamaCppBackendImpl()
 {
 if (d_ptr->ctx) {
 llama_free(d_ptr->ctx);
@@ -507,28 +507,28 @@ LLamaModel::~LLamaModel()
 llama_free_model(d_ptr->model);
 }
-bool LLamaModel::isModelLoaded() const
+bool LlamaCppBackendImpl::isModelLoaded() const
 {
 return d_ptr->modelLoaded;
 }
-size_t LLamaModel::stateSize() const
+size_t LlamaCppBackendImpl::stateSize() const
 {
 return llama_get_state_size(d_ptr->ctx);
 }
-size_t LLamaModel::saveState(uint8_t *dest) const
+size_t LlamaCppBackendImpl::saveState(uint8_t *dest) const
 {
 return llama_copy_state_data(d_ptr->ctx, dest);
 }
-size_t LLamaModel::restoreState(const uint8_t *src)
+size_t LlamaCppBackendImpl::restoreState(const uint8_t *src)
 {
 // const_cast is required, see: https://github.com/ggerganov/llama.cpp/pull/1540
 return llama_set_state_data(d_ptr->ctx, const_cast<uint8_t*>(src));
 }
-std::vector<LLModel::Token> LLamaModel::tokenize(PromptContext &ctx, const std::string &str, bool special)
+std::vector<LLModel::Token> LlamaCppBackendImpl::tokenize(PromptContext &ctx, const std::string &str, bool special)
 {
 bool atStart = m_tokenize_last_token == -1;
 bool insertSpace = atStart || isSpecialToken(m_tokenize_last_token);
@@ -543,13 +543,13 @@ std::vector<LLModel::Token> LLamaModel::tokenize(PromptContext &ctx, const std::
 return fres;
 }
-bool LLamaModel::isSpecialToken(Token id) const
+bool LlamaCppBackendImpl::isSpecialToken(Token id) const
 {
 return llama_token_get_attr(d_ptr->model, id)
 & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_USER_DEFINED | LLAMA_TOKEN_ATTR_UNKNOWN);
 }
-std::string LLamaModel::tokenToString(Token id) const
+std::string LlamaCppBackendImpl::tokenToString(Token id) const
 {
 std::vector<char> result(8, 0);
 const int n_tokens = llama_token_to_piece(d_ptr->model, id, result.data(), result.size(), 0, true);
@@ -565,7 +565,7 @@ std::string LLamaModel::tokenToString(Token id) const
 return std::string(result.data(), result.size());
 }
-LLModel::Token LLamaModel::sampleToken(PromptContext &promptCtx) const
+LLModel::Token LlamaCppBackendImpl::sampleToken(PromptContext &promptCtx) const
 {
 const size_t n_prev_toks = std::min((size_t) promptCtx.repeat_last_n, promptCtx.tokens.size());
 return llama_sample_top_p_top_k(d_ptr->ctx,
@@ -574,7 +574,7 @@ LLModel::Token LLamaModel::sampleToken(PromptContext &promptCtx) const
 promptCtx.repeat_penalty);
 }
-bool LLamaModel::evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const
+bool LlamaCppBackendImpl::evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const
 {
 llama_kv_cache_seq_rm(d_ptr->ctx, 0, ctx.n_past, -1);
@@ -598,7 +598,7 @@ bool LLamaModel::evalTokens(PromptContext &ctx, const std::vector<int32_t> &toke
 return res == 0;
 }
-void LLamaModel::shiftContext(PromptContext &promptCtx)
+void LlamaCppBackendImpl::shiftContext(PromptContext &promptCtx)
 {
 // infinite text generation via context shifting
@@ -622,27 +622,27 @@ void LLamaModel::shiftContext(PromptContext &promptCtx)
 promptCtx.n_past = promptCtx.tokens.size();
 }
-int32_t LLamaModel::contextLength() const
+int32_t LlamaCppBackendImpl::contextLength() const
 {
 return llama_n_ctx(d_ptr->ctx);
 }
-const std::vector<LLModel::Token> &LLamaModel::endTokens() const
+const std::vector<LLModel::Token> &LlamaCppBackendImpl::endTokens() const
 {
 return d_ptr->end_tokens;
 }
-bool LLamaModel::shouldAddBOS() const
+bool LlamaCppBackendImpl::shouldAddBOS() const
 {
 return llama_add_bos_token(d_ptr->model);
 }
-int32_t LLamaModel::maxContextLength(std::string const &modelPath) const
+int32_t LlamaCppBackendImpl::maxContextLength(std::string const &modelPath) const
 {
 return get_arch_key_u32(modelPath, "context_length");
 }
-int32_t LLamaModel::layerCount(std::string const &modelPath) const
+int32_t LlamaCppBackendImpl::layerCount(std::string const &modelPath) const
 {
 return get_arch_key_u32(modelPath, "block_count");
 }
@@ -659,7 +659,7 @@ static const char *getVulkanVendorName(uint32_t vendorID)
 }
 #endif
-std::vector<LLModel::GPUDevice> LLamaModel::availableGPUDevices(size_t memoryRequired) const
+std::vector<LLModel::GPUDevice> LlamaCppBackendImpl::availableGPUDevices(size_t memoryRequired) const
 {
 #if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
 size_t count = 0;
@@ -724,7 +724,7 @@ std::vector<LLModel::GPUDevice> LLamaModel::availableGPUDevices(size_t memoryReq
 return {};
 }
-bool LLamaModel::initializeGPUDevice(size_t memoryRequired, const std::string &name) const
+bool LlamaCppBackendImpl::initializeGPUDevice(size_t memoryRequired, const std::string &name) const
 {
 #if defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
 auto devices = availableGPUDevices(memoryRequired);
@@ -761,7 +761,7 @@ bool LLamaModel::initializeGPUDevice(size_t memoryRequired, const std::string &n
 return false;
 }
-bool LLamaModel::initializeGPUDevice(int device, std::string *unavail_reason) const
+bool LlamaCppBackendImpl::initializeGPUDevice(int device, std::string *unavail_reason) const
 {
 #if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
 (void)unavail_reason;
@@ -779,7 +779,7 @@ bool LLamaModel::initializeGPUDevice(int device, std::string *unavail_reason) co
 #endif
 }
-bool LLamaModel::usingGPUDevice() const
+bool LlamaCppBackendImpl::usingGPUDevice() const
 {
 if (!d_ptr->model)
 return false;
@@ -791,12 +791,12 @@ bool LLamaModel::usingGPUDevice() const
 return usingGPU;
 }
-const char *LLamaModel::backendName() const
+const char *LlamaCppBackendImpl::backendName() const
 {
 return d_ptr->backend_name;
 }
-const char *LLamaModel::gpuDeviceName() const
+const char *LlamaCppBackendImpl::gpuDeviceName() const
 {
 if (usingGPUDevice()) {
 #if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
@@ -832,7 +832,7 @@ static void batch_add_seq(llama_batch &batch, const std::vector<LLModel::Token>
 }
 }
-size_t LLamaModel::embeddingSize() const
+size_t LlamaCppBackendImpl::embeddingSize() const
 {
 return llama_n_embd(d_ptr->model);
 }
@@ -895,7 +895,7 @@ static const EmbModelSpec *getEmbedSpec(const std::string &modelName) {
 return it < std::end(specs) ? &it->spec : nullptr;
 }
-void LLamaModel::embed(
+void LlamaCppBackendImpl::embed(
 const std::vector<std::string> &texts, float *embeddings, bool isRetrieval, int dimensionality, size_t *tokenCount,
 bool doMean, bool atlas
 ) {
@@ -907,7 +907,7 @@ void LLamaModel::embed(
 embed(texts, embeddings, prefix, dimensionality, tokenCount, doMean, atlas);
 }
-void LLamaModel::embed(
+void LlamaCppBackendImpl::embed(
 const std::vector<std::string> &texts, float *embeddings, std::optional<std::string> prefix, int dimensionality,
 size_t *tokenCount, bool doMean, bool atlas, LLModel::EmbedCancelCallback *cancelCb
 ) {
@@ -965,7 +965,7 @@ double getL2NormScale(T *start, T *end)
 return 1.0 / std::max(magnitude, 1e-12);
 }
-void LLamaModel::embedInternal(
+void LlamaCppBackendImpl::embedInternal(
 const std::vector<std::string> &texts, float *embeddings, std::string prefix, int dimensionality,
 size_t *tokenCount, bool doMean, bool atlas, LLModel::EmbedCancelCallback *cancelCb, const EmbModelSpec *spec
 ) {
@@ -1223,6 +1223,6 @@ DLL_EXPORT LLModel *construct()
 #ifdef GGML_USE_CUDA
 ggml_backend_cuda_log_set_callback(cuda_log_callback, nullptr);
 #endif
-return new LLamaModel;
+return new LlamaCppBackendImpl;
 }
 }
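
Note (illustration, not part of this commit): construct() above is the exported factory entry point each llamacpp-<variant> library provides so a host can instantiate the backend without linking against it. A minimal sketch of that handshake, assuming a POSIX host and a hypothetical library file name, and omitting the capability checks the project's real loader in llmodel.cpp performs:

// Illustrative only: load one of the renamed llamacpp-<variant> libraries and
// obtain an LLModel* through the exported construct() symbol shown in the diff.
// The real loader uses the project's Dlhandle wrapper plus extra validation.
#include <dlfcn.h>
#include <cstdio>

class LLModel;  // opaque here; the real definition comes from llmodel.h

int main()
{
    // Hypothetical file name; the actual name depends on platform and build variant.
    void *handle = dlopen("./libllamacpp-cpu.so", RTLD_NOW | RTLD_LOCAL);
    if (!handle) {
        std::fprintf(stderr, "dlopen failed: %s\n", dlerror());
        return 1;
    }

    // construct() is the DLL_EXPORT entry point at the end of llamacpp_backend_impl.cpp.
    using construct_fn = LLModel *(*)();
    auto construct = reinterpret_cast<construct_fn>(dlsym(handle, "construct"));
    if (!construct) {
        std::fprintf(stderr, "dlsym failed: %s\n", dlerror());
        return 1;
    }

    LLModel *model = construct();  // concrete type is now LlamaCppBackendImpl
    std::printf("backend constructed: %p\n", static_cast<void *>(model));
    return 0;
}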

gpt4all-backend/llamamodel_impl.h → gpt4all-backend/llamacpp_backend_impl.h

@@ -1,8 +1,8 @@
-#ifndef LLAMAMODEL_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
-#error This file is NOT meant to be included outside of llamamodel.cpp. Doing so is DANGEROUS. Be sure to know what you are doing before proceeding to #define LLAMAMODEL_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
+#ifndef LLAMACPP_BACKEND_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
+#error This file is NOT meant to be included outside of llamacpp_backend_impl.cpp. Doing so is DANGEROUS. Be sure to know what you are doing before proceeding to #define LLAMACPP_BACKEND_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
 #endif
-#ifndef LLAMAMODEL_H
-#define LLAMAMODEL_H
+#ifndef LLAMACPP_BACKEND_IMPL_H
+#define LLAMACPP_BACKEND_IMPL_H
 #include "llmodel.h"
@@ -10,13 +10,13 @@
 #include <string>
 #include <vector>
-struct LLamaPrivate;
+struct LlamaPrivate;
 struct EmbModelSpec;
-class LLamaModel : public LLModel {
+class LlamaCppBackendImpl : public LLModel {
 public:
-LLamaModel();
-~LLamaModel();
+LlamaCppBackendImpl();
+~LlamaCppBackendImpl();
 bool supportsEmbedding() const override { return m_supportsEmbedding; }
 bool supportsCompletion() const override { return m_supportsCompletion; }
@@ -47,7 +47,7 @@ public:
 size_t *tokenCount = nullptr, bool doMean = true, bool atlas = false) override;
 private:
-std::unique_ptr<LLamaPrivate> d_ptr;
+std::unique_ptr<LlamaPrivate> d_ptr;
 bool m_supportsEmbedding = false;
 bool m_supportsCompletion = false;
@@ -69,4 +69,4 @@ protected:
 const EmbModelSpec *spec);
 };
-#endif // LLAMAMODEL_H
+#endif // LLAMACPP_BACKEND_IMPL_H

gpt4all-backend/llmodel.cpp

@@ -130,7 +130,7 @@ const std::vector<LLModel::Implementation> &LLModel::Implementation::implementat
 addCudaSearchPath();
-std::string impl_name_re = "llamamodel-mainline-(cpu|metal|kompute|vulkan|cuda)";
+std::string impl_name_re = "llamacpp-(cpu|metal|kompute|vulkan|cuda)";
 if (cpu_supports_avx2() == 0) {
 impl_name_re += "-avxonly";
 }
@@ -146,7 +146,10 @@ const std::vector<LLModel::Implementation> &LLModel::Implementation::implementat
 const fs::path &p = f.path();
 if (p.extension() != LIB_FILE_EXT) continue;
-if (!std::regex_search(p.stem().string(), re)) continue;
+if (!std::regex_search(p.stem().string(), re)) {
+std::cerr << "did not match regex: " << p.stem().string() << "\n";
+continue;
+}
 // Add to list if model implementation
 Dlhandle dl;
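
Note (illustration, not part of this commit): the regex change above is what lets the existing discovery loop pick up the renamed llamacpp-<variant> libraries. A minimal, self-contained sketch of that filename filter, assuming a .so extension, the current directory as the search path, and a stand-in for cpu_supports_avx2(); the real loop additionally opens each match with Dlhandle and verifies it is a GPT4All backend:

// Illustrative only: mimic the filename filtering modified in the hunk above.
#include <filesystem>
#include <iostream>
#include <regex>
#include <string>

namespace fs = std::filesystem;

int main()
{
    // After this commit the implementation libraries are named llamacpp-<variant>,
    // with an optional -avxonly suffix on CPUs without AVX2.
    std::string impl_name_re = "llamacpp-(cpu|metal|kompute|vulkan|cuda)";
    bool has_avx2 = true;  // stand-in for cpu_supports_avx2()
    if (!has_avx2)
        impl_name_re += "-avxonly";
    const std::regex re(impl_name_re);

    const std::string lib_ext = ".so";  // stand-in for LIB_FILE_EXT (.dll/.dylib elsewhere)

    for (const auto &f : fs::directory_iterator(".")) {
        const fs::path &p = f.path();
        if (p.extension() != lib_ext)
            continue;
        if (!std::regex_search(p.stem().string(), re)) {
            std::cerr << "did not match regex: " << p.stem().string() << "\n";
            continue;
        }
        std::cout << "candidate backend: " << p.filename().string() << "\n";
    }
    return 0;
}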

gpt4all-chat/CMakeLists.txt

@@ -326,18 +326,18 @@ install(
 # to the this component's dir for the finicky qt installer to work
 if (LLMODEL_KOMPUTE)
 set(MODEL_IMPL_TARGETS
-llamamodel-mainline-kompute
-llamamodel-mainline-kompute-avxonly
+llamacpp-kompute
+llamacpp-kompute-avxonly
 )
 else()
 set(MODEL_IMPL_TARGETS
-llamamodel-mainline-cpu
-llamamodel-mainline-cpu-avxonly
+llamacpp-cpu
+llamacpp-cpu-avxonly
 )
 endif()
 if (APPLE)
-list(APPEND MODEL_IMPL_TARGETS llamamodel-mainline-metal)
+list(APPEND MODEL_IMPL_TARGETS llamacpp-metal)
 endif()
 install(
@@ -365,12 +365,12 @@ if(WIN32 AND GPT4ALL_SIGN_INSTALL)
 endif()
 if (LLMODEL_CUDA)
-set_property(TARGET llamamodel-mainline-cuda llamamodel-mainline-cuda-avxonly
+set_property(TARGET llamacpp-cuda llamacpp-cuda-avxonly
 APPEND PROPERTY INSTALL_RPATH "$ORIGIN")
 install(
-TARGETS llamamodel-mainline-cuda
-llamamodel-mainline-cuda-avxonly
+TARGETS llamacpp-cuda
+llamacpp-cuda-avxonly
 RUNTIME_DEPENDENCY_SET llama-cuda-deps
 LIBRARY DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN} # .so/.dylib
 RUNTIME DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN} # .dll