Mirror of https://github.com/nomic-ai/gpt4all.git (synced 2024-10-01 01:06:10 -04:00)
commit 9808be5e73 (parent 43b6f63589)

    rename LLamaModel to LlamaCppBackendImpl

    Signed-off-by: Jared Van Bortel <jared@nomic.ai>
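The rename is mechanical: consumers of the backend go through the abstract LLModel interface, so only the concrete class, its private struct, the source and header file names, and the CMake target names change. The sketch below is not part of the commit; it assumes the methods touched in this diff (loadModel, isModelLoaded, setThreadCount, threadCount, backendName) are virtuals declared on the LLModel base, as their override-style definitions here suggest, and that the caller has already obtained an LLModel* from the backend's plugin loader.

    // Hypothetical caller, unchanged by this commit: it never names the
    // concrete class, so LLamaModel -> LlamaCppBackendImpl is invisible here.
    #include "llmodel.h"

    #include <iostream>
    #include <string>

    bool loadAndReport(LLModel *model, const std::string &modelPath) {
        // Signatures as they appear in this diff: loadModel(path, n_ctx, ngl).
        if (!model->loadModel(modelPath, /*n_ctx*/ 2048, /*ngl*/ 100))
            return false;
        model->setThreadCount(8);
        std::cout << "backend: " << model->backendName()
                  << ", threads: " << model->threadCount() << "\n";
        return model->isModelLoaded();
    }

The concrete object itself is still created by each plugin's exported construct() factory, which the last hunk of the implementation file updates to return the renamed class.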
.gitmodules (vendored): 2 changes

@@ -1,5 +1,5 @@
 [submodule "llama.cpp-mainline"]
-    path = gpt4all-backend/llama.cpp-mainline
+    path = gpt4all-backend/llama.cpp
     url = https://github.com/nomic-ai/llama.cpp.git
     branch = master
 [submodule "gpt4all-chat/usearch"]
@@ -47,7 +47,7 @@ else()
     message(STATUS "Interprocedural optimization support detected")
 endif()
 
-set(DIRECTORY llama.cpp-mainline)
+set(DIRECTORY llama.cpp)
 include(llama.cpp.cmake)
 
 set(BUILD_VARIANTS)
@@ -108,7 +108,7 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
     endif()
 
     # Include GGML
-    include_ggml(-mainline-${BUILD_VARIANT})
+    include_ggml(-${BUILD_VARIANT})
 
     # Function for preparing individual implementations
     function(prepare_target TARGET_NAME BASE_LIB)
@@ -127,11 +127,10 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
     endfunction()
 
     # Add each individual implementations
-    add_library(llamamodel-mainline-${BUILD_VARIANT} SHARED
-        llamamodel.cpp llmodel_shared.cpp)
-    target_compile_definitions(llamamodel-mainline-${BUILD_VARIANT} PRIVATE
+    add_library(llamacpp-${BUILD_VARIANT} SHARED llamacpp_backend_impl.cpp)
+    target_compile_definitions(llamacpp-${BUILD_VARIANT} PRIVATE
         LLAMA_VERSIONS=>=3 LLAMA_DATE=999999)
-    prepare_target(llamamodel-mainline llama-mainline)
+    prepare_target(llamacpp llama)
 
     if (NOT PROJECT_IS_TOP_LEVEL AND BUILD_VARIANT STREQUAL cuda)
         set(CUDAToolkit_BIN_DIR ${CUDAToolkit_BIN_DIR} PARENT_SCOPE)
llamamodel.cpp → llamacpp_backend_impl.cpp

@@ -1,5 +1,5 @@
-#define LLAMAMODEL_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
-#include "llamamodel_impl.h"
+#define LLAMACPP_BACKEND_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
+#include "llamacpp_backend_impl.h"
 
 #include "llmodel.h"
 
@@ -232,7 +232,7 @@ cleanup:
     return value;
 }
 
-struct LLamaPrivate {
+struct LlamaPrivate {
     const std::string modelPath;
     bool modelLoaded = false;
     int device = -1;
@@ -246,8 +246,8 @@ struct LLamaPrivate {
     const char *backend_name = nullptr;
 };
 
-LLamaModel::LLamaModel()
-    : d_ptr(new LLamaPrivate) {}
+LlamaCppBackendImpl::LlamaCppBackendImpl()
+    : d_ptr(new LlamaPrivate) {}
 
 // default hparams (LLaMA 7B)
 struct llama_file_hparams {
@@ -260,7 +260,7 @@ struct llama_file_hparams {
     enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
 };
 
-size_t LLamaModel::requiredMem(const std::string &modelPath, int n_ctx, int ngl)
+size_t LlamaCppBackendImpl::requiredMem(const std::string &modelPath, int n_ctx, int ngl)
 {
     // TODO(cebtenzzre): update to GGUF
     (void)ngl; // FIXME(cetenzzre): use this value
@@ -285,7 +285,7 @@ size_t LLamaModel::requiredMem(const std::string &modelPath, int n_ctx, int ngl)
     return filesize + est_kvcache_size;
 }
 
-bool LLamaModel::isModelBlacklisted(const std::string &modelPath) const
+bool LlamaCppBackendImpl::isModelBlacklisted(const std::string &modelPath) const
 {
     auto * ctx = load_gguf(modelPath.c_str());
     if (!ctx) {
@@ -322,7 +322,7 @@ bool LLamaModel::isModelBlacklisted(const std::string &modelPath) const
     return res;
 }
 
-bool LLamaModel::isEmbeddingModel(const std::string &modelPath) const
+bool LlamaCppBackendImpl::isEmbeddingModel(const std::string &modelPath) const
 {
     bool result = false;
     std::string arch;
@@ -346,7 +346,7 @@ cleanup:
     return result;
 }
 
-bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
+bool LlamaCppBackendImpl::loadModel(const std::string &modelPath, int n_ctx, int ngl)
 {
     d_ptr->modelLoaded = false;
 
@@ -488,18 +488,18 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
     return true;
 }
 
-void LLamaModel::setThreadCount(int32_t n_threads)
+void LlamaCppBackendImpl::setThreadCount(int32_t n_threads)
 {
     d_ptr->n_threads = n_threads;
     llama_set_n_threads(d_ptr->ctx, n_threads, n_threads);
 }
 
-int32_t LLamaModel::threadCount() const
+int32_t LlamaCppBackendImpl::threadCount() const
 {
     return d_ptr->n_threads;
 }
 
-LLamaModel::~LLamaModel()
+LlamaCppBackendImpl::~LlamaCppBackendImpl()
 {
     if (d_ptr->ctx) {
         llama_free(d_ptr->ctx);
@@ -507,28 +507,28 @@ LLamaModel::~LLamaModel()
     llama_free_model(d_ptr->model);
 }
 
-bool LLamaModel::isModelLoaded() const
+bool LlamaCppBackendImpl::isModelLoaded() const
 {
     return d_ptr->modelLoaded;
 }
 
-size_t LLamaModel::stateSize() const
+size_t LlamaCppBackendImpl::stateSize() const
 {
     return llama_get_state_size(d_ptr->ctx);
 }
 
-size_t LLamaModel::saveState(uint8_t *dest) const
+size_t LlamaCppBackendImpl::saveState(uint8_t *dest) const
 {
     return llama_copy_state_data(d_ptr->ctx, dest);
 }
 
-size_t LLamaModel::restoreState(const uint8_t *src)
+size_t LlamaCppBackendImpl::restoreState(const uint8_t *src)
 {
     // const_cast is required, see: https://github.com/ggerganov/llama.cpp/pull/1540
     return llama_set_state_data(d_ptr->ctx, const_cast<uint8_t*>(src));
 }
 
-std::vector<LLModel::Token> LLamaModel::tokenize(PromptContext &ctx, const std::string &str, bool special)
+std::vector<LLModel::Token> LlamaCppBackendImpl::tokenize(PromptContext &ctx, const std::string &str, bool special)
 {
     bool atStart = m_tokenize_last_token == -1;
     bool insertSpace = atStart || isSpecialToken(m_tokenize_last_token);
@@ -543,13 +543,13 @@ std::vector<LLModel::Token> LLamaModel::tokenize(PromptContext &ctx, const std::
     return fres;
 }
 
-bool LLamaModel::isSpecialToken(Token id) const
+bool LlamaCppBackendImpl::isSpecialToken(Token id) const
 {
     return llama_token_get_attr(d_ptr->model, id)
         & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_USER_DEFINED | LLAMA_TOKEN_ATTR_UNKNOWN);
 }
 
-std::string LLamaModel::tokenToString(Token id) const
+std::string LlamaCppBackendImpl::tokenToString(Token id) const
 {
     std::vector<char> result(8, 0);
     const int n_tokens = llama_token_to_piece(d_ptr->model, id, result.data(), result.size(), 0, true);
@@ -565,7 +565,7 @@ std::string LLamaModel::tokenToString(Token id) const
     return std::string(result.data(), result.size());
 }
 
-LLModel::Token LLamaModel::sampleToken(PromptContext &promptCtx) const
+LLModel::Token LlamaCppBackendImpl::sampleToken(PromptContext &promptCtx) const
 {
     const size_t n_prev_toks = std::min((size_t) promptCtx.repeat_last_n, promptCtx.tokens.size());
     return llama_sample_top_p_top_k(d_ptr->ctx,
@@ -574,7 +574,7 @@ LLModel::Token LLamaModel::sampleToken(PromptContext &promptCtx) const
         promptCtx.repeat_penalty);
 }
 
-bool LLamaModel::evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const
+bool LlamaCppBackendImpl::evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const
 {
     llama_kv_cache_seq_rm(d_ptr->ctx, 0, ctx.n_past, -1);
 
@@ -598,7 +598,7 @@ bool LLamaModel::evalTokens(PromptContext &ctx, const std::vector<int32_t> &toke
     return res == 0;
 }
 
-void LLamaModel::shiftContext(PromptContext &promptCtx)
+void LlamaCppBackendImpl::shiftContext(PromptContext &promptCtx)
 {
     // infinite text generation via context shifting
 
@@ -622,27 +622,27 @@ void LLamaModel::shiftContext(PromptContext &promptCtx)
     promptCtx.n_past = promptCtx.tokens.size();
 }
 
-int32_t LLamaModel::contextLength() const
+int32_t LlamaCppBackendImpl::contextLength() const
 {
     return llama_n_ctx(d_ptr->ctx);
 }
 
-const std::vector<LLModel::Token> &LLamaModel::endTokens() const
+const std::vector<LLModel::Token> &LlamaCppBackendImpl::endTokens() const
 {
     return d_ptr->end_tokens;
 }
 
-bool LLamaModel::shouldAddBOS() const
+bool LlamaCppBackendImpl::shouldAddBOS() const
 {
     return llama_add_bos_token(d_ptr->model);
 }
 
-int32_t LLamaModel::maxContextLength(std::string const &modelPath) const
+int32_t LlamaCppBackendImpl::maxContextLength(std::string const &modelPath) const
 {
     return get_arch_key_u32(modelPath, "context_length");
 }
 
-int32_t LLamaModel::layerCount(std::string const &modelPath) const
+int32_t LlamaCppBackendImpl::layerCount(std::string const &modelPath) const
 {
     return get_arch_key_u32(modelPath, "block_count");
 }
@@ -659,7 +659,7 @@ static const char *getVulkanVendorName(uint32_t vendorID)
 }
 #endif
 
-std::vector<LLModel::GPUDevice> LLamaModel::availableGPUDevices(size_t memoryRequired) const
+std::vector<LLModel::GPUDevice> LlamaCppBackendImpl::availableGPUDevices(size_t memoryRequired) const
 {
 #if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
     size_t count = 0;
@@ -724,7 +724,7 @@ std::vector<LLModel::GPUDevice> LLamaModel::availableGPUDevices(size_t memoryReq
     return {};
 }
 
-bool LLamaModel::initializeGPUDevice(size_t memoryRequired, const std::string &name) const
+bool LlamaCppBackendImpl::initializeGPUDevice(size_t memoryRequired, const std::string &name) const
 {
 #if defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
     auto devices = availableGPUDevices(memoryRequired);
@@ -761,7 +761,7 @@ bool LLamaModel::initializeGPUDevice(size_t memoryRequired, const std::string &n
     return false;
 }
 
-bool LLamaModel::initializeGPUDevice(int device, std::string *unavail_reason) const
+bool LlamaCppBackendImpl::initializeGPUDevice(int device, std::string *unavail_reason) const
 {
#if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
     (void)unavail_reason;
@@ -779,7 +779,7 @@ bool LLamaModel::initializeGPUDevice(int device, std::string *unavail_reason) co
 #endif
 }
 
-bool LLamaModel::usingGPUDevice() const
+bool LlamaCppBackendImpl::usingGPUDevice() const
 {
     if (!d_ptr->model)
         return false;
@@ -791,12 +791,12 @@ bool LLamaModel::usingGPUDevice() const
     return usingGPU;
 }
 
-const char *LLamaModel::backendName() const
+const char *LlamaCppBackendImpl::backendName() const
 {
     return d_ptr->backend_name;
 }
 
-const char *LLamaModel::gpuDeviceName() const
+const char *LlamaCppBackendImpl::gpuDeviceName() const
 {
     if (usingGPUDevice()) {
 #if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
@@ -832,7 +832,7 @@ static void batch_add_seq(llama_batch &batch, const std::vector<LLModel::Token>
     }
 }
 
-size_t LLamaModel::embeddingSize() const
+size_t LlamaCppBackendImpl::embeddingSize() const
 {
     return llama_n_embd(d_ptr->model);
 }
@@ -895,7 +895,7 @@ static const EmbModelSpec *getEmbedSpec(const std::string &modelName) {
     return it < std::end(specs) ? &it->spec : nullptr;
 }
 
-void LLamaModel::embed(
+void LlamaCppBackendImpl::embed(
     const std::vector<std::string> &texts, float *embeddings, bool isRetrieval, int dimensionality, size_t *tokenCount,
     bool doMean, bool atlas
 ) {
@@ -907,7 +907,7 @@ void LLamaModel::embed(
     embed(texts, embeddings, prefix, dimensionality, tokenCount, doMean, atlas);
 }
 
-void LLamaModel::embed(
+void LlamaCppBackendImpl::embed(
     const std::vector<std::string> &texts, float *embeddings, std::optional<std::string> prefix, int dimensionality,
     size_t *tokenCount, bool doMean, bool atlas, LLModel::EmbedCancelCallback *cancelCb
 ) {
@@ -965,7 +965,7 @@ double getL2NormScale(T *start, T *end)
     return 1.0 / std::max(magnitude, 1e-12);
 }
 
-void LLamaModel::embedInternal(
+void LlamaCppBackendImpl::embedInternal(
     const std::vector<std::string> &texts, float *embeddings, std::string prefix, int dimensionality,
     size_t *tokenCount, bool doMean, bool atlas, LLModel::EmbedCancelCallback *cancelCb, const EmbModelSpec *spec
 ) {
@@ -1223,6 +1223,6 @@ DLL_EXPORT LLModel *construct()
 #ifdef GGML_USE_CUDA
     ggml_backend_cuda_log_set_callback(cuda_log_callback, nullptr);
 #endif
-    return new LLamaModel;
+    return new LlamaCppBackendImpl;
 }
 }
llamamodel_impl.h → llamacpp_backend_impl.h

@@ -1,8 +1,8 @@
-#ifndef LLAMAMODEL_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
-#error This file is NOT meant to be included outside of llamamodel.cpp. Doing so is DANGEROUS. Be sure to know what you are doing before proceeding to #define LLAMAMODEL_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
+#ifndef LLAMACPP_BACKEND_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
+#error This file is NOT meant to be included outside of llamacpp_backend_impl.cpp. Doing so is DANGEROUS. Be sure to know what you are doing before proceeding to #define LLAMACPP_BACKEND_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
 #endif
-#ifndef LLAMAMODEL_H
-#define LLAMAMODEL_H
+#ifndef LLAMACPP_BACKEND_IMPL_H
+#define LLAMACPP_BACKEND_IMPL_H
 
 #include "llmodel.h"
 
@@ -10,13 +10,13 @@
 #include <string>
 #include <vector>
 
-struct LLamaPrivate;
+struct LlamaPrivate;
 struct EmbModelSpec;
 
-class LLamaModel : public LLModel {
+class LlamaCppBackendImpl : public LLModel {
 public:
-    LLamaModel();
-    ~LLamaModel();
+    LlamaCppBackendImpl();
+    ~LlamaCppBackendImpl();
 
     bool supportsEmbedding() const override { return m_supportsEmbedding; }
     bool supportsCompletion() const override { return m_supportsCompletion; }
@@ -47,7 +47,7 @@ public:
         size_t *tokenCount = nullptr, bool doMean = true, bool atlas = false) override;
 
 private:
-    std::unique_ptr<LLamaPrivate> d_ptr;
+    std::unique_ptr<LlamaPrivate> d_ptr;
     bool m_supportsEmbedding = false;
     bool m_supportsCompletion = false;
 
@@ -69,4 +69,4 @@ protected:
         const EmbModelSpec *spec);
 };
 
-#endif // LLAMAMODEL_H
+#endif // LLAMACPP_BACKEND_IMPL_H
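The opt-in guard keeps the concrete backend class private to the plugin: any translation unit other than llamacpp_backend_impl.cpp that includes this header without first defining the renamed macro trips the #error above. A minimal sketch of the only sanctioned include pattern, mirroring the top of the implementation file in this diff:

    // Define the opt-in macro before the include, exactly as
    // llamacpp_backend_impl.cpp does; otherwise the header's #error fires.
    #define LLAMACPP_BACKEND_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
    #include "llamacpp_backend_impl.h"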
@@ -130,7 +130,7 @@ const std::vector<LLModel::Implementation> &LLModel::Implementation::implementat
 
     addCudaSearchPath();
 
-    std::string impl_name_re = "llamamodel-mainline-(cpu|metal|kompute|vulkan|cuda)";
+    std::string impl_name_re = "llamacpp-(cpu|metal|kompute|vulkan|cuda)";
     if (cpu_supports_avx2() == 0) {
         impl_name_re += "-avxonly";
     }
@@ -146,7 +146,10 @@ const std::vector<LLModel::Implementation> &LLModel::Implementation::implementat
             const fs::path &p = f.path();
 
             if (p.extension() != LIB_FILE_EXT) continue;
-            if (!std::regex_search(p.stem().string(), re)) continue;
+            if (!std::regex_search(p.stem().string(), re)) {
+                std::cerr << "did not match regex: " << p.stem().string() << "\n";
+                continue;
+            }
 
             // Add to list if model implementation
             Dlhandle dl;
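The implementation loader keeps only library files whose stem matches impl_name_re, so the regex change above has to agree with the renamed CMake targets (llamacpp-cpu, llamacpp-kompute, llamacpp-cuda, and their -avxonly variants). A standalone sketch of that filtering, with hypothetical stems and not taken from the commit, is:

    // Shows which library stems the new pattern accepts.
    #include <iostream>
    #include <regex>
    #include <string>
    #include <vector>

    int main() {
        std::string impl_name_re = "llamacpp-(cpu|metal|kompute|vulkan|cuda)";
        // On CPUs without AVX2 the loader appends "-avxonly" (see the hunk above).
        const std::regex re(impl_name_re);

        const std::vector<std::string> stems = {
            "llamacpp-cuda",            // new naming: matches
            "llamacpp-kompute-avxonly", // regex_search finds the prefix: matches
            "llamamodel-mainline-cuda", // pre-rename naming: no longer matches
        };
        for (const auto &stem : stems)
            std::cout << stem << " -> "
                      << (std::regex_search(stem, re) ? "match" : "no match") << "\n";
    }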
@@ -326,18 +326,18 @@ install(
 # to the this component's dir for the finicky qt installer to work
 if (LLMODEL_KOMPUTE)
     set(MODEL_IMPL_TARGETS
-        llamamodel-mainline-kompute
-        llamamodel-mainline-kompute-avxonly
+        llamacpp-kompute
+        llamacpp-kompute-avxonly
     )
 else()
     set(MODEL_IMPL_TARGETS
-        llamamodel-mainline-cpu
-        llamamodel-mainline-cpu-avxonly
+        llamacpp-cpu
+        llamacpp-cpu-avxonly
     )
 endif()
 
 if (APPLE)
-    list(APPEND MODEL_IMPL_TARGETS llamamodel-mainline-metal)
+    list(APPEND MODEL_IMPL_TARGETS llamacpp-metal)
 endif()
 
 install(
@@ -365,12 +365,12 @@ if(WIN32 AND GPT4ALL_SIGN_INSTALL)
 endif()
 
 if (LLMODEL_CUDA)
-    set_property(TARGET llamamodel-mainline-cuda llamamodel-mainline-cuda-avxonly
+    set_property(TARGET llamacpp-cuda llamacpp-cuda-avxonly
                  APPEND PROPERTY INSTALL_RPATH "$ORIGIN")
 
     install(
-        TARGETS llamamodel-mainline-cuda
-                llamamodel-mainline-cuda-avxonly
+        TARGETS llamacpp-cuda
+                llamacpp-cuda-avxonly
         RUNTIME_DEPENDENCY_SET llama-cuda-deps
         LIBRARY DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN} # .so/.dylib
         RUNTIME DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN} # .dll