From 9808be5e73268afb5f59879ae9ed4c230c237f5b Mon Sep 17 00:00:00 2001
From: Jared Van Bortel
Date: Tue, 6 Aug 2024 16:47:58 -0400
Subject: [PATCH] rename LLamaModel to LlamaCppBackendImpl

Signed-off-by: Jared Van Bortel
---
 .gitmodules                                   |  2 +-
 gpt4all-backend/CMakeLists.txt                | 11 ++-
 .../{llama.cpp-mainline => llama.cpp}         |  0
 ...amamodel.cpp => llamacpp_backend_impl.cpp} | 76 +++++++++----------
 ...mamodel_impl.h => llamacpp_backend_impl.h} | 20 ++---
 gpt4all-backend/llmodel.cpp                   |  7 +-
 gpt4all-chat/CMakeLists.txt                   | 16 ++--
 7 files changed, 67 insertions(+), 65 deletions(-)
 rename gpt4all-backend/{llama.cpp-mainline => llama.cpp} (100%)
 rename gpt4all-backend/{llamamodel.cpp => llamacpp_backend_impl.cpp} (94%)
 rename gpt4all-backend/{llamamodel_impl.h => llamacpp_backend_impl.h} (83%)

diff --git a/.gitmodules b/.gitmodules
index 98c9a214..1ac9606d 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,5 +1,5 @@
 [submodule "llama.cpp-mainline"]
-	path = gpt4all-backend/llama.cpp-mainline
+	path = gpt4all-backend/llama.cpp
 	url = https://github.com/nomic-ai/llama.cpp.git
 	branch = master
 [submodule "gpt4all-chat/usearch"]
diff --git a/gpt4all-backend/CMakeLists.txt b/gpt4all-backend/CMakeLists.txt
index e6210d74..aaa5e210 100644
--- a/gpt4all-backend/CMakeLists.txt
+++ b/gpt4all-backend/CMakeLists.txt
@@ -47,7 +47,7 @@ else()
     message(STATUS "Interprocedural optimization support detected")
 endif()
 
-set(DIRECTORY llama.cpp-mainline)
+set(DIRECTORY llama.cpp)
 include(llama.cpp.cmake)
 
 set(BUILD_VARIANTS)
@@ -108,7 +108,7 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
     endif()
 
     # Include GGML
-    include_ggml(-mainline-${BUILD_VARIANT})
+    include_ggml(-${BUILD_VARIANT})
 
     # Function for preparing individual implementations
     function(prepare_target TARGET_NAME BASE_LIB)
@@ -127,11 +127,10 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
     endfunction()
 
     # Add each individual implementations
-    add_library(llamamodel-mainline-${BUILD_VARIANT} SHARED
-        llamamodel.cpp llmodel_shared.cpp)
-    target_compile_definitions(llamamodel-mainline-${BUILD_VARIANT} PRIVATE
+    add_library(llamacpp-${BUILD_VARIANT} SHARED llamacpp_backend_impl.cpp)
+    target_compile_definitions(llamacpp-${BUILD_VARIANT} PRIVATE
         LLAMA_VERSIONS=>=3 LLAMA_DATE=999999)
-    prepare_target(llamamodel-mainline llama-mainline)
+    prepare_target(llamacpp llama)
 
     if (NOT PROJECT_IS_TOP_LEVEL AND BUILD_VARIANT STREQUAL cuda)
         set(CUDAToolkit_BIN_DIR ${CUDAToolkit_BIN_DIR} PARENT_SCOPE)
diff --git a/gpt4all-backend/llama.cpp-mainline b/gpt4all-backend/llama.cpp
similarity index 100%
rename from gpt4all-backend/llama.cpp-mainline
rename to gpt4all-backend/llama.cpp
diff --git a/gpt4all-backend/llamamodel.cpp b/gpt4all-backend/llamacpp_backend_impl.cpp
similarity index 94%
rename from gpt4all-backend/llamamodel.cpp
rename to gpt4all-backend/llamacpp_backend_impl.cpp
index f07a05e8..aece51c6 100644
--- a/gpt4all-backend/llamamodel.cpp
+++ b/gpt4all-backend/llamacpp_backend_impl.cpp
@@ -1,5 +1,5 @@
-#define LLAMAMODEL_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
-#include "llamamodel_impl.h"
+#define LLAMACPP_BACKEND_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
+#include "llamacpp_backend_impl.h"
 
 #include "llmodel.h"
 
@@ -232,7 +232,7 @@ cleanup:
     return value;
 }
 
-struct LLamaPrivate {
+struct LlamaPrivate {
     const std::string modelPath;
     bool modelLoaded = false;
     int device = -1;
@@ -246,8 +246,8 @@ struct LLamaPrivate {
     const char *backend_name = nullptr;
 };
 
-LLamaModel::LLamaModel()
-    : d_ptr(new LLamaPrivate) {}
+LlamaCppBackendImpl::LlamaCppBackendImpl()
+    : d_ptr(new LlamaPrivate) {}
 
 // default hparams (LLaMA 7B)
 struct llama_file_hparams {
@@ -260,7 +260,7 @@ struct llama_file_hparams {
     enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
 };
 
-size_t LLamaModel::requiredMem(const std::string &modelPath, int n_ctx, int ngl)
+size_t LlamaCppBackendImpl::requiredMem(const std::string &modelPath, int n_ctx, int ngl)
 {
     // TODO(cebtenzzre): update to GGUF
     (void)ngl; // FIXME(cetenzzre): use this value
@@ -285,7 +285,7 @@ size_t LLamaModel::requiredMem(const std::string &modelPath, int n_ctx, int ngl)
     return filesize + est_kvcache_size;
 }
 
-bool LLamaModel::isModelBlacklisted(const std::string &modelPath) const
+bool LlamaCppBackendImpl::isModelBlacklisted(const std::string &modelPath) const
 {
     auto * ctx = load_gguf(modelPath.c_str());
     if (!ctx) {
@@ -322,7 +322,7 @@ bool LLamaModel::isModelBlacklisted(const std::string &modelPath) const
     return res;
 }
 
-bool LLamaModel::isEmbeddingModel(const std::string &modelPath) const
+bool LlamaCppBackendImpl::isEmbeddingModel(const std::string &modelPath) const
 {
     bool result = false;
     std::string arch;
@@ -346,7 +346,7 @@ cleanup:
     return result;
 }
 
-bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
+bool LlamaCppBackendImpl::loadModel(const std::string &modelPath, int n_ctx, int ngl)
 {
     d_ptr->modelLoaded = false;
 
@@ -488,18 +488,18 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
     return true;
 }
 
-void LLamaModel::setThreadCount(int32_t n_threads)
+void LlamaCppBackendImpl::setThreadCount(int32_t n_threads)
 {
     d_ptr->n_threads = n_threads;
     llama_set_n_threads(d_ptr->ctx, n_threads, n_threads);
 }
 
-int32_t LLamaModel::threadCount() const
+int32_t LlamaCppBackendImpl::threadCount() const
 {
     return d_ptr->n_threads;
 }
 
-LLamaModel::~LLamaModel()
+LlamaCppBackendImpl::~LlamaCppBackendImpl()
 {
     if (d_ptr->ctx) {
         llama_free(d_ptr->ctx);
@@ -507,28 +507,28 @@ LLamaModel::~LLamaModel()
     llama_free_model(d_ptr->model);
 }
 
-bool LLamaModel::isModelLoaded() const
+bool LlamaCppBackendImpl::isModelLoaded() const
 {
     return d_ptr->modelLoaded;
 }
 
-size_t LLamaModel::stateSize() const
+size_t LlamaCppBackendImpl::stateSize() const
 {
     return llama_get_state_size(d_ptr->ctx);
 }
 
-size_t LLamaModel::saveState(uint8_t *dest) const
+size_t LlamaCppBackendImpl::saveState(uint8_t *dest) const
 {
     return llama_copy_state_data(d_ptr->ctx, dest);
 }
 
-size_t LLamaModel::restoreState(const uint8_t *src)
+size_t LlamaCppBackendImpl::restoreState(const uint8_t *src)
 {
     // const_cast is required, see: https://github.com/ggerganov/llama.cpp/pull/1540
     return llama_set_state_data(d_ptr->ctx, const_cast<uint8_t *>(src));
 }
 
-std::vector<LLModel::Token> LLamaModel::tokenize(PromptContext &ctx, const std::string &str, bool special)
+std::vector<LLModel::Token> LlamaCppBackendImpl::tokenize(PromptContext &ctx, const std::string &str, bool special)
 {
     bool atStart = m_tokenize_last_token == -1;
     bool insertSpace = atStart || isSpecialToken(m_tokenize_last_token);
@@ -543,13 +543,13 @@ std::vector<LLModel::Token> LLamaModel::tokenize(PromptContext &ctx, const std::
     return fres;
 }
 
-bool LLamaModel::isSpecialToken(Token id) const
+bool LlamaCppBackendImpl::isSpecialToken(Token id) const
 {
     return llama_token_get_attr(d_ptr->model, id)
         & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_USER_DEFINED | LLAMA_TOKEN_ATTR_UNKNOWN);
 }
 
-std::string LLamaModel::tokenToString(Token id) const
+std::string LlamaCppBackendImpl::tokenToString(Token id) const
 {
     std::vector<char> result(8, 0);
     const int n_tokens = llama_token_to_piece(d_ptr->model, id, result.data(), result.size(), 0, true);
@@ -565,7 +565,7 @@ std::string LLamaModel::tokenToString(Token id) const
     return std::string(result.data(), result.size());
 }
 
-LLModel::Token LLamaModel::sampleToken(PromptContext &promptCtx) const
+LLModel::Token LlamaCppBackendImpl::sampleToken(PromptContext &promptCtx) const
 {
     const size_t n_prev_toks = std::min((size_t) promptCtx.repeat_last_n, promptCtx.tokens.size());
     return llama_sample_top_p_top_k(d_ptr->ctx,
@@ -574,7 +574,7 @@ LLModel::Token LLamaModel::sampleToken(PromptContext &promptCtx) const
             promptCtx.repeat_penalty);
 }
 
-bool LLamaModel::evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const
+bool LlamaCppBackendImpl::evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const
 {
     llama_kv_cache_seq_rm(d_ptr->ctx, 0, ctx.n_past, -1);
 
@@ -598,7 +598,7 @@ bool LLamaModel::evalTokens(PromptContext &ctx, const std::vector<int32_t> &toke
     return res == 0;
 }
 
-void LLamaModel::shiftContext(PromptContext &promptCtx)
+void LlamaCppBackendImpl::shiftContext(PromptContext &promptCtx)
 {
     // infinite text generation via context shifting
 
@@ -622,27 +622,27 @@ void LLamaModel::shiftContext(PromptContext &promptCtx)
     promptCtx.n_past = promptCtx.tokens.size();
 }
 
-int32_t LLamaModel::contextLength() const
+int32_t LlamaCppBackendImpl::contextLength() const
 {
     return llama_n_ctx(d_ptr->ctx);
 }
 
-const std::vector<LLModel::Token> &LLamaModel::endTokens() const
+const std::vector<LLModel::Token> &LlamaCppBackendImpl::endTokens() const
 {
     return d_ptr->end_tokens;
 }
 
-bool LLamaModel::shouldAddBOS() const
+bool LlamaCppBackendImpl::shouldAddBOS() const
 {
     return llama_add_bos_token(d_ptr->model);
 }
 
-int32_t LLamaModel::maxContextLength(std::string const &modelPath) const
+int32_t LlamaCppBackendImpl::maxContextLength(std::string const &modelPath) const
 {
     return get_arch_key_u32(modelPath, "context_length");
 }
 
-int32_t LLamaModel::layerCount(std::string const &modelPath) const
+int32_t LlamaCppBackendImpl::layerCount(std::string const &modelPath) const
 {
     return get_arch_key_u32(modelPath, "block_count");
 }
@@ -659,7 +659,7 @@ static const char *getVulkanVendorName(uint32_t vendorID)
 }
 #endif
 
-std::vector<LLModel::GPUDevice> LLamaModel::availableGPUDevices(size_t memoryRequired) const
+std::vector<LLModel::GPUDevice> LlamaCppBackendImpl::availableGPUDevices(size_t memoryRequired) const
 {
 #if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
     size_t count = 0;
@@ -724,7 +724,7 @@ std::vector<LLModel::GPUDevice> LLamaModel::availableGPUDevices(size_t memoryReq
     return {};
 }
 
-bool LLamaModel::initializeGPUDevice(size_t memoryRequired, const std::string &name) const
+bool LlamaCppBackendImpl::initializeGPUDevice(size_t memoryRequired, const std::string &name) const
 {
 #if defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
     auto devices = availableGPUDevices(memoryRequired);
@@ -761,7 +761,7 @@ bool LLamaModel::initializeGPUDevice(size_t memoryRequired, const std::string &n
     return false;
 }
 
-bool LLamaModel::initializeGPUDevice(int device, std::string *unavail_reason) const
+bool LlamaCppBackendImpl::initializeGPUDevice(int device, std::string *unavail_reason) const
 {
 #if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
     (void)unavail_reason;
@@ -779,7 +779,7 @@ bool LLamaModel::initializeGPUDevice(int device, std::string *unavail_reason) co
 #endif
 }
 
-bool LLamaModel::usingGPUDevice() const
+bool LlamaCppBackendImpl::usingGPUDevice() const
 {
     if (!d_ptr->model)
         return false;
@@ -791,12 +791,12 @@ bool LLamaModel::usingGPUDevice() const
     return usingGPU;
 }
 
-const char *LLamaModel::backendName() const
+const char *LlamaCppBackendImpl::backendName() const
 {
     return d_ptr->backend_name;
 }
 
-const char *LLamaModel::gpuDeviceName() const
+const char *LlamaCppBackendImpl::gpuDeviceName() const
 {
     if (usingGPUDevice()) {
 #if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
@@ -832,7 +832,7 @@ static void batch_add_seq(llama_batch &batch, const std::vector<LLModel::Token>
     }
 }
 
-size_t LLamaModel::embeddingSize() const
+size_t LlamaCppBackendImpl::embeddingSize() const
 {
     return llama_n_embd(d_ptr->model);
 }
@@ -895,7 +895,7 @@ static const EmbModelSpec *getEmbedSpec(const std::string &modelName) {
     return it < std::end(specs) ? &it->spec : nullptr;
 }
 
-void LLamaModel::embed(
+void LlamaCppBackendImpl::embed(
     const std::vector<std::string> &texts, float *embeddings, bool isRetrieval, int dimensionality,
     size_t *tokenCount, bool doMean, bool atlas
 ) {
@@ -907,7 +907,7 @@ void LLamaModel::embed(
     embed(texts, embeddings, prefix, dimensionality, tokenCount, doMean, atlas);
 }
 
-void LLamaModel::embed(
+void LlamaCppBackendImpl::embed(
     const std::vector<std::string> &texts, float *embeddings, std::optional<std::string> prefix, int dimensionality,
     size_t *tokenCount, bool doMean, bool atlas, LLModel::EmbedCancelCallback *cancelCb
 ) {
@@ -965,7 +965,7 @@ double getL2NormScale(T *start, T *end)
     return 1.0 / std::max(magnitude, 1e-12);
 }
 
-void LLamaModel::embedInternal(
+void LlamaCppBackendImpl::embedInternal(
     const std::vector<std::string> &texts, float *embeddings, std::string prefix, int dimensionality,
     size_t *tokenCount, bool doMean, bool atlas, LLModel::EmbedCancelCallback *cancelCb, const EmbModelSpec *spec
 ) {
@@ -1223,6 +1223,6 @@ DLL_EXPORT LLModel *construct()
 #ifdef GGML_USE_CUDA
     ggml_backend_cuda_log_set_callback(cuda_log_callback, nullptr);
 #endif
-    return new LLamaModel;
+    return new LlamaCppBackendImpl;
 }
 }
diff --git a/gpt4all-backend/llamamodel_impl.h b/gpt4all-backend/llamacpp_backend_impl.h
similarity index 83%
rename from gpt4all-backend/llamamodel_impl.h
rename to gpt4all-backend/llamacpp_backend_impl.h
index 7c698ffa..5923572f 100644
--- a/gpt4all-backend/llamamodel_impl.h
+++ b/gpt4all-backend/llamacpp_backend_impl.h
@@ -1,8 +1,8 @@
-#ifndef LLAMAMODEL_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
-#error This file is NOT meant to be included outside of llamamodel.cpp. Doing so is DANGEROUS. Be sure to know what you are doing before proceeding to #define LLAMAMODEL_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
+#ifndef LLAMACPP_BACKEND_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
+#error This file is NOT meant to be included outside of llamacpp_backend_impl.cpp. Doing so is DANGEROUS. Be sure to know what you are doing before proceeding to #define LLAMACPP_BACKEND_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
 #endif
-#ifndef LLAMAMODEL_H
-#define LLAMAMODEL_H
+#ifndef LLAMACPP_BACKEND_IMPL_H
+#define LLAMACPP_BACKEND_IMPL_H
 
 #include "llmodel.h"
 
@@ -10,13 +10,13 @@
 #include <string>
 #include <vector>
 
-struct LLamaPrivate;
+struct LlamaPrivate;
 struct EmbModelSpec;
 
-class LLamaModel : public LLModel {
+class LlamaCppBackendImpl : public LLModel {
 public:
-    LLamaModel();
-    ~LLamaModel();
+    LlamaCppBackendImpl();
+    ~LlamaCppBackendImpl();
 
     bool supportsEmbedding() const override { return m_supportsEmbedding; }
     bool supportsCompletion() const override { return m_supportsCompletion; }
@@ -47,7 +47,7 @@ public:
                size_t *tokenCount = nullptr, bool doMean = true, bool atlas = false) override;
 
 private:
-    std::unique_ptr<LLamaPrivate> d_ptr;
+    std::unique_ptr<LlamaPrivate> d_ptr;
     bool m_supportsEmbedding = false;
     bool m_supportsCompletion = false;
 
@@ -69,4 +69,4 @@ protected:
                        const EmbModelSpec *spec);
 };
 
-#endif // LLAMAMODEL_H
+#endif // LLAMACPP_BACKEND_IMPL_H
diff --git a/gpt4all-backend/llmodel.cpp b/gpt4all-backend/llmodel.cpp
index 1acf0642..7b18004a 100644
--- a/gpt4all-backend/llmodel.cpp
+++ b/gpt4all-backend/llmodel.cpp
@@ -130,7 +130,7 @@ const std::vector<LLModel::Implementation> &LLModel::Implementation::implementat
 
         addCudaSearchPath();
 
-        std::string impl_name_re = "llamamodel-mainline-(cpu|metal|kompute|vulkan|cuda)";
+        std::string impl_name_re = "llamacpp-(cpu|metal|kompute|vulkan|cuda)";
         if (cpu_supports_avx2() == 0) {
             impl_name_re += "-avxonly";
         }
@@ -146,7 +146,10 @@ const std::vector<LLModel::Implementation> &LLModel::Implementation::implementat
             const fs::path &p = f.path();
 
             if (p.extension() != LIB_FILE_EXT) continue;
-            if (!std::regex_search(p.stem().string(), re)) continue;
+            if (!std::regex_search(p.stem().string(), re)) {
+                std::cerr << "did not match regex: " << p.stem().string() << "\n";
+                continue;
+            }
 
             // Add to list if model implementation
             Dlhandle dl;
diff --git a/gpt4all-chat/CMakeLists.txt b/gpt4all-chat/CMakeLists.txt
index 07acef15..bada68c3 100644
--- a/gpt4all-chat/CMakeLists.txt
+++ b/gpt4all-chat/CMakeLists.txt
@@ -326,18 +326,18 @@ install(
 # to the this component's dir for the finicky qt installer to work
 if (LLMODEL_KOMPUTE)
     set(MODEL_IMPL_TARGETS
-        llamamodel-mainline-kompute
-        llamamodel-mainline-kompute-avxonly
+        llamacpp-kompute
+        llamacpp-kompute-avxonly
     )
 else()
     set(MODEL_IMPL_TARGETS
-        llamamodel-mainline-cpu
-        llamamodel-mainline-cpu-avxonly
+        llamacpp-cpu
+        llamacpp-cpu-avxonly
     )
 endif()
 
 if (APPLE)
-    list(APPEND MODEL_IMPL_TARGETS llamamodel-mainline-metal)
+    list(APPEND MODEL_IMPL_TARGETS llamacpp-metal)
 endif()
 
 install(
@@ -365,12 +365,12 @@ if(WIN32 AND GPT4ALL_SIGN_INSTALL)
 endif()
 
 if (LLMODEL_CUDA)
-    set_property(TARGET llamamodel-mainline-cuda llamamodel-mainline-cuda-avxonly
+    set_property(TARGET llamacpp-cuda llamacpp-cuda-avxonly
                  APPEND PROPERTY INSTALL_RPATH "$ORIGIN")
 
     install(
-        TARGETS llamamodel-mainline-cuda
-                llamamodel-mainline-cuda-avxonly
+        TARGETS llamacpp-cuda
+                llamacpp-cuda-avxonly
         RUNTIME_DEPENDENCY_SET llama-cuda-deps
         LIBRARY DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN} # .so/.dylib
         RUNTIME DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN} # .dll