Latest rebase on llama.cpp with gguf support.

Adam Treat 2023-09-21 12:41:48 -04:00
parent 5f3d739205
commit d90d003a1d
12 changed files with 245 additions and 85 deletions

View File

@@ -345,7 +345,7 @@ void bert_eval(
// embd norm
{
inpL = ggml_norm(ctx0, inpL);
inpL = ggml_norm(ctx0, inpL, 1e-5f);
inpL = ggml_add(ctx0,
ggml_mul(ctx0,
@@ -406,7 +406,7 @@ void bert_eval(
// attention norm
{
cur = ggml_norm(ctx0, cur);
cur = ggml_norm(ctx0, cur, 1e-5f);
cur = ggml_add(ctx0,
ggml_mul(ctx0,
@@ -432,7 +432,7 @@ void bert_eval(
// output norm
{
cur = ggml_norm(ctx0, cur);
cur = ggml_norm(ctx0, cur, 1e-5f);
cur = ggml_add(ctx0,
ggml_mul(ctx0,
@@ -1038,13 +1038,16 @@ DLL_EXPORT const char *get_build_variant() {
return GGML_BUILD_VARIANT;
}
DLL_EXPORT bool magic_match(std::istream& f) {
DLL_EXPORT bool magic_match(const char* fname) {
#if 0
uint32_t magic = 0;
f.read(reinterpret_cast<char*>(&magic), sizeof(magic));
if (magic != 0x62657274) {
return false;
}
return true;
#endif
return false;
}
DLL_EXPORT LLModel *construct() {
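Note: the repeated one-line replacements above (and the matching hunks in the falcon, replit and starcoder files below) track an upstream ggml API change: ggml_norm() now takes the layer-norm epsilon as an explicit argument instead of using a built-in constant. A minimal sketch of the new call shape, assuming the ggml headers from the llama.cpp submodule pinned by this commit; the tensor size, values and 1e-5f epsilon here are illustrative only:

#include "ggml.h"
#include <cstdio>

int main() {
    // small CPU-only context, large enough for a couple of tiny tensors
    struct ggml_init_params ip = { /*.mem_size =*/ 16u*1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false };
    struct ggml_context * ctx = ggml_init(ip);

    struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    for (int i = 0; i < 8; ++i) ((float *) x->data)[i] = (float) i;

    // old API: ggml_norm(ctx, x);  new API: the epsilon is passed explicitly
    struct ggml_tensor * y = ggml_norm(ctx, x, 1e-5f);

    struct ggml_cgraph gf = ggml_build_forward(y);
    ggml_graph_compute_with_ctx(ctx, &gf, /*n_threads =*/ 1);

    printf("normed[0] = %f\n", ((float *) y->data)[0]);
    ggml_free(ctx);
    return 0;
}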

View File

@@ -2,10 +2,11 @@
#define FALCON_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
#include "falcon_impl.h"
#include "llama.h"
#include "llama-util.h"
#include "utils.h"
#include "llmodel_shared.h"
#include <stdio.h>
#include <string.h>
#include <cassert>
#include <cinttypes>
#include <iostream>
@@ -203,22 +204,22 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt_voca
const int n_vocab = hparams.n_vocab;
const int head_dim = hparams.n_embd / hparams.n_head;
ctx_size += ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * n_vocab; // tok_embeddings
ctx_size += ggml_tensor_overhead() + ggml_type_sizef(GGML_TYPE_F32) * n_embd; // output_norm
ctx_size += ggml_tensor_overhead() + ggml_type_sizef(GGML_TYPE_F32) * n_embd; // output_norm_b
ctx_size += ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * n_vocab; // lm_head
ctx_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * n_vocab; // tok_embeddings
ctx_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(GGML_TYPE_F32) * n_embd; // output_norm
ctx_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(GGML_TYPE_F32) * n_embd; // output_norm_b
ctx_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * n_vocab; // lm_head
// if (hparams.version == 40) { // Falcon-40B
// ctx_size += n_layer * ggml_sizeof_tensor_1d(GGML_TYPE_F32, n_embd); // attention_norm
// ctx_size += n_layer * ggml_sizeof_tensor_1d(GGML_TYPE_F32, n_embd); // attention_norm_b
// }
ctx_size += n_layer * (ggml_tensor_overhead() + ggml_type_sizef(GGML_TYPE_F32) * n_embd); // input_layernorm
ctx_size += n_layer * (ggml_tensor_overhead() + ggml_type_sizef(GGML_TYPE_F32) * n_embd); // input_layernorm_b
ctx_size += n_layer * (ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * (n_head_kv * 2 + n_head) * head_dim); // query_key_value
ctx_size += n_layer * (ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * n_embd); // wo
ctx_size += n_layer * (ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * n_ff); // ffn_up
ctx_size += n_layer * (ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_ff * n_embd); // ffn_down
ctx_size += n_layer * (GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(GGML_TYPE_F32) * n_embd); // input_layernorm
ctx_size += n_layer * (GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(GGML_TYPE_F32) * n_embd); // input_layernorm_b
ctx_size += n_layer * (GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * (n_head_kv * 2 + n_head) * head_dim); // query_key_value
ctx_size += n_layer * (GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * n_embd); // wo
ctx_size += n_layer * (GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * n_ff); // ffn_up
ctx_size += n_layer * (GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_ff * n_embd); // ffn_down
printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
}
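The sizing hunk above now pads each tensor's contribution with GGML_MEM_ALIGN on top of ggml_tensor_overhead(), so the pre-computed ctx_size cannot come up short once ggml aligns every allocation inside the context. A small sketch of that accounting pattern; the hyperparameters are illustrative rather than Falcon's real values, and GGML_MEM_ALIGN is defined locally only in case the installed ggml.h does not expose it:

#include "ggml.h"
#include <cstdio>

#ifndef GGML_MEM_ALIGN
#define GGML_MEM_ALIGN 16   // mirrors ggml's internal allocation alignment (assumption)
#endif

// bytes to budget for one 2-D tensor: data + per-tensor metadata + worst-case alignment padding
static size_t tensor_budget_2d(enum ggml_type type, int64_t ne0, int64_t ne1) {
    return GGML_MEM_ALIGN + ggml_tensor_overhead()
         + (size_t)(ggml_type_sizef(type) * (double) ne0 * (double) ne1);
}

int main() {
    const int64_t n_embd  = 4544;   // illustrative
    const int64_t n_vocab = 65024;  // illustrative
    size_t ctx_size = 0;
    ctx_size += tensor_budget_2d(GGML_TYPE_Q4_0, n_embd, n_vocab); // tok_embeddings
    ctx_size += tensor_budget_2d(GGML_TYPE_F32,  n_embd, 1);       // output_norm
    printf("ctx budget = %6.2f MB\n", ctx_size / (1024.0 * 1024.0));
    return 0;
}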
@@ -494,7 +495,7 @@ bool falcon_eval(
// self-attention
{
layernorm_output = ggml_norm(ctx0, inpL);
layernorm_output = ggml_norm(ctx0, inpL, 1e-5f);
layernorm_output = ggml_add(ctx0,
ggml_mul(ctx0,
@@ -653,7 +654,7 @@ bool falcon_eval(
// norm
{
inpL = ggml_norm(ctx0, inpL);
inpL = ggml_norm(ctx0, inpL, 1e-5f);
// inpL = ln_f_g*inpL + ln_f_b
inpL = ggml_add(ctx0,
@@ -680,7 +681,7 @@ bool falcon_eval(
// run the computation
ggml_build_forward_expand(&gf, inpL);
ggml_graph_compute_g4a(model.work_buf, &gf, n_threads);
//if (n_past%100 == 0) {
// ggml_graph_print (&gf);
@@ -954,13 +955,14 @@ DLL_EXPORT const char *get_build_variant() {
return GGML_BUILD_VARIANT;
}
DLL_EXPORT bool magic_match(std::istream& f) {
DLL_EXPORT bool magic_match(const char* fname) {
#if 0
uint32_t magic = 0;
f.read(reinterpret_cast<char*>(&magic), sizeof(magic));
uint32_t version = 0;
f.read(reinterpret_cast<char*>(&version), sizeof(version));
if (magic != FALCON_MAGIC) {
return false;
return false;
}
falcon_hparams hparams;
f.read(reinterpret_cast<char*>(&hparams), sizeof(hparams));
@@ -977,6 +979,8 @@ DLL_EXPORT bool magic_match(std::istream& f) {
return false;
}
return true;
#endif
return false;
}
DLL_EXPORT LLModel *construct() {

@@ -1 +1 @@
Subproject commit 99c5c9a0d834888c33669855f3a1cf425df37dd2
Subproject commit 37a0be313d21f8b61184a3adcaac123353128238

View File

@@ -185,7 +185,7 @@ if (LLAMA_KOMPUTE)
string(REPLACE "." "_" HEADER_FILE_DEFINE "${HEADER_FILE_DEFINE}")
set(OUTPUT_HEADER_FILE "${HEADER_FILE}")
message(STATUS "${HEADER_FILE} generating ${HEADER_FILE_DEFINE}")
if(CMAKE_GENERATOR MATCHES "Visual Studio")
if(CMAKE_GENERATOR MATCHES "Visual Studio")
add_custom_command(
OUTPUT ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
@@ -346,6 +346,13 @@ endif()
# TODO: probably these flags need to be tweaked on some architectures
# feel free to update the Makefile for your architecture and send a pull request or issue
message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
if (MSVC)
string(TOLOWER "${CMAKE_GENERATOR_PLATFORM}" CMAKE_GENERATOR_PLATFORM_LWR)
message(STATUS "CMAKE_GENERATOR_PLATFORM: ${CMAKE_GENERATOR_PLATFORM}")
else ()
set(CMAKE_GENERATOR_PLATFORM_LWR "")
endif ()
if (NOT MSVC)
if (LLAMA_STATIC)
add_link_options(-static)
@@ -361,6 +368,138 @@ if (NOT MSVC)
endif()
endif()
if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") OR ("${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "arm64"))
message(STATUS "ARM detected")
if (MSVC)
add_compile_definitions(__ARM_NEON)
add_compile_definitions(__ARM_FEATURE_FMA)
add_compile_definitions(__ARM_FEATURE_DOTPROD)
# add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) # MSVC doesn't support vdupq_n_f16, vld1q_f16, vst1q_f16
add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead
else()
check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
add_compile_options(-mfp16-format=ieee)
endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
# Raspberry Pi 1, Zero
add_compile_options(-mfpu=neon-fp-armv8 -mno-unaligned-access)
endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
# Raspberry Pi 2
add_compile_options(-mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
# Raspberry Pi 3, 4, Zero 2 (32-bit)
add_compile_options(-mno-unaligned-access)
endif()
endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "^(x86_64|i686|amd64|x64)$" )
message(STATUS "x86 detected")
if (MSVC)
if (LLAMA_AVX512)
add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX512>)
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX512>)
# MSVC has no compile-time flags enabling specific
# AVX512 extensions, neither it defines the
# macros corresponding to the extensions.
# Do it manually.
if (LLAMA_AVX512_VBMI)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
endif()
if (LLAMA_AVX512_VNNI)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
endif()
elseif (LLAMA_AVX2)
add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX2>)
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX2>)
elseif (LLAMA_AVX)
add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX>)
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX>)
endif()
else()
if (LLAMA_F16C)
add_compile_options(-mf16c)
endif()
if (LLAMA_FMA)
add_compile_options(-mfma)
endif()
if (LLAMA_AVX)
add_compile_options(-mavx)
endif()
if (LLAMA_AVX2)
add_compile_options(-mavx2)
endif()
if (LLAMA_AVX512)
add_compile_options(-mavx512f)
add_compile_options(-mavx512bw)
endif()
if (LLAMA_AVX512_VBMI)
add_compile_options(-mavx512vbmi)
endif()
if (LLAMA_AVX512_VNNI)
add_compile_options(-mavx512vnni)
endif()
endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
message(STATUS "PowerPC detected")
add_compile_options(-mcpu=native -mtune=native)
#TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
else()
message(STATUS "Unknown architecture")
endif()
#
# POSIX conformance
#
# clock_gettime came in POSIX.1b (1993)
# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
# posix_memalign came in POSIX.1-2001 / SUSv3
# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985)
add_compile_definitions(_XOPEN_SOURCE=600)
# Somehow in OpenBSD whenever POSIX conformance is specified
# some string functions rely on locale_t availability,
# which was introduced in POSIX.1-2008, forcing us to go higher
if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
remove_definitions(-D_XOPEN_SOURCE=600)
add_compile_definitions(_XOPEN_SOURCE=700)
endif()
# Data types, macros and functions related to controlling CPU affinity and
# some memory allocation are available on Linux through GNU extensions in libc
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
add_compile_definitions(_GNU_SOURCE)
endif()
# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
# and on macOS its availability depends on enabling Darwin extensions
# similarly on DragonFly, enabling BSD extensions is necessary
if (
CMAKE_SYSTEM_NAME MATCHES "Darwin" OR
CMAKE_SYSTEM_NAME MATCHES "iOS" OR
CMAKE_SYSTEM_NAME MATCHES "tvOS" OR
CMAKE_SYSTEM_NAME MATCHES "DragonFly"
)
add_compile_definitions(_DARWIN_C_SOURCE)
endif()
# alloca is a non-standard interface that is not visible on BSDs when
# POSIX conformance is specified, but not all of them provide a clean way
# to enable it in such cases
if (CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
add_compile_definitions(__BSD_VISIBLE)
endif()
if (CMAKE_SYSTEM_NAME MATCHES "NetBSD")
add_compile_definitions(_NETBSD_SOURCE)
endif()
if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
add_compile_definitions(_BSD_SOURCE)
endif()
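The feature-test macros added above make a handful of POSIX interfaces visible on strict libcs; per the comments in this block, clock_gettime() with CLOCK_MONOTONIC and posix_memalign() are the kind of calls that need _XOPEN_SOURCE >= 600 (or a platform-specific macro) to be declared. A tiny sketch under the assumption that it is compiled the same way, e.g. with -D_XOPEN_SOURCE=600 on Linux:

#include <ctime>
#include <cstdlib>
#include <cstdio>

int main() {
    // CLOCK_MONOTONIC: optional monotonic clock from POSIX.1-2001
    struct timespec ts {};
    clock_gettime(CLOCK_MONOTONIC, &ts);

    // posix_memalign: aligned allocation from POSIX.1-2001
    void * buf = nullptr;
    if (posix_memalign(&buf, 64, 1024) == 0) {
        std::printf("t = %ld.%09ld s, 64-byte aligned buffer at %p\n",
                    (long) ts.tv_sec, (long) ts.tv_nsec, buf);
        std::free(buf);
    }
    return 0;
}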
function(include_ggml DIRECTORY SUFFIX WITH_LLAMA)
message(STATUS "Configuring ggml implementation target llama${SUFFIX} in ${CMAKE_CURRENT_SOURCE_DIR}/${DIRECTORY}")
@@ -468,15 +607,14 @@ function(include_ggml DIRECTORY SUFFIX WITH_LLAMA)
if (WITH_LLAMA)
# Backwards compatibility with old llama.cpp versions
set(LLAMA_UTIL_SOURCE_FILE llama-util.h)
# set(LLAMA_UTIL_SOURCE_FILE llama-util.h)
if (NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${DIRECTORY}/${LLAMA_UTIL_SOURCE_FILE})
set(LLAMA_UTIL_SOURCE_FILE llama_util.h)
endif()
add_library(llama${SUFFIX} STATIC
${DIRECTORY}/llama.cpp
${DIRECTORY}/llama.h
${DIRECTORY}/${LLAMA_UTIL_SOURCE_FILE})
${DIRECTORY}/llama.h)
if (LLAMA_METAL AND GGML_METAL_SOURCES)
target_compile_definitions(llama${SUFFIX} PUBLIC GGML_USE_METAL GGML_METAL_NDEBUG)

View File

@@ -226,9 +226,9 @@ size_t LLamaModel::restoreState(const uint8_t *src)
std::vector<LLModel::Token> LLamaModel::tokenize(PromptContext &ctx, const std::string &str) const
{
const bool useBOS = ctx.n_past == 0 && (ctx.tokens.empty() || ctx.tokens.front() != llama_token_bos());
const bool useBOS = ctx.n_past == 0 && (ctx.tokens.empty() || ctx.tokens.front() != llama_token_bos(d_ptr->ctx));
std::vector<LLModel::Token> fres(str.size()+4);
auto fres_len = llama_tokenize(d_ptr->ctx, str.c_str(), fres.data(), fres.size(), useBOS);
auto fres_len = llama_tokenize(d_ptr->ctx, str.c_str(), str.length(), fres.data(), fres.size(), useBOS);
fres.resize(fres_len);
return fres;
}
@@ -250,10 +250,10 @@ LLModel::Token LLamaModel::sampleToken(PromptContext &promptCtx) const
bool LLamaModel::evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const
{
// When we recalculate context we could have erased the original BOS token... we need to replace it
const bool useBOS = ctx.n_past == 0 && (ctx.tokens.empty() || ctx.tokens.front() != llama_token_bos());
const bool useBOS = ctx.n_past == 0 && (ctx.tokens.empty() || ctx.tokens.front() != llama_token_bos(d_ptr->ctx));
if (useBOS) {
std::vector<int32_t> myTokens;
myTokens.push_back(llama_token_bos());
myTokens.push_back(llama_token_bos(d_ptr->ctx));
myTokens.insert(myTokens.end(), tokens.begin(), tokens.end());
ctx.n_past += 1;
return llama_eval(d_ptr->ctx, myTokens.data(), myTokens.size(), ctx.n_past, d_ptr->n_threads) == 0;
@@ -268,7 +268,7 @@ int32_t LLamaModel::contextLength() const
const std::vector<LLModel::Token> &LLamaModel::endTokens() const
{
static const std::vector<LLModel::Token> fres = {llama_token_eos()};
static const std::vector<LLModel::Token> fres = {llama_token_eos(d_ptr->ctx)};
return fres;
}
@@ -351,6 +351,16 @@ bool LLamaModel::usingGPUDevice()
return false;
}
std::string get_arch_name(gguf_context *ctx_gguf) {
std::string arch_name;
const int kid = gguf_find_key(ctx_gguf, "general.architecture");
enum gguf_type ktype = gguf_get_kv_type(ctx_gguf, kid);
if (ktype != (GGUF_TYPE_STRING)) {
throw std::runtime_error("ERROR: Can't get general architecture from gguf file.");
}
return gguf_get_val_str(ctx_gguf, kid);
}
#if defined(_WIN32)
#define DLL_EXPORT __declspec(dllexport)
#else
@@ -370,39 +380,42 @@ DLL_EXPORT const char *get_build_variant() {
return GGML_BUILD_VARIANT;
}
DLL_EXPORT bool magic_match(std::istream& f) {
// Check magic
uint32_t magic = 0;
f.read(reinterpret_cast<char*>(&magic), sizeof(magic));
if (magic != 0x67676a74) return false;
// Check version
uint32_t version = 0;
f.read(reinterpret_cast<char*>(&version), sizeof(version));
if (!(version LLAMA_VERSIONS)) {
DLL_EXPORT bool magic_match(const char * fname) {
struct ggml_context * ctx_meta = NULL;
struct gguf_init_params params = {
/*.no_alloc = */ true,
/*.ctx = */ &ctx_meta,
};
gguf_context *ctx_gguf = gguf_init_from_file(fname, params);
if (!ctx_gguf)
return false;
}
llama_file_hparams hparams;
f.read(reinterpret_cast<char*>(&hparams), sizeof(hparams));
if (!(hparams.n_vocab >= 32000 && hparams.n_vocab <= 32100)) {
return false; // not a llama.
}
bool isValid = gguf_get_version(ctx_gguf) <= 2;
isValid = get_arch_name(ctx_gguf) != "llama" ? false : isValid;
#ifdef GGML_USE_METAL
// Check quant supported on metal
// skip fields
switch(hparams.ftype) {
// currently supported on Metal https://github.com/ggerganov/llama.cpp/blob/ae9663f1887513e152839e91f61c513075a19422/ggml-metal.m#L51-L55
case LLAMA_FTYPE_MOSTLY_F16:
case LLAMA_FTYPE_MOSTLY_Q2_K:
case LLAMA_FTYPE_MOSTLY_Q4_0:
case LLAMA_FTYPE_MOSTLY_Q6_K:
case LLAMA_FTYPE_MOSTLY_Q4_K_S:
case LLAMA_FTYPE_MOSTLY_Q4_K_M:
return true;
default: // unsupported quant-type for Metal
return false;
const int n_tensors = gguf_get_n_tensors(ctx_gguf);
for (int i = 0; i < n_tensors; i++) {
const char * name = gguf_get_tensor_name(ctx_gguf, i);
struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, name);
switch(meta->type) {
// currently supported on Metal https://github.com/ggerganov/llama.cpp/blob/ae9663f1887513e152839e91f61c513075a19422/ggml-metal.m#L51-L55
case LLAMA_FTYPE_MOSTLY_F16:
case LLAMA_FTYPE_MOSTLY_Q2_K:
case LLAMA_FTYPE_MOSTLY_Q4_0:
case LLAMA_FTYPE_MOSTLY_Q6_K:
case LLAMA_FTYPE_MOSTLY_Q4_K_S:
case LLAMA_FTYPE_MOSTLY_Q4_K_M:
break;
default: // unsupported quant-type for Metal
isValid = false;
}
}
#endif
return true;
gguf_free(ctx_gguf);
return isValid;
}
DLL_EXPORT LLModel *construct() {
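get_arch_name() and the rewritten magic_match() above replace the old stream-based magic/version probe with GGUF metadata inspection. A standalone sketch of the same metadata read, assuming the gguf API declared in ggml.h of the llama.cpp submodule pinned by this commit; the default model path is a placeholder:

#include "ggml.h"
#include <cstdio>

int main(int argc, char ** argv) {
    const char * fname = argc > 1 ? argv[1] : "model.gguf"; // placeholder

    struct ggml_context * ctx_meta = NULL;
    struct gguf_init_params params = {
        /*.no_alloc = */ true,      // read metadata only, do not allocate tensor data
        /*.ctx      = */ &ctx_meta,
    };
    struct gguf_context * ctx_gguf = gguf_init_from_file(fname, params);
    if (!ctx_gguf) {
        fprintf(stderr, "not a readable gguf file: %s\n", fname);
        return 1;
    }

    const int kid = gguf_find_key(ctx_gguf, "general.architecture");
    if (kid >= 0 && gguf_get_kv_type(ctx_gguf, kid) == GGUF_TYPE_STRING) {
        printf("gguf v%d, architecture: %s\n",
               gguf_get_version(ctx_gguf), gguf_get_val_str(ctx_gguf, kid));
    }

    gguf_free(ctx_gguf);
    if (ctx_meta) {
        ggml_free(ctx_meta);
    }
    return 0;
}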

View File

@@ -52,7 +52,7 @@ LLModel::Implementation::Implementation(Dlhandle &&dlhandle_)
auto get_build_variant = m_dlhandle->get<const char *()>("get_build_variant");
assert(get_build_variant);
m_buildVariant = get_build_variant();
m_magicMatch = m_dlhandle->get<bool(std::ifstream&)>("magic_match");
m_magicMatch = m_dlhandle->get<bool(const char*)>("magic_match");
assert(m_magicMatch);
m_construct = m_dlhandle->get<LLModel *()>("construct");
assert(m_construct);
@@ -111,10 +111,9 @@ const std::vector<LLModel::Implementation> &LLModel::Implementation::implementat
return *libs;
}
const LLModel::Implementation* LLModel::Implementation::implementation(std::ifstream& f, const std::string& buildVariant) {
const LLModel::Implementation* LLModel::Implementation::implementation(const char *fname, const std::string& buildVariant) {
for (const auto& i : implementationList()) {
f.seekg(0);
if (!i.m_magicMatch(f)) continue;
if (!i.m_magicMatch(fname)) continue;
if (buildVariant != i.m_buildVariant) continue;
return &i;
}
@@ -126,9 +125,6 @@ LLModel *LLModel::Implementation::construct(const std::string &modelPath, std::s
if (!has_at_least_minimal_hardware())
return nullptr;
// Read magic
std::ifstream f(modelPath, std::ios::binary);
if (!f) return nullptr;
// Get correct implementation
const Implementation* impl = nullptr;
@@ -161,10 +157,9 @@ LLModel *LLModel::Implementation::construct(const std::string &modelPath, std::s
buildVariant = "default";
}
}
impl = implementation(f, buildVariant);
impl = implementation(modelPath.c_str(), buildVariant);
if (!impl) return nullptr;
}
f.close();
// Construct and return llmodel implementation
auto fres = impl->m_construct();
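With magic_match() now taking a file name, the loader above no longer opens the model itself before picking a backend; each implementation probes the path on its own. A hedged sketch of how a caller might use the updated API; the model path is a placeholder and the loadModel() call is an assumption based on the llmodel.h interface, not part of this diff:

#include "llmodel.h"
#include <iostream>
#include <string>

int main() {
    const std::string modelPath = "./models/example-model.Q4_0.gguf"; // placeholder

    // construct() now probes backends via magic_match(const char *fname),
    // so the caller no longer has to open an std::ifstream first.
    LLModel * model = LLModel::Implementation::construct(modelPath, "auto");
    if (!model) {
        std::cerr << "no backend accepted " << modelPath << "\n";
        return 1;
    }

    if (!model->loadModel(modelPath)) {     // assumed LLModel API
        std::cerr << "failed to load " << modelPath << "\n";
        delete model;
        return 1;
    }

    std::cout << "model loaded by a matching backend\n";
    delete model;
    return 0;
}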

View File

@@ -27,13 +27,13 @@ public:
static bool isImplementation(const Dlhandle&);
static const std::vector<Implementation>& implementationList();
static const Implementation *implementation(std::ifstream& f, const std::string& buildVariant);
static const Implementation *implementation(const char *fname, const std::string& buildVariant);
static LLModel *construct(const std::string &modelPath, std::string buildVariant = "auto");
static void setImplementationsSearchPath(const std::string& path);
static const std::string& implementationsSearchPath();
private:
bool (*m_magicMatch)(std::ifstream& f);
bool (*m_magicMatch)(const char *fname);
LLModel *(*m_construct)();
private:

View File

@@ -566,7 +566,7 @@ bool replit_eval(replit_model & model, const int n_threads, const int n_past,
// a = self.ln_1(x)
{
cur = ggml_norm(ctx0, inpL);
cur = ggml_norm(ctx0, inpL, 1e-5f);
cur = ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].ln_1_weight, cur), cur);
}
@@ -658,7 +658,7 @@ bool replit_eval(replit_model & model, const int n_threads, const int n_past,
// m = self.ln_2(x)
{
cur = ggml_norm(ctx0, inpL);
cur = ggml_norm(ctx0, inpL, 1e-5f);
cur = ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].ln_2_weight, cur), cur);
}
@@ -682,7 +682,7 @@ bool replit_eval(replit_model & model, const int n_threads, const int n_past,
ggml_set_scratch(ctx0, {0, model.scr0_buf.size, model.scr0_buf.addr, });
// norm
{
inpL = ggml_norm(ctx0, inpL);
inpL = ggml_norm(ctx0, inpL, 1e-5f);
// inpL = ln_f_g*inpL
inpL = ggml_mul(ctx0, ggml_repeat(ctx0, model.ln_f_weight, inpL), inpL);
}
@@ -1002,7 +1002,8 @@ DLL_EXPORT const char *get_build_variant() {
return GGML_BUILD_VARIANT;
}
DLL_EXPORT bool magic_match(std::istream& f) {
DLL_EXPORT bool magic_match(const char *fname) {
#if 0
uint32_t magic = 0;
f.read(reinterpret_cast<char*>(&magic), sizeof(magic));
if (magic != 0x7265706c) return false;
@@ -1027,6 +1028,8 @@ DLL_EXPORT bool magic_match(std::istream& f) {
#else
return true;
#endif
#endif
return false;
}
DLL_EXPORT LLModel *construct() {

View File

@@ -1,10 +1,11 @@
#define STARCODER_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
#include "starcoder_impl.h"
#include "llama.h"
#include "llama-util.h"
#include "utils.h"
#include "llmodel_shared.h"
#include <stdio.h>
#include <string.h>
#include <cassert>
#include <cinttypes>
#include <iostream>
@@ -501,7 +502,7 @@ bool starcoder_eval(
// norm
{
// [ 768, N]
cur = ggml_norm(ctx0, inpL);
cur = ggml_norm(ctx0, inpL, 1e-5f);
// cur = ln_1_g*cur + ln_1_b
// [ 768, N]
@@ -650,7 +651,7 @@ bool starcoder_eval(
{
// norm
{
cur = ggml_norm(ctx0, inpFF);
cur = ggml_norm(ctx0, inpFF, 1e-5f);
// cur = ln_2_g*cur + ln_2_b
// [ 768, N]
@@ -707,7 +708,7 @@ bool starcoder_eval(
// norm
{
// [ 768, N]
inpL = ggml_norm(ctx0, inpL);
inpL = ggml_norm(ctx0, inpL, 1e-5f);
// inpL = ln_f_g*inpL + ln_f_b
// [ 768, N]
@@ -1003,7 +1004,8 @@ DLL_EXPORT const char *get_build_variant() {
return GGML_BUILD_VARIANT;
}
DLL_EXPORT bool magic_match(std::istream& f) {
DLL_EXPORT bool magic_match(const char *fname) {
#if 0
uint32_t magic = 0;
f.read(reinterpret_cast<char*>(&magic), sizeof(magic));
if (magic != STARCODER_MAGIC) {
@@ -1015,6 +1017,8 @@ DLL_EXPORT bool magic_match(std::istream& f) {
return false;
}
return true;
#endif
return false;
}
DLL_EXPORT LLModel *construct() {

View File

@@ -356,10 +356,10 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
emit modelLoadingError(QString("Could not find file for model %1").arg(modelInfo.filename()));
}
if (m_llModelInfo.model)
if (m_llModelInfo.model) {
setModelInfo(modelInfo);
processSystemPrompt();
processSystemPrompt();
}
return m_llModelInfo.model;
}

View File

@@ -189,7 +189,7 @@ Window {
+ "causes include a bad file format, an incomplete or corrupted download, the wrong file "
+ "type, not enough system RAM or an incompatible model type. Here are some suggestions for resolving the problem:"
+ "<br><ul>"
+ "<li>Ensure the model file has a compatible ggml format and type"
+ "<li>Ensure the model file has a compatible format and type"
+ "<li>Check the model file is complete in the download folder"
+ "<li>You can find the download folder in the settings dialog"
+ "<li>If you've sideloaded the model ensure the file is not corrupt by checking md5sum"

View File

@@ -796,7 +796,7 @@ void ModelList::updateModelsFromDirectory()
QString filename = it.fileName();
// All files that end with .bin and have 'ggml' somewhere in the name
if ((filename.endsWith(".bin") && filename.contains("ggml") && !filename.startsWith("incomplete"))
if (((filename.endsWith(".bin") || filename.endsWith(".gguf")) && (/*filename.contains("ggml") ||*/ filename.contains("gguf")) && !filename.startsWith("incomplete"))
|| (filename.endsWith(".txt") && filename.startsWith("chatgpt-"))) {
QString filePath = it.filePath();
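For reference, the new directory-scan condition above keeps *.bin and *.gguf files whose names mention "gguf" (plus the chatgpt-*.txt stubs) and skips everything else, including old ggml .bin files and in-progress downloads. A small sketch of that predicate with illustrative file names:

#include <QString>
#include <QTextStream>
#include <cstdio>

// mirrors the updated filter in ModelList::updateModelsFromDirectory()
static bool looksLikeLocalModelFile(const QString & filename) {
    const bool modelFile = (filename.endsWith(".bin") || filename.endsWith(".gguf"))
        && filename.contains("gguf")
        && !filename.startsWith("incomplete");
    const bool chatgptStub = filename.endsWith(".txt") && filename.startsWith("chatgpt-");
    return modelFile || chatgptStub;
}

int main() {
    QTextStream out(stdout);
    const QString samples[] = {
        QStringLiteral("llama-2-7b.Q4_0.gguf"),   // listed
        QStringLiteral("ggml-old-model.bin"),     // skipped: no "gguf" in the name
        QStringLiteral("incomplete-model.gguf"),  // skipped: download in progress
    };
    for (const QString & name : samples) {
        out << name << " -> " << (looksLikeLocalModelFile(name) ? "listed" : "skipped") << "\n";
    }
    return 0;
}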