mirror of https://github.com/nomic-ai/gpt4all.git (synced 2024-10-01 01:06:10 -04:00)

commit d90d003a1d (parent 5f3d739205)

    Latest rebase on llama.cpp with gguf support.
@@ -345,7 +345,7 @@ void bert_eval(

     // embd norm
     {
-        inpL = ggml_norm(ctx0, inpL);
+        inpL = ggml_norm(ctx0, inpL, 1e-5f);

         inpL = ggml_add(ctx0,
                 ggml_mul(ctx0,
@@ -406,7 +406,7 @@ void bert_eval(

         // attention norm
         {
-            cur = ggml_norm(ctx0, cur);
+            cur = ggml_norm(ctx0, cur, 1e-5f);

            cur = ggml_add(ctx0,
                    ggml_mul(ctx0,
@@ -432,7 +432,7 @@ void bert_eval(

     // output norm
     {
-        cur = ggml_norm(ctx0, cur);
+        cur = ggml_norm(ctx0, cur, 1e-5f);

         cur = ggml_add(ctx0,
                 ggml_mul(ctx0,
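The change repeated throughout these eval graphs is mechanical: the rebased ggml gives ggml_norm an explicit layer-norm epsilon parameter instead of baking the constant into the implementation, and every caller now passes 1e-5f. A minimal sketch of the two signatures (an assumption from the pre- and post-rebase ggml headers, not shown in this diff):

    // before the rebase: epsilon was hard-coded inside ggml_norm
    // struct ggml_tensor * ggml_norm(struct ggml_context * ctx, struct ggml_tensor * a);
    // after the rebase: the caller supplies it
    // struct ggml_tensor * ggml_norm(struct ggml_context * ctx, struct ggml_tensor * a, float eps);
    inpL = ggml_norm(ctx0, inpL, 1e-5f);  // 1e-5f matches the epsilon these models were trained with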
@@ -1038,13 +1038,16 @@ DLL_EXPORT const char *get_build_variant() {
     return GGML_BUILD_VARIANT;
 }

-DLL_EXPORT bool magic_match(std::istream& f) {
+DLL_EXPORT bool magic_match(const char* fname) {
+#if 0
     uint32_t magic = 0;
     f.read(reinterpret_cast<char*>(&magic), sizeof(magic));
     if (magic != 0x62657274) {
         return false;
     }
     return true;
+#endif
+    return false;
 }

 DLL_EXPORT LLModel *construct() {
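Note the shape of this change, which repeats for every non-llama backend in this commit: magic_match switches from sniffing magic bytes out of a shared std::istream to receiving the model's file path, and the old byte-level check is fenced off with #if 0, so the function simply reports "not mine". A sketch of what a caller now sees (the path is a placeholder):

    // With the legacy ggml loader disabled, the bert backend opts out of dispatch:
    bool matched = magic_match("/models/ggml-bert.bin");  // always false until re-enabled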
@@ -2,10 +2,11 @@
 #define FALCON_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
 #include "falcon_impl.h"
 #include "llama.h"
-#include "llama-util.h"
+#include "utils.h"
+#include "llmodel_shared.h"

 #include <stdio.h>
 #include <string.h>
 #include <cassert>
 #include <cinttypes>
 #include <iostream>
@@ -203,22 +204,22 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt_voca
     const int n_vocab = hparams.n_vocab;
     const int head_dim = hparams.n_embd / hparams.n_head;

-    ctx_size += ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * n_vocab; // tok_embeddings
-    ctx_size += ggml_tensor_overhead() + ggml_type_sizef(GGML_TYPE_F32) * n_embd; // output_norm
-    ctx_size += ggml_tensor_overhead() + ggml_type_sizef(GGML_TYPE_F32) * n_embd; // output_norm_b
-    ctx_size += ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * n_vocab; // lm_head
+    ctx_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * n_vocab; // tok_embeddings
+    ctx_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(GGML_TYPE_F32) * n_embd; // output_norm
+    ctx_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(GGML_TYPE_F32) * n_embd; // output_norm_b
+    ctx_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * n_vocab; // lm_head

     // if (hparams.version == 40) { // Falcon-40B
     //     ctx_size += n_layer * ggml_sizeof_tensor_1d(GGML_TYPE_F32, n_embd); // attention_norm
     //     ctx_size += n_layer * ggml_sizeof_tensor_1d(GGML_TYPE_F32, n_embd); // attention_norm_b
     // }
-    ctx_size += n_layer * (ggml_tensor_overhead() + ggml_type_sizef(GGML_TYPE_F32) * n_embd); // input_layernorm
-    ctx_size += n_layer * (ggml_tensor_overhead() + ggml_type_sizef(GGML_TYPE_F32) * n_embd); // input_layernorm_b
-    ctx_size += n_layer * (ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * (n_head_kv * 2 + n_head) * head_dim); // query_key_value
-    ctx_size += n_layer * (ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * n_embd); // wo
-    ctx_size += n_layer * (ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * n_ff); // ffn_up
-    ctx_size += n_layer * (ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_ff * n_embd); // ffn_down
+    ctx_size += n_layer * (GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(GGML_TYPE_F32) * n_embd); // input_layernorm
+    ctx_size += n_layer * (GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(GGML_TYPE_F32) * n_embd); // input_layernorm_b
+    ctx_size += n_layer * (GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * (n_head_kv * 2 + n_head) * head_dim); // query_key_value
+    ctx_size += n_layer * (GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * n_embd); // wo
+    ctx_size += n_layer * (GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * n_ff); // ffn_up
+    ctx_size += n_layer * (GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_ff * n_embd); // ffn_down

     printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
 }
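The ctx_size change is an accounting fix: each tensor allocated inside a ggml context may be padded up to the allocator's alignment boundary, so the budget now reserves GGML_MEM_ALIGN extra bytes per tensor on top of the data and per-tensor bookkeeping. A worked sketch with illustrative numbers (the 16-byte alignment, ~256-byte overhead, and sizes here are assumptions for the example, not values from this diff):

    // one F32 vector of n_embd floats, plus bookkeeping, plus alignment slack
    size_t n_embd     = 4544;                    // e.g. Falcon-7B hidden size
    size_t data_bytes = sizeof(float) * n_embd;  // ggml_type_sizef(GGML_TYPE_F32) * n_embd
    size_t budget     = 16 /* GGML_MEM_ALIGN */
                      + 256 /* ~ggml_tensor_overhead() */
                      + data_bytes;              // per-tensor reservation after this commit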
@@ -494,7 +495,7 @@ bool falcon_eval(

         // self-attention
         {
-            layernorm_output = ggml_norm(ctx0, inpL);
+            layernorm_output = ggml_norm(ctx0, inpL, 1e-5f);

            layernorm_output = ggml_add(ctx0,
                    ggml_mul(ctx0,
@@ -653,7 +654,7 @@ bool falcon_eval(

     // norm
     {
-        inpL = ggml_norm(ctx0, inpL);
+        inpL = ggml_norm(ctx0, inpL, 1e-5f);

         // inpL = ln_f_g*inpL + ln_f_b
         inpL = ggml_add(ctx0,
@@ -680,7 +681,7 @@ bool falcon_eval(
     // run the computation
     ggml_build_forward_expand(&gf, inpL);
     ggml_graph_compute_g4a(model.work_buf, &gf, n_threads);


     //if (n_past%100 == 0) {
     //    ggml_graph_print (&gf);
@@ -954,13 +955,14 @@ DLL_EXPORT const char *get_build_variant() {
     return GGML_BUILD_VARIANT;
 }

-DLL_EXPORT bool magic_match(std::istream& f) {
+DLL_EXPORT bool magic_match(const char* fname) {
+#if 0
     uint32_t magic = 0;
     f.read(reinterpret_cast<char*>(&magic), sizeof(magic));
     uint32_t version = 0;
     f.read(reinterpret_cast<char*>(&version), sizeof(version));
     if (magic != FALCON_MAGIC) {
         return false;
     }
     falcon_hparams hparams;
     f.read(reinterpret_cast<char*>(&hparams), sizeof(hparams));
@@ -977,6 +979,8 @@ DLL_EXPORT bool magic_match(std::istream& f) {
         return false;
     }
     return true;
+#endif
+    return false;
 }

 DLL_EXPORT LLModel *construct() {
@@ -1 +1 @@
-Subproject commit 99c5c9a0d834888c33669855f3a1cf425df37dd2
+Subproject commit 37a0be313d21f8b61184a3adcaac123353128238
@@ -185,7 +185,7 @@ if (LLAMA_KOMPUTE)
         string(REPLACE "." "_" HEADER_FILE_DEFINE "${HEADER_FILE_DEFINE}")
         set(OUTPUT_HEADER_FILE "${HEADER_FILE}")
         message(STATUS "${HEADER_FILE} generating ${HEADER_FILE_DEFINE}")
-            if(CMAKE_GENERATOR MATCHES "Visual Studio")
+        if(CMAKE_GENERATOR MATCHES "Visual Studio")
         add_custom_command(
             OUTPUT ${OUTPUT_HEADER_FILE}
             COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
@@ -346,6 +346,13 @@ endif()
 # TODO: probably these flags need to be tweaked on some architectures
 # feel free to update the Makefile for your architecture and send a pull request or issue
+message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
+if (MSVC)
+    string(TOLOWER "${CMAKE_GENERATOR_PLATFORM}" CMAKE_GENERATOR_PLATFORM_LWR)
+    message(STATUS "CMAKE_GENERATOR_PLATFORM: ${CMAKE_GENERATOR_PLATFORM}")
+else ()
+    set(CMAKE_GENERATOR_PLATFORM_LWR "")
+endif ()

 if (NOT MSVC)
     if (LLAMA_STATIC)
         add_link_options(-static)
@@ -361,6 +368,138 @@ if (NOT MSVC)
     endif()
 endif()

+if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") OR ("${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "arm64"))
+    message(STATUS "ARM detected")
+    if (MSVC)
+        add_compile_definitions(__ARM_NEON)
+        add_compile_definitions(__ARM_FEATURE_FMA)
+        add_compile_definitions(__ARM_FEATURE_DOTPROD)
+        # add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) # MSVC doesn't support vdupq_n_f16, vld1q_f16, vst1q_f16
+        add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead
+    else()
+        check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
+        if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
+            add_compile_options(-mfp16-format=ieee)
+        endif()
+        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
+            # Raspberry Pi 1, Zero
+            add_compile_options(-mfpu=neon-fp-armv8 -mno-unaligned-access)
+        endif()
+        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
+            # Raspberry Pi 2
+            add_compile_options(-mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
+        endif()
+        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
+            # Raspberry Pi 3, 4, Zero 2 (32-bit)
+            add_compile_options(-mno-unaligned-access)
+        endif()
+    endif()
+elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "^(x86_64|i686|amd64|x64)$" )
+    message(STATUS "x86 detected")
+    if (MSVC)
+        if (LLAMA_AVX512)
+            add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX512>)
+            add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX512>)
+            # MSVC has no compile-time flags enabling specific
+            # AVX512 extensions, neither it defines the
+            # macros corresponding to the extensions.
+            # Do it manually.
+            if (LLAMA_AVX512_VBMI)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
+            endif()
+            if (LLAMA_AVX512_VNNI)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
+            endif()
+        elseif (LLAMA_AVX2)
+            add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX2>)
+            add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX2>)
+        elseif (LLAMA_AVX)
+            add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX>)
+            add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX>)
+        endif()
+    else()
+        if (LLAMA_F16C)
+            add_compile_options(-mf16c)
+        endif()
+        if (LLAMA_FMA)
+            add_compile_options(-mfma)
+        endif()
+        if (LLAMA_AVX)
+            add_compile_options(-mavx)
+        endif()
+        if (LLAMA_AVX2)
+            add_compile_options(-mavx2)
+        endif()
+        if (LLAMA_AVX512)
+            add_compile_options(-mavx512f)
+            add_compile_options(-mavx512bw)
+        endif()
+        if (LLAMA_AVX512_VBMI)
+            add_compile_options(-mavx512vbmi)
+        endif()
+        if (LLAMA_AVX512_VNNI)
+            add_compile_options(-mavx512vnni)
+        endif()
+    endif()
+elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
+    message(STATUS "PowerPC detected")
+    add_compile_options(-mcpu=native -mtune=native)
+    #TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
+else()
+    message(STATUS "Unknown architecture")
+endif()
+
+#
+# POSIX conformance
+#
+
+# clock_gettime came in POSIX.1b (1993)
+# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
+# posix_memalign came in POSIX.1-2001 / SUSv3
+# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985)
+add_compile_definitions(_XOPEN_SOURCE=600)
+
+# Somehow in OpenBSD whenever POSIX conformance is specified
+# some string functions rely on locale_t availability,
+# which was introduced in POSIX.1-2008, forcing us to go higher
+if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
+    remove_definitions(-D_XOPEN_SOURCE=600)
+    add_compile_definitions(_XOPEN_SOURCE=700)
+endif()
+
+# Data types, macros and functions related to controlling CPU affinity and
+# some memory allocation are available on Linux through GNU extensions in libc
+if (CMAKE_SYSTEM_NAME MATCHES "Linux")
+    add_compile_definitions(_GNU_SOURCE)
+endif()
+
+# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
+# and on macOS its availability depends on enabling Darwin extensions
+# similarly on DragonFly, enabling BSD extensions is necessary
+if (
+    CMAKE_SYSTEM_NAME MATCHES "Darwin" OR
+    CMAKE_SYSTEM_NAME MATCHES "iOS" OR
+    CMAKE_SYSTEM_NAME MATCHES "tvOS" OR
+    CMAKE_SYSTEM_NAME MATCHES "DragonFly"
+)
+    add_compile_definitions(_DARWIN_C_SOURCE)
+endif()
+
+# alloca is a non-standard interface that is not visible on BSDs when
+# POSIX conformance is specified, but not all of them provide a clean way
+# to enable it in such cases
+if (CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
+    add_compile_definitions(__BSD_VISIBLE)
+endif()
+if (CMAKE_SYSTEM_NAME MATCHES "NetBSD")
+    add_compile_definitions(_NETBSD_SOURCE)
+endif()
+if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
+    add_compile_definitions(_BSD_SOURCE)
+endif()
+
 function(include_ggml DIRECTORY SUFFIX WITH_LLAMA)
     message(STATUS "Configuring ggml implementation target llama${SUFFIX} in ${CMAKE_CURRENT_SOURCE_DIR}/${DIRECTORY}")
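These blocks port llama.cpp's architecture detection into the shared cmake include: each LLAMA_* option turns into the matching -m compiler flag (or an /arch flag plus hand-defined macros on MSVC), which in turn selects the SIMD code paths ggml compiles. A small sketch of how compiled code can observe which path the flags enabled, using standard predefined macros (illustrative, not from this diff):

    #include <cstdio>
    int main() {
    #if defined(__AVX512F__)
        std::puts("AVX-512 build");
    #elif defined(__AVX2__)
        std::puts("AVX2 build");
    #elif defined(__ARM_NEON)
        std::puts("NEON build");  // on MSVC/ARM64 the block above defines this macro by hand
    #else
        std::puts("scalar build");
    #endif
    }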
@@ -468,15 +607,14 @@ function(include_ggml DIRECTORY SUFFIX WITH_LLAMA)

     if (WITH_LLAMA)
         # Backwards compatibility with old llama.cpp versions
-        set(LLAMA_UTIL_SOURCE_FILE llama-util.h)
+        # set(LLAMA_UTIL_SOURCE_FILE llama-util.h)
         if (NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${DIRECTORY}/${LLAMA_UTIL_SOURCE_FILE})
             set(LLAMA_UTIL_SOURCE_FILE llama_util.h)
         endif()

         add_library(llama${SUFFIX} STATIC
             ${DIRECTORY}/llama.cpp
-            ${DIRECTORY}/llama.h
-            ${DIRECTORY}/${LLAMA_UTIL_SOURCE_FILE})
+            ${DIRECTORY}/llama.h)

         if (LLAMA_METAL AND GGML_METAL_SOURCES)
             target_compile_definitions(llama${SUFFIX} PUBLIC GGML_USE_METAL GGML_METAL_NDEBUG)
@@ -226,9 +226,9 @@ size_t LLamaModel::restoreState(const uint8_t *src)

 std::vector<LLModel::Token> LLamaModel::tokenize(PromptContext &ctx, const std::string &str) const
 {
-    const bool useBOS = ctx.n_past == 0 && (ctx.tokens.empty() || ctx.tokens.front() != llama_token_bos());
+    const bool useBOS = ctx.n_past == 0 && (ctx.tokens.empty() || ctx.tokens.front() != llama_token_bos(d_ptr->ctx));
     std::vector<LLModel::Token> fres(str.size()+4);
-    auto fres_len = llama_tokenize(d_ptr->ctx, str.c_str(), fres.data(), fres.size(), useBOS);
+    auto fres_len = llama_tokenize(d_ptr->ctx, str.c_str(), str.length(), fres.data(), fres.size(), useBOS);
     fres.resize(fres_len);
     return fres;
 }
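Two gguf-era API changes meet in this function: llama_token_bos now needs the context (special-token ids live in the loaded model's vocabulary rather than being compile-time constants), and llama_tokenize takes the text length explicitly instead of assuming a NUL-terminated string. A hedged call sketch (the prompt is a placeholder; by the llama.cpp convention of that era, a negative return reports the required buffer size):

    std::string prompt = "Hello, world";
    std::vector<llama_token> toks(prompt.size() + 4);
    int n = llama_tokenize(ctx, prompt.c_str(), prompt.length(),
                           toks.data(), toks.size(), /*add_bos=*/true);
    if (n >= 0)
        toks.resize(n);  // n is the number of tokens actually written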
@@ -250,10 +250,10 @@ LLModel::Token LLamaModel::sampleToken(PromptContext &promptCtx) const
 bool LLamaModel::evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const
 {
     // When we recalculate context we could have erased the original BOS token... we need to replace it
-    const bool useBOS = ctx.n_past == 0 && (ctx.tokens.empty() || ctx.tokens.front() != llama_token_bos());
+    const bool useBOS = ctx.n_past == 0 && (ctx.tokens.empty() || ctx.tokens.front() != llama_token_bos(d_ptr->ctx));
     if (useBOS) {
         std::vector<int32_t> myTokens;
-        myTokens.push_back(llama_token_bos());
+        myTokens.push_back(llama_token_bos(d_ptr->ctx));
         myTokens.insert(myTokens.end(), tokens.begin(), tokens.end());
         ctx.n_past += 1;
         return llama_eval(d_ptr->ctx, myTokens.data(), myTokens.size(), ctx.n_past, d_ptr->n_threads) == 0;
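The same context-aware getter appears here because evalTokens may be replaying tokens after the rolling context erased the original BOS; the id to re-insert has to come from the loaded model. A one-line sketch of the lookup (assumes a valid llama_context):

    llama_token bos = llama_token_bos(d_ptr->ctx);  // model-specific id, no longer a fixed constant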
@@ -268,7 +268,7 @@ int32_t LLamaModel::contextLength() const

 const std::vector<LLModel::Token> &LLamaModel::endTokens() const
 {
-    static const std::vector<LLModel::Token> fres = {llama_token_eos()};
+    static const std::vector<LLModel::Token> fres = {llama_token_eos(d_ptr->ctx)};
     return fres;
 }

@@ -351,6 +351,16 @@ bool LLamaModel::usingGPUDevice()
     return false;
 }

+std::string get_arch_name(gguf_context *ctx_gguf) {
+    std::string arch_name;
+    const int kid = gguf_find_key(ctx_gguf, "general.architecture");
+    enum gguf_type ktype = gguf_get_kv_type(ctx_gguf, kid);
+    if (ktype != (GGUF_TYPE_STRING)) {
+        throw std::runtime_error("ERROR: Can't get general architecture from gguf file.");
+    }
+    return gguf_get_val_str(ctx_gguf, kid);
+}
+
 #if defined(_WIN32)
 #define DLL_EXPORT __declspec(dllexport)
 #else
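get_arch_name is the first gguf-aware helper in this backend: gguf files carry typed key/value metadata, and "general.architecture" names the model family. A hedged usage sketch (the path is a placeholder; gguf_init_from_file with no_alloc=true reads metadata without loading tensor data):

    struct ggml_context *ctx_meta = NULL;
    struct gguf_init_params params = { /*.no_alloc =*/ true, /*.ctx =*/ &ctx_meta };
    gguf_context *gg = gguf_init_from_file("/models/llama-2-7b.Q4_0.gguf", params);
    if (gg) {
        printf("arch: %s, gguf v%d\n", get_arch_name(gg).c_str(), gguf_get_version(gg));
        gguf_free(gg);
    }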
@@ -370,39 +380,42 @@ DLL_EXPORT const char *get_build_variant() {
     return GGML_BUILD_VARIANT;
 }

-DLL_EXPORT bool magic_match(std::istream& f) {
-    // Check magic
-    uint32_t magic = 0;
-    f.read(reinterpret_cast<char*>(&magic), sizeof(magic));
-    if (magic != 0x67676a74) return false;
-    // Check version
-    uint32_t version = 0;
-    f.read(reinterpret_cast<char*>(&version), sizeof(version));
-    if (!(version LLAMA_VERSIONS)) {
-        return false;
-    }
-    llama_file_hparams hparams;
-    f.read(reinterpret_cast<char*>(&hparams), sizeof(hparams));
-    if (!(hparams.n_vocab >= 32000 && hparams.n_vocab <= 32100)) {
-        return false; // not a llama.
-    }
-#ifdef GGML_USE_METAL
-    // Check quant supported on metal
-    // skip fields
-    switch(hparams.ftype) {
-        // currently supported on Metal https://github.com/ggerganov/llama.cpp/blob/ae9663f1887513e152839e91f61c513075a19422/ggml-metal.m#L51-L55
-        case LLAMA_FTYPE_MOSTLY_F16:
-        case LLAMA_FTYPE_MOSTLY_Q2_K:
-        case LLAMA_FTYPE_MOSTLY_Q4_0:
-        case LLAMA_FTYPE_MOSTLY_Q6_K:
-        case LLAMA_FTYPE_MOSTLY_Q4_K_S:
-        case LLAMA_FTYPE_MOSTLY_Q4_K_M:
-            return true;
-        default: // unsupported quant-type for Metal
-            return false;
-    }
-#endif
-    return true;
+DLL_EXPORT bool magic_match(const char * fname) {
+
+    struct ggml_context * ctx_meta = NULL;
+    struct gguf_init_params params = {
+        /*.no_alloc = */ true,
+        /*.ctx      = */ &ctx_meta,
+    };
+    gguf_context *ctx_gguf = gguf_init_from_file(fname, params);
+    if (!ctx_gguf)
+        return false;
+
+    bool isValid = gguf_get_version(ctx_gguf) <= 2;
+    isValid = get_arch_name(ctx_gguf) != "llama" ? false : isValid;
+
+#ifdef GGML_USE_METAL
+    const int n_tensors = gguf_get_n_tensors(ctx_gguf);
+    for (int i = 0; i < n_tensors; i++) {
+        const char * name = gguf_get_tensor_name(ctx_gguf, i);
+        struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, name);
+        switch(meta->type) {
+            // currently supported on Metal https://github.com/ggerganov/llama.cpp/blob/ae9663f1887513e152839e91f61c513075a19422/ggml-metal.m#L51-L55
+            case LLAMA_FTYPE_MOSTLY_F16:
+            case LLAMA_FTYPE_MOSTLY_Q2_K:
+            case LLAMA_FTYPE_MOSTLY_Q4_0:
+            case LLAMA_FTYPE_MOSTLY_Q6_K:
+            case LLAMA_FTYPE_MOSTLY_Q4_K_S:
+            case LLAMA_FTYPE_MOSTLY_Q4_K_M:
+                break;
+            default: // unsupported quant-type for Metal
+                isValid = false;
+        }
+    }
+#endif
+
+    gguf_free(ctx_gguf);
+    return isValid;
 }

 DLL_EXPORT LLModel *construct() {
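One detail worth flagging in the new Metal allow-list: meta->type is a ggml_type (a per-tensor storage type), while the case labels are llama_ftype values (whole-file quantization labels), so the switch mixes two enums whose numeric values only partly coincide. A type-correct sketch of the same check against the GGML_TYPE_* constants (an illustration of mine, not the commit's code; GGML_TYPE_Q4_K covers both the K_S and K_M file variants):

    switch (meta->type) {
        case GGML_TYPE_F16:
        case GGML_TYPE_Q2_K:
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q6_K:
        case GGML_TYPE_Q4_K:
            break;
        default:            // tensor type not in the Metal kernel set
            isValid = false;
    }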
@@ -52,7 +52,7 @@ LLModel::Implementation::Implementation(Dlhandle &&dlhandle_)
     auto get_build_variant = m_dlhandle->get<const char *()>("get_build_variant");
     assert(get_build_variant);
     m_buildVariant = get_build_variant();
-    m_magicMatch = m_dlhandle->get<bool(std::ifstream&)>("magic_match");
+    m_magicMatch = m_dlhandle->get<bool(const char*)>("magic_match");
     assert(m_magicMatch);
     m_construct = m_dlhandle->get<LLModel *()>("construct");
     assert(m_construct);
@@ -111,10 +111,9 @@ const std::vector<LLModel::Implementation> &LLModel::Implementation::implementat
     return *libs;
 }

-const LLModel::Implementation* LLModel::Implementation::implementation(std::ifstream& f, const std::string& buildVariant) {
+const LLModel::Implementation* LLModel::Implementation::implementation(const char *fname, const std::string& buildVariant) {
     for (const auto& i : implementationList()) {
-        f.seekg(0);
-        if (!i.m_magicMatch(f)) continue;
+        if (!i.m_magicMatch(fname)) continue;
         if (buildVariant != i.m_buildVariant) continue;
         return &i;
     }
@@ -126,9 +125,6 @@ LLModel *LLModel::Implementation::construct(const std::string &modelPath, std::s
     if (!has_at_least_minimal_hardware())
         return nullptr;

-    // Read magic
-    std::ifstream f(modelPath, std::ios::binary);
-    if (!f) return nullptr;
     // Get correct implementation
     const Implementation* impl = nullptr;

@@ -161,10 +157,9 @@ LLModel *LLModel::Implementation::construct(const std::string &modelPath, std::s
                 buildVariant = "default";
             }
         }
-        impl = implementation(f, buildVariant);
+        impl = implementation(modelPath.c_str(), buildVariant);
         if (!impl) return nullptr;
     }
-    f.close();

     // Construct and return llmodel implementation
     auto fres = impl->m_construct();
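With the ifstream plumbing gone, implementation selection is a pure function of the file path: each candidate backend's magic_match(fname) opens and inspects the file itself (for the llama backend, by parsing it as gguf). A hedged usage sketch of the public entry point (placeholder path; buildVariant defaults to "auto"):

    LLModel *model = LLModel::Implementation::construct("/models/mistral-7b.Q4_0.gguf");
    if (model != nullptr) {
        // the backend whose magic_match accepted the file was loaded and constructed
    }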
@@ -27,13 +27,13 @@ public:

         static bool isImplementation(const Dlhandle&);
         static const std::vector<Implementation>& implementationList();
-        static const Implementation *implementation(std::ifstream& f, const std::string& buildVariant);
+        static const Implementation *implementation(const char *fname, const std::string& buildVariant);
         static LLModel *construct(const std::string &modelPath, std::string buildVariant = "auto");
         static void setImplementationsSearchPath(const std::string& path);
         static const std::string& implementationsSearchPath();

     private:
-        bool (*m_magicMatch)(std::ifstream& f);
+        bool (*m_magicMatch)(const char *fname);
         LLModel *(*m_construct)();

     private:
@@ -566,7 +566,7 @@ bool replit_eval(replit_model & model, const int n_threads, const int n_past,

         // a = self.ln_1(x)
         {
-            cur = ggml_norm(ctx0, inpL);
+            cur = ggml_norm(ctx0, inpL, 1e-5f);

             cur = ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].ln_1_weight, cur), cur);
         }
@@ -658,7 +658,7 @@ bool replit_eval(replit_model & model, const int n_threads, const int n_past,

         // m = self.ln_2(x)
         {
-            cur = ggml_norm(ctx0, inpL);
+            cur = ggml_norm(ctx0, inpL, 1e-5f);

             cur = ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].ln_2_weight, cur), cur);
         }
@@ -682,7 +682,7 @@ bool replit_eval(replit_model & model, const int n_threads, const int n_past,
     ggml_set_scratch(ctx0, {0, model.scr0_buf.size, model.scr0_buf.addr, });
     // norm
     {
-        inpL = ggml_norm(ctx0, inpL);
+        inpL = ggml_norm(ctx0, inpL, 1e-5f);
         // inpL = ln_f_g*inpL
         inpL = ggml_mul(ctx0, ggml_repeat(ctx0, model.ln_f_weight, inpL), inpL);
     }
@@ -1002,7 +1002,8 @@ DLL_EXPORT const char *get_build_variant() {
     return GGML_BUILD_VARIANT;
 }

-DLL_EXPORT bool magic_match(std::istream& f) {
+DLL_EXPORT bool magic_match(const char *fname) {
+#if 0
     uint32_t magic = 0;
     f.read(reinterpret_cast<char*>(&magic), sizeof(magic));
     if (magic != 0x7265706c) return false;
@@ -1027,6 +1028,8 @@ DLL_EXPORT bool magic_match(std::istream& f) {
 #else
     return true;
 #endif
+#endif
+    return false;
 }

 DLL_EXPORT LLModel *construct() {
@@ -1,10 +1,11 @@
 #define STARCODER_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
 #include "starcoder_impl.h"
 #include "llama.h"
-#include "llama-util.h"
+#include "utils.h"
+#include "llmodel_shared.h"

 #include <stdio.h>
 #include <string.h>
 #include <cassert>
 #include <cinttypes>
 #include <iostream>
@@ -501,7 +502,7 @@ bool starcoder_eval(
         // norm
         {
             // [ 768, N]
-            cur = ggml_norm(ctx0, inpL);
+            cur = ggml_norm(ctx0, inpL, 1e-5f);

             // cur = ln_1_g*cur + ln_1_b
             // [ 768, N]
@@ -650,7 +651,7 @@ bool starcoder_eval(
         {
             // norm
             {
-                cur = ggml_norm(ctx0, inpFF);
+                cur = ggml_norm(ctx0, inpFF, 1e-5f);

                 // cur = ln_2_g*cur + ln_2_b
                 // [ 768, N]
@@ -707,7 +708,7 @@ bool starcoder_eval(
     // norm
     {
         // [ 768, N]
-        inpL = ggml_norm(ctx0, inpL);
+        inpL = ggml_norm(ctx0, inpL, 1e-5f);

         // inpL = ln_f_g*inpL + ln_f_b
         // [ 768, N]
@@ -1003,7 +1004,8 @@ DLL_EXPORT const char *get_build_variant() {
     return GGML_BUILD_VARIANT;
 }

-DLL_EXPORT bool magic_match(std::istream& f) {
+DLL_EXPORT bool magic_match(const char *fname) {
+#if 0
     uint32_t magic = 0;
     f.read(reinterpret_cast<char*>(&magic), sizeof(magic));
     if (magic != STARCODER_MAGIC) {
@@ -1015,6 +1017,8 @@ DLL_EXPORT bool magic_match(std::istream& f) {
         return false;
     }
     return true;
+#endif
+    return false;
 }

 DLL_EXPORT LLModel *construct() {
@@ -356,10 +356,10 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
         emit modelLoadingError(QString("Could not find file for model %1").arg(modelInfo.filename()));
     }

-    if (m_llModelInfo.model)
+    if (m_llModelInfo.model) {
         setModelInfo(modelInfo);
-
-        processSystemPrompt();
+        processSystemPrompt();
+    }
     return m_llModelInfo.model;
 }
@@ -189,7 +189,7 @@ Window {
                 + "causes include a bad file format, an incomplete or corrupted download, the wrong file "
                 + "type, not enough system RAM or an incompatible model type. Here are some suggestions for resolving the problem:"
                 + "<br><ul>"
-                + "<li>Ensure the model file has a compatible ggml format and type"
+                + "<li>Ensure the model file has a compatible format and type"
                 + "<li>Check the model file is complete in the download folder"
                 + "<li>You can find the download folder in the settings dialog"
                 + "<li>If you've sideloaded the model ensure the file is not corrupt by checking md5sum"
@@ -796,7 +796,7 @@ void ModelList::updateModelsFromDirectory()
             QString filename = it.fileName();

             // All files that end with .bin and have 'ggml' somewhere in the name
-            if ((filename.endsWith(".bin") && filename.contains("ggml") && !filename.startsWith("incomplete"))
+            if (((filename.endsWith(".bin") || filename.endsWith(".gguf")) && (/*filename.contains("ggml") ||*/ filename.contains("gguf")) && !filename.startsWith("incomplete"))
                 || (filename.endsWith(".txt") && filename.startsWith("chatgpt-"))) {

                 QString filePath = it.filePath();
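A hedged sketch of the new filter logic in plain C++ (std::string stand-ins for the QString calls above): note that with the old ggml check commented out, a .bin file is now only picked up if "gguf" appears in its name, while .gguf files match by extension and name alike.

    #include <string>
    bool isModelFileCandidate(const std::string &name) {
        auto ends = [&](const std::string &s) {
            return name.size() >= s.size()
                && name.compare(name.size() - s.size(), s.size(), s) == 0;
        };
        bool modelExt   = ends(".bin") || ends(".gguf");
        bool ggufInName = name.find("gguf") != std::string::npos;
        bool incomplete = name.rfind("incomplete", 0) == 0;   // partially downloaded
        return (modelExt && ggufInName && !incomplete)
            || (ends(".txt") && name.rfind("chatgpt-", 0) == 0);
    }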