mirror of
https://github.com/nomic-ai/gpt4all.git
synced 2024-10-01 01:06:10 -04:00
llama on Metal (#885)
Support latest llama with Metal

---------

Co-authored-by: Adam Treat <adam@nomic.ai>
Co-authored-by: niansa/tuxifan <tuxifan@posteo.de>
This commit is contained in:
parent
27e5602d5d
commit
b59ce1c6e7
2
.gitmodules
vendored
2
.gitmodules
vendored
@ -6,4 +6,4 @@
|
|||||||
url = https://github.com/manyoso/llama.cpp.git
|
url = https://github.com/manyoso/llama.cpp.git
|
||||||
[submodule "llama.cpp-mainline"]
|
[submodule "llama.cpp-mainline"]
|
||||||
path = gpt4all-backend/llama.cpp-mainline
|
path = gpt4all-backend/llama.cpp-mainline
|
||||||
url = https://github.com/ggerganov/llama.cpp.git
|
url = https://github.com/nomic-ai/llama.cpp.git
|
||||||
|
@ -39,6 +39,9 @@ endif()
|
|||||||
include(llama.cpp.cmake)
|
include(llama.cpp.cmake)
|
||||||
|
|
||||||
set(BUILD_VARIANTS default avxonly)
|
set(BUILD_VARIANTS default avxonly)
|
||||||
|
if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
|
||||||
|
set(BUILD_VARIANTS ${BUILD_VARIANTS} metal)
|
||||||
|
endif()
|
||||||
|
|
||||||
set(CMAKE_VERBOSE_MAKEFILE ON)
|
set(CMAKE_VERBOSE_MAKEFILE ON)
|
||||||
|
|
||||||
@ -54,10 +57,20 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
|
|||||||
set(LLAMA_F16C ${GPT4ALL_ALLOW_NON_AVX})
|
set(LLAMA_F16C ${GPT4ALL_ALLOW_NON_AVX})
|
||||||
set(LLAMA_FMA ${GPT4ALL_ALLOW_NON_AVX})
|
set(LLAMA_FMA ${GPT4ALL_ALLOW_NON_AVX})
|
||||||
|
|
||||||
|
if (BUILD_VARIANT STREQUAL metal)
|
||||||
|
set(LLAMA_K_QUANTS YES)
|
||||||
|
set(LLAMA_METAL YES)
|
||||||
|
else()
|
||||||
|
set(LLAMA_K_QUANTS NO)
|
||||||
|
set(LLAMA_METAL NO)
|
||||||
|
endif()
|
||||||
|
|
||||||
# Include GGML
|
# Include GGML
|
||||||
include_ggml(llama.cpp-mainline -mainline-${BUILD_VARIANT} ON)
|
include_ggml(llama.cpp-mainline -mainline-${BUILD_VARIANT} ON)
|
||||||
include_ggml(llama.cpp-230511 -230511-${BUILD_VARIANT} ON)
|
if (NOT LLAMA_METAL)
|
||||||
include_ggml(llama.cpp-230519 -230519-${BUILD_VARIANT} ON)
|
include_ggml(llama.cpp-230511 -230511-${BUILD_VARIANT} ON)
|
||||||
|
include_ggml(llama.cpp-230519 -230519-${BUILD_VARIANT} ON)
|
||||||
|
endif()
|
||||||
|
|
||||||
# Function for preparing individual implementations
|
# Function for preparing individual implementations
|
||||||
function(prepare_target TARGET_NAME BASE_LIB)
|
function(prepare_target TARGET_NAME BASE_LIB)
|
||||||
@ -82,29 +95,30 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
|
|||||||
LLAMA_VERSIONS=>=3 LLAMA_DATE=999999)
|
LLAMA_VERSIONS=>=3 LLAMA_DATE=999999)
|
||||||
prepare_target(llamamodel-mainline llama-mainline)
|
prepare_target(llamamodel-mainline llama-mainline)
|
||||||
|
|
||||||
add_library(llamamodel-230519-${BUILD_VARIANT} SHARED
|
if (NOT LLAMA_METAL)
|
||||||
llamamodel.cpp llmodel_shared.cpp)
|
add_library(llamamodel-230519-${BUILD_VARIANT} SHARED
|
||||||
target_compile_definitions(llamamodel-230519-${BUILD_VARIANT} PRIVATE
|
llamamodel.cpp llmodel_shared.cpp)
|
||||||
LLAMA_VERSIONS===2 LLAMA_DATE=230519)
|
target_compile_definitions(llamamodel-230519-${BUILD_VARIANT} PRIVATE
|
||||||
prepare_target(llamamodel-230519 llama-230519)
|
LLAMA_VERSIONS===2 LLAMA_DATE=230519)
|
||||||
|
prepare_target(llamamodel-230519 llama-230519)
|
||||||
|
add_library(llamamodel-230511-${BUILD_VARIANT} SHARED
|
||||||
|
llamamodel.cpp llmodel_shared.cpp)
|
||||||
|
target_compile_definitions(llamamodel-230511-${BUILD_VARIANT} PRIVATE
|
||||||
|
LLAMA_VERSIONS=<=1 LLAMA_DATE=230511)
|
||||||
|
prepare_target(llamamodel-230511 llama-230511)
|
||||||
|
|
||||||
add_library(llamamodel-230511-${BUILD_VARIANT} SHARED
|
add_library(gptj-${BUILD_VARIANT} SHARED
|
||||||
llamamodel.cpp llmodel_shared.cpp)
|
gptj.cpp utils.h utils.cpp llmodel_shared.cpp)
|
||||||
target_compile_definitions(llamamodel-230511-${BUILD_VARIANT} PRIVATE
|
prepare_target(gptj ggml-230511)
|
||||||
LLAMA_VERSIONS=<=1 LLAMA_DATE=230511)
|
|
||||||
prepare_target(llamamodel-230511 llama-230511)
|
|
||||||
|
|
||||||
add_library(gptj-${BUILD_VARIANT} SHARED
|
add_library(mpt-${BUILD_VARIANT} SHARED
|
||||||
gptj.cpp utils.h utils.cpp llmodel_shared.cpp)
|
mpt.cpp utils.h utils.cpp llmodel_shared.cpp)
|
||||||
prepare_target(gptj ggml-230511)
|
prepare_target(mpt ggml-230511)
|
||||||
|
|
||||||
add_library(mpt-${BUILD_VARIANT} SHARED
|
add_library(replit-${BUILD_VARIANT} SHARED
|
||||||
mpt.cpp utils.h utils.cpp llmodel_shared.cpp)
|
replit.cpp utils.h utils.cpp llmodel_shared.cpp)
|
||||||
prepare_target(mpt ggml-230511)
|
prepare_target(replit ggml-230511)
|
||||||
|
endif()
|
||||||
add_library(replit-${BUILD_VARIANT} SHARED
|
|
||||||
replit.cpp utils.h utils.cpp llmodel_shared.cpp)
|
|
||||||
prepare_target(replit ggml-230511)
|
|
||||||
endforeach()
|
endforeach()
|
||||||
|
|
||||||
add_library(llmodel
|
add_library(llmodel
|
||||||
|
@ -1 +1 @@
|
|||||||
Subproject commit 5b57a5b72676540b6a45a3f527126299969ad241
|
Subproject commit b33dee282f5d8032b5f780152732dc45cbf2d349
|
@ -34,6 +34,7 @@ endif()
|
|||||||
#
|
#
|
||||||
# Option list
|
# Option list
|
||||||
#
|
#
|
||||||
|
# some of the options here are commented out so they can be set "dynamically" before calling include_ggml()
|
||||||
|
|
||||||
# general
|
# general
|
||||||
option(LLAMA_STATIC "llama: static link libraries" OFF)
|
option(LLAMA_STATIC "llama: static link libraries" OFF)
|
||||||
@ -68,6 +69,7 @@ option(LLAMA_OPENBLAS "llama: use OpenBLAS"
|
|||||||
#option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
|
#option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
|
||||||
#option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
|
#option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
|
||||||
#option(LLAMA_METAL "llama: use Metal" OFF)
|
#option(LLAMA_METAL "llama: use Metal" OFF)
|
||||||
|
#option(LLAMA_K_QUANTS "llama: use k-quants" ON)
|
||||||
set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
|
set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
|
||||||
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
|
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
|
||||||
set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
|
set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
|
||||||
@ -263,10 +265,32 @@ function(include_ggml DIRECTORY SUFFIX WITH_LLAMA)
|
|||||||
endif()
|
endif()
|
||||||
|
|
||||||
set(GGML_SOURCES_QUANT_K )
|
set(GGML_SOURCES_QUANT_K )
|
||||||
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${DIRECTORY}/ggml-quants-k.h)
|
set(GGML_METAL_SOURCES )
|
||||||
|
if (LLAMA_K_QUANTS)
|
||||||
set(GGML_SOURCES_QUANT_K
|
set(GGML_SOURCES_QUANT_K
|
||||||
${DIRECTORY}/ggml-quants-k.h
|
${DIRECTORY}/k_quants.h
|
||||||
${DIRECTORY}/ggml-quants-k.c)
|
${DIRECTORY}/k_quants.c)
|
||||||
|
|
||||||
|
if (LLAMA_METAL)
|
||||||
|
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
|
||||||
|
find_library(METAL_FRAMEWORK Metal REQUIRED)
|
||||||
|
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
|
||||||
|
find_library(METALPERFORMANCE_FRAMEWORK MetalPerformanceShaders REQUIRED)
|
||||||
|
|
||||||
|
set(GGML_METAL_SOURCES ${DIRECTORY}/ggml-metal.m ${DIRECTORY}/ggml-metal.h)
|
||||||
|
# get full path to the file
|
||||||
|
#add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")
|
||||||
|
|
||||||
|
# copy ggml-metal.metal to bin directory
|
||||||
|
configure_file(${DIRECTORY}/ggml-metal.metal bin/ggml-metal.metal COPYONLY)
|
||||||
|
|
||||||
|
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
|
||||||
|
${FOUNDATION_LIBRARY}
|
||||||
|
${METAL_FRAMEWORK}
|
||||||
|
${METALKIT_FRAMEWORK}
|
||||||
|
${METALPERFORMANCE_FRAMEWORK}
|
||||||
|
)
|
||||||
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
add_library(ggml${SUFFIX} OBJECT
|
add_library(ggml${SUFFIX} OBJECT
|
||||||
@ -274,8 +298,16 @@ function(include_ggml DIRECTORY SUFFIX WITH_LLAMA)
|
|||||||
${DIRECTORY}/ggml.h
|
${DIRECTORY}/ggml.h
|
||||||
${GGML_SOURCES_QUANT_K}
|
${GGML_SOURCES_QUANT_K}
|
||||||
${GGML_SOURCES_CUDA}
|
${GGML_SOURCES_CUDA}
|
||||||
|
${GGML_METAL_SOURCES}
|
||||||
${GGML_OPENCL_SOURCES})
|
${GGML_OPENCL_SOURCES})
|
||||||
|
|
||||||
|
if (LLAMA_K_QUANTS)
|
||||||
|
target_compile_definitions(ggml${SUFFIX} PUBLIC GGML_USE_K_QUANTS)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if (LLAMA_METAL AND GGML_METAL_SOURCES)
|
||||||
|
target_compile_definitions(ggml${SUFFIX} PUBLIC GGML_USE_METAL GGML_METAL_NDEBUG)
|
||||||
|
endif()
|
||||||
target_include_directories(ggml${SUFFIX} PUBLIC ${DIRECTORY})
|
target_include_directories(ggml${SUFFIX} PUBLIC ${DIRECTORY})
|
||||||
target_compile_features(ggml${SUFFIX} PUBLIC c_std_11) # don't bump
|
target_compile_features(ggml${SUFFIX} PUBLIC c_std_11) # don't bump
|
||||||
|
|
||||||
@ -295,6 +327,9 @@ function(include_ggml DIRECTORY SUFFIX WITH_LLAMA)
|
|||||||
${DIRECTORY}/llama.h
|
${DIRECTORY}/llama.h
|
||||||
${DIRECTORY}/${LLAMA_UTIL_SOURCE_FILE})
|
${DIRECTORY}/${LLAMA_UTIL_SOURCE_FILE})
|
||||||
|
|
||||||
|
if (LLAMA_METAL AND GGML_METAL_SOURCES)
|
||||||
|
target_compile_definitions(llama${SUFFIX} PUBLIC GGML_USE_METAL GGML_METAL_NDEBUG)
|
||||||
|
endif()
|
||||||
target_include_directories(llama${SUFFIX} PUBLIC ${DIRECTORY})
|
target_include_directories(llama${SUFFIX} PUBLIC ${DIRECTORY})
|
||||||
target_compile_features(llama${SUFFIX} PUBLIC cxx_std_11) # don't bump
|
target_compile_features(llama${SUFFIX} PUBLIC cxx_std_11) # don't bump
|
||||||
|
|
||||||
@ -332,32 +367,6 @@ function(include_ggml DIRECTORY SUFFIX WITH_LLAMA)
|
|||||||
target_compile_definitions(ggml${SUFFIX} PRIVATE GGML_USE_CLBLAST)
|
target_compile_definitions(ggml${SUFFIX} PRIVATE GGML_USE_CLBLAST)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (LLAMA_METAL)
|
|
||||||
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
|
|
||||||
find_library(METAL_FRAMEWORK Metal REQUIRED)
|
|
||||||
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
|
|
||||||
find_library(METALPERFORMANCE_FRAMEWORK MetalPerformanceShaders REQUIRED)
|
|
||||||
|
|
||||||
set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h)
|
|
||||||
|
|
||||||
target_compile_definitions(llama${SUFFIX} PRIVATE
|
|
||||||
GGML_USE_METAL
|
|
||||||
GGML_METAL_NDEBUG)
|
|
||||||
|
|
||||||
# get full path to the file
|
|
||||||
#add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")
|
|
||||||
|
|
||||||
# copy ggml-metal.metal to bin directory
|
|
||||||
configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
|
|
||||||
|
|
||||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
|
|
||||||
${FOUNDATION_LIBRARY}
|
|
||||||
${METAL_FRAMEWORK}
|
|
||||||
${METALKIT_FRAMEWORK}
|
|
||||||
${METALPERFORMANCE_FRAMEWORK}
|
|
||||||
)
|
|
||||||
endif()
|
|
||||||
|
|
||||||
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
|
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
|
||||||
message(STATUS "ARM detected")
|
message(STATUS "ARM detected")
|
||||||
if (MSVC)
|
if (MSVC)
|
||||||
|
@ -115,6 +115,12 @@ bool LLamaModel::loadModel(const std::string &modelPath)
|
|||||||
#if LLAMA_DATE <= 230511
|
#if LLAMA_DATE <= 230511
|
||||||
d_ptr->params.n_parts = params.n_parts;
|
d_ptr->params.n_parts = params.n_parts;
|
||||||
#endif
|
#endif
|
||||||
|
#ifdef GGML_USE_METAL
|
||||||
|
std::cerr << "llama.cpp: using Metal" << std::endl;
|
||||||
|
// metal always runs the whole model if n_gpu_layers is not 0, at least
|
||||||
|
// currently
|
||||||
|
d_ptr->params.n_gpu_layers = 1;
|
||||||
|
#endif
|
||||||
|
|
||||||
d_ptr->ctx = llama_init_from_file(modelPath.c_str(), d_ptr->params);
|
d_ptr->ctx = llama_init_from_file(modelPath.c_str(), d_ptr->params);
|
||||||
if (!d_ptr->ctx) {
|
if (!d_ptr->ctx) {
|
||||||
@ -228,7 +234,30 @@ DLL_EXPORT bool magic_match(std::istream& f) {
|
|||||||
// Check version
|
// Check version
|
||||||
uint32_t version = 0;
|
uint32_t version = 0;
|
||||||
f.read(reinterpret_cast<char*>(&version), sizeof(version));
|
f.read(reinterpret_cast<char*>(&version), sizeof(version));
|
||||||
return version LLAMA_VERSIONS;
|
if (!(version LLAMA_VERSIONS)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
#ifdef GGML_USE_METAL
|
||||||
|
// Check quant supported on metal
|
||||||
|
// skip fields
|
||||||
|
off_t offset = sizeof(uint32_t) * 6; // n_vocab, n_embd, n_mult, n_head, n_layer, n_rot
|
||||||
|
f.seekg(offset, std::ios_base::cur);
|
||||||
|
uint32_t ftype;
|
||||||
|
f.read(reinterpret_cast<char*>(&ftype), sizeof(ftype)); // ftype
|
||||||
|
switch((enum llama_ftype) ftype) {
|
||||||
|
// currently supported on Metal https://github.com/ggerganov/llama.cpp/blob/ae9663f1887513e152839e91f61c513075a19422/ggml-metal.m#L51-L55
|
||||||
|
case LLAMA_FTYPE_MOSTLY_F16:
|
||||||
|
case LLAMA_FTYPE_MOSTLY_Q2_K:
|
||||||
|
case LLAMA_FTYPE_MOSTLY_Q4_0:
|
||||||
|
case LLAMA_FTYPE_MOSTLY_Q6_K:
|
||||||
|
case LLAMA_FTYPE_MOSTLY_Q4_K_S:
|
||||||
|
case LLAMA_FTYPE_MOSTLY_Q4_K_M:
|
||||||
|
return true;
|
||||||
|
default: // unsupported quant-type for Metal
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
DLL_EXPORT LLModel *construct() {
|
DLL_EXPORT LLModel *construct() {
|
||||||
|
@ -121,20 +121,30 @@ LLModel *LLModel::construct(const std::string &modelPath, std::string buildVaria
|
|||||||
if (!has_at_least_minimal_hardware())
|
if (!has_at_least_minimal_hardware())
|
||||||
return nullptr;
|
return nullptr;
|
||||||
|
|
||||||
//TODO: Auto-detect CUDA/OpenCL
|
|
||||||
if (buildVariant == "auto") {
|
|
||||||
if (requires_avxonly()) {
|
|
||||||
buildVariant = "avxonly";
|
|
||||||
} else {
|
|
||||||
buildVariant = "default";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// Read magic
|
// Read magic
|
||||||
std::ifstream f(modelPath, std::ios::binary);
|
std::ifstream f(modelPath, std::ios::binary);
|
||||||
if (!f) return nullptr;
|
if (!f) return nullptr;
|
||||||
// Get correct implementation
|
// Get correct implementation
|
||||||
auto impl = implementation(f, buildVariant);
|
const LLModel::Implementation* impl = nullptr;
|
||||||
if (!impl) return nullptr;
|
|
||||||
|
#if defined(__APPLE__) && defined(__arm64__) // FIXME: See if metal works for intel macs
|
||||||
|
if (buildVariant == "auto") {
|
||||||
|
impl = implementation(f, "metal");
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if (!impl) {
|
||||||
|
//TODO: Auto-detect CUDA/OpenCL
|
||||||
|
if (buildVariant == "auto") {
|
||||||
|
if (requires_avxonly()) {
|
||||||
|
buildVariant = "avxonly";
|
||||||
|
} else {
|
||||||
|
buildVariant = "default";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
impl = implementation(f, buildVariant);
|
||||||
|
if (!impl) return nullptr;
|
||||||
|
}
|
||||||
f.close();
|
f.close();
|
||||||
// Construct and return llmodel implementation
|
// Construct and return llmodel implementation
|
||||||
return impl->construct();
|
return impl->construct();
|
||||||
|
@ -58,6 +58,11 @@ set (CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
|
|||||||
|
|
||||||
add_subdirectory(../gpt4all-backend llmodel)
|
add_subdirectory(../gpt4all-backend llmodel)
|
||||||
|
|
||||||
|
set(METAL_SHADER_FILE)
|
||||||
|
if(${CMAKE_SYSTEM_NAME} MATCHES Darwin)
|
||||||
|
set(METAL_SHADER_FILE ../gpt4all-backend/llama.cpp-mainline/ggml-metal.metal)
|
||||||
|
endif()
|
||||||
|
|
||||||
qt_add_executable(chat
|
qt_add_executable(chat
|
||||||
main.cpp
|
main.cpp
|
||||||
chat.h chat.cpp
|
chat.h chat.cpp
|
||||||
@ -72,6 +77,7 @@ qt_add_executable(chat
|
|||||||
server.h server.cpp
|
server.h server.cpp
|
||||||
logger.h logger.cpp
|
logger.h logger.cpp
|
||||||
sysinfo.h
|
sysinfo.h
|
||||||
|
${METAL_SHADER_FILE}
|
||||||
)
|
)
|
||||||
|
|
||||||
qt_add_qml_module(chat
|
qt_add_qml_module(chat
|
||||||
@ -132,6 +138,12 @@ if(${CMAKE_SYSTEM_NAME} MATCHES Darwin)
|
|||||||
)
|
)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
if(METAL_SHADER_FILE)
|
||||||
|
set_target_properties(chat PROPERTIES
|
||||||
|
RESOURCE ${METAL_SHADER_FILE}
|
||||||
|
)
|
||||||
|
endif()
|
||||||
|
|
||||||
target_compile_definitions(chat
|
target_compile_definitions(chat
|
||||||
PRIVATE $<$<OR:$<CONFIG:Debug>,$<CONFIG:RelWithDebInfo>>:QT_QML_DEBUG>)
|
PRIVATE $<$<OR:$<CONFIG:Debug>,$<CONFIG:RelWithDebInfo>>:QT_QML_DEBUG>)
|
||||||
target_link_libraries(chat
|
target_link_libraries(chat
|
||||||
|
Loading…
Reference in New Issue
Block a user