diff --git a/.gitmodules b/.gitmodules
index 50de0692..74bf3c97 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -6,4 +6,4 @@
   url = https://github.com/manyoso/llama.cpp.git
 [submodule "llama.cpp-mainline"]
   path = gpt4all-backend/llama.cpp-mainline
-  url = https://github.com/ggerganov/llama.cpp.git
+  url = https://github.com/nomic-ai/llama.cpp.git
diff --git a/gpt4all-backend/CMakeLists.txt b/gpt4all-backend/CMakeLists.txt
index ae33ad70..aab1e98d 100644
--- a/gpt4all-backend/CMakeLists.txt
+++ b/gpt4all-backend/CMakeLists.txt
@@ -39,6 +39,9 @@ endif()
 include(llama.cpp.cmake)
 
 set(BUILD_VARIANTS default avxonly)
+if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
+    set(BUILD_VARIANTS ${BUILD_VARIANTS} metal)
+endif()
 
 set(CMAKE_VERBOSE_MAKEFILE ON)
 
@@ -54,10 +57,20 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
     set(LLAMA_F16C ${GPT4ALL_ALLOW_NON_AVX})
     set(LLAMA_FMA ${GPT4ALL_ALLOW_NON_AVX})
 
+    if (BUILD_VARIANT STREQUAL metal)
+        set(LLAMA_K_QUANTS YES)
+        set(LLAMA_METAL YES)
+    else()
+        set(LLAMA_K_QUANTS NO)
+        set(LLAMA_METAL NO)
+    endif()
+
     # Include GGML
     include_ggml(llama.cpp-mainline -mainline-${BUILD_VARIANT} ON)
-    include_ggml(llama.cpp-230511 -230511-${BUILD_VARIANT} ON)
-    include_ggml(llama.cpp-230519 -230519-${BUILD_VARIANT} ON)
+    if (NOT LLAMA_METAL)
+        include_ggml(llama.cpp-230511 -230511-${BUILD_VARIANT} ON)
+        include_ggml(llama.cpp-230519 -230519-${BUILD_VARIANT} ON)
+    endif()
 
     # Function for preparing individual implementations
     function(prepare_target TARGET_NAME BASE_LIB)
@@ -82,29 +95,30 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
         LLAMA_VERSIONS=>=3 LLAMA_DATE=999999)
     prepare_target(llamamodel-mainline llama-mainline)
 
-    add_library(llamamodel-230519-${BUILD_VARIANT} SHARED
-        llamamodel.cpp llmodel_shared.cpp)
-    target_compile_definitions(llamamodel-230519-${BUILD_VARIANT} PRIVATE
-        LLAMA_VERSIONS===2 LLAMA_DATE=230519)
-    prepare_target(llamamodel-230519 llama-230519)
+    if (NOT LLAMA_METAL)
+        add_library(llamamodel-230519-${BUILD_VARIANT} SHARED
+            llamamodel.cpp llmodel_shared.cpp)
+        target_compile_definitions(llamamodel-230519-${BUILD_VARIANT} PRIVATE
+            LLAMA_VERSIONS===2 LLAMA_DATE=230519)
+        prepare_target(llamamodel-230519 llama-230519)
+        add_library(llamamodel-230511-${BUILD_VARIANT} SHARED
+            llamamodel.cpp llmodel_shared.cpp)
+        target_compile_definitions(llamamodel-230511-${BUILD_VARIANT} PRIVATE
+            LLAMA_VERSIONS=<=1 LLAMA_DATE=230511)
+        prepare_target(llamamodel-230511 llama-230511)
 
-    add_library(llamamodel-230511-${BUILD_VARIANT} SHARED
-        llamamodel.cpp llmodel_shared.cpp)
-    target_compile_definitions(llamamodel-230511-${BUILD_VARIANT} PRIVATE
-        LLAMA_VERSIONS=<=1 LLAMA_DATE=230511)
-    prepare_target(llamamodel-230511 llama-230511)
+        add_library(gptj-${BUILD_VARIANT} SHARED
+            gptj.cpp utils.h utils.cpp llmodel_shared.cpp)
+        prepare_target(gptj ggml-230511)
 
-    add_library(gptj-${BUILD_VARIANT} SHARED
-        gptj.cpp utils.h utils.cpp llmodel_shared.cpp)
-    prepare_target(gptj ggml-230511)
+        add_library(mpt-${BUILD_VARIANT} SHARED
+            mpt.cpp utils.h utils.cpp llmodel_shared.cpp)
+        prepare_target(mpt ggml-230511)
 
-    add_library(mpt-${BUILD_VARIANT} SHARED
-        mpt.cpp utils.h utils.cpp llmodel_shared.cpp)
-    prepare_target(mpt ggml-230511)
-
-    add_library(replit-${BUILD_VARIANT} SHARED
-        replit.cpp utils.h utils.cpp llmodel_shared.cpp)
-    prepare_target(replit ggml-230511)
+        add_library(replit-${BUILD_VARIANT} SHARED
+            replit.cpp utils.h utils.cpp llmodel_shared.cpp)
+        prepare_target(replit ggml-230511)
+    endif()
 endforeach()
 
 add_library(llmodel
diff --git a/gpt4all-backend/llama.cpp-mainline b/gpt4all-backend/llama.cpp-mainline
index 5b57a5b7..b33dee28 160000
--- a/gpt4all-backend/llama.cpp-mainline
+++ b/gpt4all-backend/llama.cpp-mainline
@@ -1 +1 @@
-Subproject commit 5b57a5b72676540b6a45a3f527126299969ad241
+Subproject commit b33dee282f5d8032b5f780152732dc45cbf2d349
diff --git a/gpt4all-backend/llama.cpp.cmake b/gpt4all-backend/llama.cpp.cmake
index 01ded39d..c3dbf01a 100644
--- a/gpt4all-backend/llama.cpp.cmake
+++ b/gpt4all-backend/llama.cpp.cmake
@@ -34,6 +34,7 @@ endif()
 #
 # Option list
 #
+# some of the options here are commented out so they can be set "dynamically" before calling include_ggml()
 
 # general
 option(LLAMA_STATIC                     "llama: static link libraries"                          OFF)
@@ -68,6 +69,7 @@ option(LLAMA_OPENBLAS                   "llama: use OpenBLAS"
 #option(LLAMA_CUBLAS                     "llama: use cuBLAS"                                     OFF)
 #option(LLAMA_CLBLAST                    "llama: use CLBlast"                                    OFF)
 #option(LLAMA_METAL                      "llama: use Metal"                                      OFF)
+#option(LLAMA_K_QUANTS                   "llama: use k-quants"                                   ON)
 set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
 set(LLAMA_CUDA_DMMV_X      "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
 set(LLAMA_CUDA_DMMV_Y       "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
@@ -263,10 +265,32 @@ function(include_ggml DIRECTORY SUFFIX WITH_LLAMA)
     endif()
 
     set(GGML_SOURCES_QUANT_K )
-    if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${DIRECTORY}/ggml-quants-k.h)
+    set(GGML_METAL_SOURCES )
+    if (LLAMA_K_QUANTS)
         set(GGML_SOURCES_QUANT_K
-            ${DIRECTORY}/ggml-quants-k.h
-            ${DIRECTORY}/ggml-quants-k.c)
+            ${DIRECTORY}/k_quants.h
+            ${DIRECTORY}/k_quants.c)
+
+        if (LLAMA_METAL)
+            find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
+            find_library(METAL_FRAMEWORK    Metal      REQUIRED)
+            find_library(METALKIT_FRAMEWORK MetalKit   REQUIRED)
+            find_library(METALPERFORMANCE_FRAMEWORK MetalPerformanceShaders REQUIRED)
+
+            set(GGML_METAL_SOURCES ${DIRECTORY}/ggml-metal.m ${DIRECTORY}/ggml-metal.h)
+            # get full path to the file
+            #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")
+
+            # copy ggml-metal.metal to bin directory
+            configure_file(${DIRECTORY}/ggml-metal.metal bin/ggml-metal.metal COPYONLY)
+
+            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
+                ${FOUNDATION_LIBRARY}
+                ${METAL_FRAMEWORK}
+                ${METALKIT_FRAMEWORK}
+                ${METALPERFORMANCE_FRAMEWORK}
+            )
+        endif()
     endif()
 
     add_library(ggml${SUFFIX} OBJECT
@@ -274,8 +298,16 @@ function(include_ggml DIRECTORY SUFFIX WITH_LLAMA)
                 ${DIRECTORY}/ggml.h
                 ${GGML_SOURCES_QUANT_K}
                 ${GGML_SOURCES_CUDA}
+                ${GGML_METAL_SOURCES}
                 ${GGML_OPENCL_SOURCES})
 
+    if (LLAMA_K_QUANTS)
+        target_compile_definitions(ggml${SUFFIX} PUBLIC GGML_USE_K_QUANTS)
+    endif()
+
+    if (LLAMA_METAL AND GGML_METAL_SOURCES)
+        target_compile_definitions(ggml${SUFFIX} PUBLIC GGML_USE_METAL GGML_METAL_NDEBUG)
+    endif()
     target_include_directories(ggml${SUFFIX} PUBLIC ${DIRECTORY})
     target_compile_features(ggml${SUFFIX} PUBLIC c_std_11) # don't bump
 
@@ -295,6 +327,9 @@ function(include_ggml DIRECTORY SUFFIX WITH_LLAMA)
             ${DIRECTORY}/llama.h
             ${DIRECTORY}/${LLAMA_UTIL_SOURCE_FILE})
 
+        if (LLAMA_METAL AND GGML_METAL_SOURCES)
+            target_compile_definitions(llama${SUFFIX} PUBLIC GGML_USE_METAL GGML_METAL_NDEBUG)
+        endif()
         target_include_directories(llama${SUFFIX} PUBLIC ${DIRECTORY})
         target_compile_features(llama${SUFFIX} PUBLIC cxx_std_11) # don't bump
 
@@ -332,32 +367,6 @@ function(include_ggml DIRECTORY SUFFIX WITH_LLAMA)
         target_compile_definitions(ggml${SUFFIX} PRIVATE GGML_USE_CLBLAST)
     endif()
 
-    if (LLAMA_METAL)
-        find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
-        find_library(METAL_FRAMEWORK    Metal      REQUIRED)
-        find_library(METALKIT_FRAMEWORK MetalKit   REQUIRED)
-        find_library(METALPERFORMANCE_FRAMEWORK MetalPerformanceShaders REQUIRED)
-
-        set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h)
-
-        target_compile_definitions(llama${SUFFIX} PRIVATE
-            GGML_USE_METAL
-            GGML_METAL_NDEBUG)
-
-        # get full path to the file
-        #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")
-
-        # copy ggml-metal.metal to bin directory
-        configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
-
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
-            ${FOUNDATION_LIBRARY}
-            ${METAL_FRAMEWORK}
-            ${METALKIT_FRAMEWORK}
-            ${METALPERFORMANCE_FRAMEWORK}
-            )
-    endif()
-
     if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
         message(STATUS "ARM detected")
         if (MSVC)
diff --git a/gpt4all-backend/llamamodel.cpp b/gpt4all-backend/llamamodel.cpp
index 66aacac4..17b55855 100644
--- a/gpt4all-backend/llamamodel.cpp
+++ b/gpt4all-backend/llamamodel.cpp
@@ -115,6 +115,12 @@ bool LLamaModel::loadModel(const std::string &modelPath)
 #if LLAMA_DATE <= 230511
     d_ptr->params.n_parts = params.n_parts;
 #endif
+#ifdef GGML_USE_METAL
+    std::cerr << "llama.cpp: using Metal" << std::endl;
+    // metal always runs the whole model if n_gpu_layers is not 0, at least
+    // currently
+    d_ptr->params.n_gpu_layers = 1;
+#endif
 
     d_ptr->ctx = llama_init_from_file(modelPath.c_str(), d_ptr->params);
     if (!d_ptr->ctx) {
@@ -228,7 +234,30 @@ DLL_EXPORT bool magic_match(std::istream& f) {
     // Check version
     uint32_t version = 0;
     f.read(reinterpret_cast<char*>(&version), sizeof(version));
-    return version LLAMA_VERSIONS;
+    if (!(version LLAMA_VERSIONS)) {
+        return false;
+    }
+#ifdef GGML_USE_METAL
+    // Check quant supported on metal
+    // skip fields
+    off_t offset = sizeof(uint32_t) * 6; // n_vocab, n_embd, n_mult, n_head, n_layer, n_rot
+    f.seekg(offset, std::ios_base::cur);
+    uint32_t ftype;
+    f.read(reinterpret_cast<char*>(&ftype), sizeof(ftype)); // ftype
+    switch((enum llama_ftype) ftype) {
+        // currently supported on Metal https://github.com/ggerganov/llama.cpp/blob/ae9663f1887513e152839e91f61c513075a19422/ggml-metal.m#L51-L55
+        case LLAMA_FTYPE_MOSTLY_F16:
+        case LLAMA_FTYPE_MOSTLY_Q2_K:
+        case LLAMA_FTYPE_MOSTLY_Q4_0:
+        case LLAMA_FTYPE_MOSTLY_Q6_K:
+        case LLAMA_FTYPE_MOSTLY_Q4_K_S:
+        case LLAMA_FTYPE_MOSTLY_Q4_K_M:
+            return true;
+        default: // unsupported quant-type for Metal
+            return false;
+    }
+#endif
+    return true;
 }
 
 DLL_EXPORT LLModel *construct() {
diff --git a/gpt4all-backend/llmodel.cpp b/gpt4all-backend/llmodel.cpp
index 7499a75b..3563f2c5 100644
--- a/gpt4all-backend/llmodel.cpp
+++ b/gpt4all-backend/llmodel.cpp
@@ -121,20 +121,30 @@ LLModel *LLModel::construct(const std::string &modelPath, std::string buildVariant
     if (!has_at_least_minimal_hardware())
         return nullptr;
 
-    //TODO: Auto-detect CUDA/OpenCL
-    if (buildVariant == "auto") {
-        if (requires_avxonly()) {
-            buildVariant = "avxonly";
-        } else {
-            buildVariant = "default";
-        }
-    }
     // Read magic
     std::ifstream f(modelPath, std::ios::binary);
     if (!f) return nullptr;
     // Get correct implementation
-    auto impl = implementation(f, buildVariant);
-    if (!impl) return nullptr;
+    const LLModel::Implementation* impl = nullptr;
+
+    #if defined(__APPLE__) && defined(__arm64__) // FIXME: See if metal works for intel macs
+        if (buildVariant == "auto") {
+            impl = implementation(f, "metal");
+        }
+    #endif
+
+    if (!impl) {
+        //TODO: Auto-detect CUDA/OpenCL
+        if (buildVariant == "auto") {
+            if (requires_avxonly()) {
+                buildVariant = "avxonly";
+            } else {
+                buildVariant = "default";
+            }
+        }
+        impl = implementation(f, buildVariant);
+        if (!impl) return nullptr;
+    }
     f.close();
     // Construct and return llmodel implementation
     return impl->construct();
diff --git a/gpt4all-chat/CMakeLists.txt b/gpt4all-chat/CMakeLists.txt
index a5266fa9..17b0f46f 100644
--- a/gpt4all-chat/CMakeLists.txt
+++ b/gpt4all-chat/CMakeLists.txt
@@ -58,6 +58,11 @@ set (CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
 
 add_subdirectory(../gpt4all-backend llmodel)
 
+set(METAL_SHADER_FILE)
+if(${CMAKE_SYSTEM_NAME} MATCHES Darwin)
+  set(METAL_SHADER_FILE ../gpt4all-backend/llama.cpp-mainline/ggml-metal.metal)
+endif()
+
 qt_add_executable(chat
     main.cpp
     chat.h chat.cpp
@@ -72,6 +77,7 @@ qt_add_executable(chat
     server.h server.cpp
     logger.h logger.cpp
     sysinfo.h
+    ${METAL_SHADER_FILE}
 )
 
 qt_add_qml_module(chat
@@ -132,6 +138,12 @@ if(${CMAKE_SYSTEM_NAME} MATCHES Darwin)
     )
 endif()
 
+if(METAL_SHADER_FILE)
+  set_target_properties(chat PROPERTIES
+    RESOURCE ${METAL_SHADER_FILE}
+  )
+endif()
+
 target_compile_definitions(chat
     PRIVATE $<$<OR:$<CONFIG:Debug>,$<CONFIG:RelWithDebInfo>>:QT_QML_DEBUG>)
 target_link_libraries(chat