Revert "llama on Metal (#885)"

This reverts commit c55f81b860.
This commit is contained in:
Adam Treat 2023-06-09 15:08:46 -04:00
parent c55f81b860
commit b162b5c64e
7 changed files with 64 additions and 138 deletions

2
.gitmodules vendored
View File

@ -6,4 +6,4 @@
url = https://github.com/manyoso/llama.cpp.git url = https://github.com/manyoso/llama.cpp.git
[submodule "llama.cpp-mainline"] [submodule "llama.cpp-mainline"]
path = gpt4all-backend/llama.cpp-mainline path = gpt4all-backend/llama.cpp-mainline
url = https://github.com/nomic-ai/llama.cpp.git url = https://github.com/ggerganov/llama.cpp.git

View File

@ -39,9 +39,6 @@ endif()
include(llama.cpp.cmake) include(llama.cpp.cmake)
set(BUILD_VARIANTS default avxonly) set(BUILD_VARIANTS default avxonly)
if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
set(BUILD_VARIANTS ${BUILD_VARIANTS} metal)
endif()
set(CMAKE_VERBOSE_MAKEFILE ON) set(CMAKE_VERBOSE_MAKEFILE ON)
@ -57,20 +54,10 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
set(LLAMA_F16C ${GPT4ALL_ALLOW_NON_AVX}) set(LLAMA_F16C ${GPT4ALL_ALLOW_NON_AVX})
set(LLAMA_FMA ${GPT4ALL_ALLOW_NON_AVX}) set(LLAMA_FMA ${GPT4ALL_ALLOW_NON_AVX})
if (BUILD_VARIANT STREQUAL metal)
set(LLAMA_K_QUANTS YES)
set(LLAMA_METAL YES)
else()
set(LLAMA_K_QUANTS NO)
set(LLAMA_METAL NO)
endif()
# Include GGML # Include GGML
include_ggml(llama.cpp-mainline -mainline-${BUILD_VARIANT} ON) include_ggml(llama.cpp-mainline -mainline-${BUILD_VARIANT} ON)
if (NOT LLAMA_METAL) include_ggml(llama.cpp-230511 -230511-${BUILD_VARIANT} ON)
include_ggml(llama.cpp-230511 -230511-${BUILD_VARIANT} ON) include_ggml(llama.cpp-230519 -230519-${BUILD_VARIANT} ON)
include_ggml(llama.cpp-230519 -230519-${BUILD_VARIANT} ON)
endif()
# Function for preparing individual implementations # Function for preparing individual implementations
function(prepare_target TARGET_NAME BASE_LIB) function(prepare_target TARGET_NAME BASE_LIB)
@ -95,30 +82,29 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
LLAMA_VERSIONS=>=3 LLAMA_DATE=999999) LLAMA_VERSIONS=>=3 LLAMA_DATE=999999)
prepare_target(llamamodel-mainline llama-mainline) prepare_target(llamamodel-mainline llama-mainline)
if (NOT LLAMA_METAL) add_library(llamamodel-230519-${BUILD_VARIANT} SHARED
add_library(llamamodel-230519-${BUILD_VARIANT} SHARED llamamodel.cpp llmodel_shared.cpp)
llamamodel.cpp llmodel_shared.cpp) target_compile_definitions(llamamodel-230519-${BUILD_VARIANT} PRIVATE
target_compile_definitions(llamamodel-230519-${BUILD_VARIANT} PRIVATE LLAMA_VERSIONS===2 LLAMA_DATE=230519)
LLAMA_VERSIONS===2 LLAMA_DATE=230519) prepare_target(llamamodel-230519 llama-230519)
prepare_target(llamamodel-230519 llama-230519)
add_library(llamamodel-230511-${BUILD_VARIANT} SHARED
llamamodel.cpp llmodel_shared.cpp)
target_compile_definitions(llamamodel-230511-${BUILD_VARIANT} PRIVATE
LLAMA_VERSIONS=<=1 LLAMA_DATE=230511)
prepare_target(llamamodel-230511 llama-230511)
add_library(gptj-${BUILD_VARIANT} SHARED add_library(llamamodel-230511-${BUILD_VARIANT} SHARED
gptj.cpp utils.h utils.cpp llmodel_shared.cpp) llamamodel.cpp llmodel_shared.cpp)
prepare_target(gptj ggml-230511) target_compile_definitions(llamamodel-230511-${BUILD_VARIANT} PRIVATE
LLAMA_VERSIONS=<=1 LLAMA_DATE=230511)
prepare_target(llamamodel-230511 llama-230511)
add_library(mpt-${BUILD_VARIANT} SHARED add_library(gptj-${BUILD_VARIANT} SHARED
mpt.cpp utils.h utils.cpp llmodel_shared.cpp) gptj.cpp utils.h utils.cpp llmodel_shared.cpp)
prepare_target(mpt ggml-230511) prepare_target(gptj ggml-230511)
add_library(replit-${BUILD_VARIANT} SHARED add_library(mpt-${BUILD_VARIANT} SHARED
replit.cpp utils.h utils.cpp llmodel_shared.cpp) mpt.cpp utils.h utils.cpp llmodel_shared.cpp)
prepare_target(replit ggml-230511) prepare_target(mpt ggml-230511)
endif()
add_library(replit-${BUILD_VARIANT} SHARED
replit.cpp utils.h utils.cpp llmodel_shared.cpp)
prepare_target(replit ggml-230511)
endforeach() endforeach()
add_library(llmodel add_library(llmodel

@ -1 +1 @@
Subproject commit b33dee282f5d8032b5f780152732dc45cbf2d349 Subproject commit 5b57a5b72676540b6a45a3f527126299969ad241

View File

@ -34,7 +34,6 @@ endif()
# #
# Option list # Option list
# #
# some of the options here are commented out so they can be set "dynamically" before calling include_ggml()
# general # general
option(LLAMA_STATIC "llama: static link libraries" OFF) option(LLAMA_STATIC "llama: static link libraries" OFF)
@ -69,7 +68,6 @@ option(LLAMA_OPENBLAS "llama: use OpenBLAS"
#option(LLAMA_CUBLAS "llama: use cuBLAS" OFF) #option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
#option(LLAMA_CLBLAST "llama: use CLBlast" OFF) #option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
#option(LLAMA_METAL "llama: use Metal" OFF) #option(LLAMA_METAL "llama: use Metal" OFF)
#option(LLAMA_K_QUANTS "llama: use k-quants" ON)
set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor") set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels") set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels") set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
@ -265,32 +263,10 @@ function(include_ggml DIRECTORY SUFFIX WITH_LLAMA)
endif() endif()
set(GGML_SOURCES_QUANT_K ) set(GGML_SOURCES_QUANT_K )
set(GGML_METAL_SOURCES ) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${DIRECTORY}/ggml-quants-k.h)
if (LLAMA_K_QUANTS)
set(GGML_SOURCES_QUANT_K set(GGML_SOURCES_QUANT_K
${DIRECTORY}/k_quants.h ${DIRECTORY}/ggml-quants-k.h
${DIRECTORY}/k_quants.c) ${DIRECTORY}/ggml-quants-k.c)
if (LLAMA_METAL)
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
find_library(METAL_FRAMEWORK Metal REQUIRED)
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
find_library(METALPERFORMANCE_FRAMEWORK MetalPerformanceShaders REQUIRED)
set(GGML_METAL_SOURCES ${DIRECTORY}/ggml-metal.m ${DIRECTORY}/ggml-metal.h)
# get full path to the file
#add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")
# copy ggml-metal.metal to bin directory
configure_file(${DIRECTORY}/ggml-metal.metal bin/ggml-metal.metal COPYONLY)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
${FOUNDATION_LIBRARY}
${METAL_FRAMEWORK}
${METALKIT_FRAMEWORK}
${METALPERFORMANCE_FRAMEWORK}
)
endif()
endif() endif()
add_library(ggml${SUFFIX} OBJECT add_library(ggml${SUFFIX} OBJECT
@ -298,16 +274,8 @@ function(include_ggml DIRECTORY SUFFIX WITH_LLAMA)
${DIRECTORY}/ggml.h ${DIRECTORY}/ggml.h
${GGML_SOURCES_QUANT_K} ${GGML_SOURCES_QUANT_K}
${GGML_SOURCES_CUDA} ${GGML_SOURCES_CUDA}
${GGML_METAL_SOURCES}
${GGML_OPENCL_SOURCES}) ${GGML_OPENCL_SOURCES})
if (LLAMA_K_QUANTS)
target_compile_definitions(ggml${SUFFIX} PUBLIC GGML_USE_K_QUANTS)
endif()
if (LLAMA_METAL AND GGML_METAL_SOURCES)
target_compile_definitions(ggml${SUFFIX} PUBLIC GGML_USE_METAL GGML_METAL_NDEBUG)
endif()
target_include_directories(ggml${SUFFIX} PUBLIC ${DIRECTORY}) target_include_directories(ggml${SUFFIX} PUBLIC ${DIRECTORY})
target_compile_features(ggml${SUFFIX} PUBLIC c_std_11) # don't bump target_compile_features(ggml${SUFFIX} PUBLIC c_std_11) # don't bump
@ -327,9 +295,6 @@ function(include_ggml DIRECTORY SUFFIX WITH_LLAMA)
${DIRECTORY}/llama.h ${DIRECTORY}/llama.h
${DIRECTORY}/${LLAMA_UTIL_SOURCE_FILE}) ${DIRECTORY}/${LLAMA_UTIL_SOURCE_FILE})
if (LLAMA_METAL AND GGML_METAL_SOURCES)
target_compile_definitions(llama${SUFFIX} PUBLIC GGML_USE_METAL GGML_METAL_NDEBUG)
endif()
target_include_directories(llama${SUFFIX} PUBLIC ${DIRECTORY}) target_include_directories(llama${SUFFIX} PUBLIC ${DIRECTORY})
target_compile_features(llama${SUFFIX} PUBLIC cxx_std_11) # don't bump target_compile_features(llama${SUFFIX} PUBLIC cxx_std_11) # don't bump
@ -367,6 +332,32 @@ function(include_ggml DIRECTORY SUFFIX WITH_LLAMA)
target_compile_definitions(ggml${SUFFIX} PRIVATE GGML_USE_CLBLAST) target_compile_definitions(ggml${SUFFIX} PRIVATE GGML_USE_CLBLAST)
endif() endif()
if (LLAMA_METAL)
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
find_library(METAL_FRAMEWORK Metal REQUIRED)
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
find_library(METALPERFORMANCE_FRAMEWORK MetalPerformanceShaders REQUIRED)
set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h)
target_compile_definitions(llama${SUFFIX} PRIVATE
GGML_USE_METAL
GGML_METAL_NDEBUG)
# get full path to the file
#add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")
# copy ggml-metal.metal to bin directory
configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
${FOUNDATION_LIBRARY}
${METAL_FRAMEWORK}
${METALKIT_FRAMEWORK}
${METALPERFORMANCE_FRAMEWORK}
)
endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
message(STATUS "ARM detected") message(STATUS "ARM detected")
if (MSVC) if (MSVC)

View File

@ -115,12 +115,6 @@ bool LLamaModel::loadModel(const std::string &modelPath)
#if LLAMA_DATE <= 230511 #if LLAMA_DATE <= 230511
d_ptr->params.n_parts = params.n_parts; d_ptr->params.n_parts = params.n_parts;
#endif #endif
#ifdef GGML_USE_METAL
std::cerr << "llama.cpp: using Metal" << std::endl;
// metal always runs the whole model if n_gpu_layers is not 0, at least
// currently
d_ptr->params.n_gpu_layers = 1;
#endif
d_ptr->ctx = llama_init_from_file(modelPath.c_str(), d_ptr->params); d_ptr->ctx = llama_init_from_file(modelPath.c_str(), d_ptr->params);
if (!d_ptr->ctx) { if (!d_ptr->ctx) {
@ -234,30 +228,7 @@ DLL_EXPORT bool magic_match(std::istream& f) {
// Check version // Check version
uint32_t version = 0; uint32_t version = 0;
f.read(reinterpret_cast<char*>(&version), sizeof(version)); f.read(reinterpret_cast<char*>(&version), sizeof(version));
if (!(version LLAMA_VERSIONS)) { return version LLAMA_VERSIONS;
return false;
}
#ifdef GGML_USE_METAL
// Check quant supported on metal
// skip fields
off_t offset = sizeof(uint32_t) * 6; // n_vocab, n_embd, n_mult, n_head, n_layer, n_rot
f.seekg(offset, std::ios_base::cur);
uint32_t ftype;
f.read(reinterpret_cast<char*>(&ftype), sizeof(ftype)); // ftype
switch((enum llama_ftype) ftype) {
// currently supported on Metal https://github.com/ggerganov/llama.cpp/blob/ae9663f1887513e152839e91f61c513075a19422/ggml-metal.m#L51-L55
case LLAMA_FTYPE_MOSTLY_F16:
case LLAMA_FTYPE_MOSTLY_Q2_K:
case LLAMA_FTYPE_MOSTLY_Q4_0:
case LLAMA_FTYPE_MOSTLY_Q6_K:
case LLAMA_FTYPE_MOSTLY_Q4_K_S:
case LLAMA_FTYPE_MOSTLY_Q4_K_M:
return true;
default: // unsupported quant-type for Metal
return false;
}
#endif
return true;
} }
DLL_EXPORT LLModel *construct() { DLL_EXPORT LLModel *construct() {

View File

@ -121,30 +121,20 @@ LLModel *LLModel::construct(const std::string &modelPath, std::string buildVaria
if (!has_at_least_minimal_hardware()) if (!has_at_least_minimal_hardware())
return nullptr; return nullptr;
//TODO: Auto-detect CUDA/OpenCL
if (buildVariant == "auto") {
if (requires_avxonly()) {
buildVariant = "avxonly";
} else {
buildVariant = "default";
}
}
// Read magic // Read magic
std::ifstream f(modelPath, std::ios::binary); std::ifstream f(modelPath, std::ios::binary);
if (!f) return nullptr; if (!f) return nullptr;
// Get correct implementation // Get correct implementation
const LLModel::Implementation* impl = nullptr; auto impl = implementation(f, buildVariant);
if (!impl) return nullptr;
#if defined(__APPLE__) && defined(__arm64__) // FIXME: See if metal works for intel macs
if (buildVariant == "auto") {
impl = implementation(f, "metal");
}
#endif
if (!impl) {
//TODO: Auto-detect CUDA/OpenCL
if (buildVariant == "auto") {
if (requires_avxonly()) {
buildVariant = "avxonly";
} else {
buildVariant = "default";
}
}
impl = implementation(f, buildVariant);
if (!impl) return nullptr;
}
f.close(); f.close();
// Construct and return llmodel implementation // Construct and return llmodel implementation
return impl->construct(); return impl->construct();

View File

@ -58,11 +58,6 @@ set (CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
add_subdirectory(../gpt4all-backend llmodel) add_subdirectory(../gpt4all-backend llmodel)
set(METAL_SHADER_FILE)
if(${CMAKE_SYSTEM_NAME} MATCHES Darwin)
set(METAL_SHADER_FILE ../gpt4all-backend/llama.cpp-mainline/ggml-metal.metal)
endif()
qt_add_executable(chat qt_add_executable(chat
main.cpp main.cpp
chat.h chat.cpp chat.h chat.cpp
@ -77,7 +72,6 @@ qt_add_executable(chat
server.h server.cpp server.h server.cpp
logger.h logger.cpp logger.h logger.cpp
sysinfo.h sysinfo.h
${METAL_SHADER_FILE}
) )
qt_add_qml_module(chat qt_add_qml_module(chat
@ -138,12 +132,6 @@ if(${CMAKE_SYSTEM_NAME} MATCHES Darwin)
) )
endif() endif()
if(METAL_SHADER_FILE)
set_target_properties(chat PROPERTIES
RESOURCE ${METAL_SHADER_FILE}
)
endif()
target_compile_definitions(chat target_compile_definitions(chat
PRIVATE $<$<OR:$<CONFIG:Debug>,$<CONFIG:RelWithDebInfo>>:QT_QML_DEBUG>) PRIVATE $<$<OR:$<CONFIG:Debug>,$<CONFIG:RelWithDebInfo>>:QT_QML_DEBUG>)
target_link_libraries(chat target_link_libraries(chat