Mirror of https://github.com/nomic-ai/gpt4all.git (synced 2024-10-01 01:06:10 -04:00)
backend: rebase llama.cpp submodule on latest upstream (#2694)
* Adds support for GPT-NeoX, Gemma 2, OpenELM, ChatGLM, and Jais architectures (all with Kompute support)
* Also enables Kompute support for StarCoder2, XVERSE, Command R, and OLMo
* Includes a number of Kompute resource management fixes

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
This commit is contained in:
parent 398ef34a87
commit 290c629442
@@ -90,25 +90,25 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
else()
set(GPT4ALL_ALLOW_NON_AVX ON)
endif()
set(LLAMA_AVX2 ${GPT4ALL_ALLOW_NON_AVX})
set(LLAMA_F16C ${GPT4ALL_ALLOW_NON_AVX})
set(LLAMA_FMA ${GPT4ALL_ALLOW_NON_AVX})
set(GGML_AVX2 ${GPT4ALL_ALLOW_NON_AVX})
set(GGML_F16C ${GPT4ALL_ALLOW_NON_AVX})
set(GGML_FMA ${GPT4ALL_ALLOW_NON_AVX})

set(LLAMA_METAL OFF)
set(LLAMA_KOMPUTE OFF)
set(LLAMA_VULKAN OFF)
set(LLAMA_CUDA OFF)
set(LLAMA_ROCM OFF)
set(GGML_METAL OFF)
set(GGML_KOMPUTE OFF)
set(GGML_VULKAN OFF)
set(GGML_CUDA OFF)
set(GGML_ROCM OFF)
if (BUILD_VARIANT MATCHES metal)
set(LLAMA_METAL ON)
set(GGML_METAL ON)
elseif (BUILD_VARIANT MATCHES kompute)
set(LLAMA_KOMPUTE ON)
set(GGML_KOMPUTE ON)
elseif (BUILD_VARIANT MATCHES vulkan)
set(LLAMA_VULKAN ON)
set(GGML_VULKAN ON)
elseif (BUILD_VARIANT MATCHES cuda)
set(LLAMA_CUDA ON)
set(GGML_CUDA ON)
elseif (BUILD_VARIANT MATCHES rocm)
set(LLAMA_HIPBLAS ON)
set(GGML_HIPBLAS ON)
endif()

# Include GGML
@@ -1 +1 @@
Subproject commit dc51763303bd2dae0a2aecf0f205f3eee3f59620
Subproject commit 2bae44a07fddf10512005c9475b73c09d38364a2
@@ -7,7 +7,7 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
#
# some of the options here are commented out so they can be set "dynamically" before calling include_ggml()

set(LLAMA_LLAMAFILE_DEFAULT ON)
set(GGML_LLAMAFILE_DEFAULT ON)

# general
option(LLAMA_STATIC "llama: static link libraries" OFF)
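As the comment in the hunk above says, the backend options are left commented out so a consumer can set them before pulling this file in. A minimal sketch of that pattern with the GGML-prefixed names introduced here (the file name, DIRECTORY value, and suffix are hypothetical, for illustration only):

set(DIRECTORY llama.cpp-mainline)  # assumed: path to the llama.cpp checkout used by include_ggml()
set(GGML_CUDA ON)                  # pick the backend for this build variant
include(llama.cpp.cmake)           # assumed name of the file this diff modifies
include_ggml(-cuda)                # hypothetical SUFFIX; produces targets ggml-cuda and llama-cuda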
@@ -22,15 +22,15 @@ option(LLAMA_GPROF "llama: enable gprof"
option(LLAMA_FATAL_WARNINGS "llama: enable -Werror flag" OFF)

# instruction set specific
#option(LLAMA_AVX "llama: enable AVX" ON)
#option(LLAMA_AVX2 "llama: enable AVX2" ON)
#option(LLAMA_AVX512 "llama: enable AVX512" OFF)
#option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF)
#option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF)
#option(LLAMA_FMA "llama: enable FMA" ON)
#option(GGML_AVX "ggml: enable AVX" ON)
#option(GGML_AVX2 "ggml: enable AVX2" ON)
#option(GGML_AVX512 "ggml: enable AVX512" OFF)
#option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF)
#option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI" OFF)
#option(GGML_FMA "ggml: enable FMA" ON)
# in MSVC F16C is implied with AVX2/AVX512
#if (NOT MSVC)
# option(LLAMA_F16C "llama: enable F16C" ON)
# option(GGML_F16C "ggml: enable F16C" ON)
#endif()

if (WIN32)
@@ -38,40 +38,46 @@ if (WIN32)
endif()

# 3rd party libs
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
option(LLAMA_BLAS "llama: use BLAS" OFF)
option(LLAMA_LLAMAFILE "llama: use llamafile SGEMM" ${LLAMA_LLAMAFILE_DEFAULT})
set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
#option(LLAMA_CUDA "llama: use CUDA" OFF)
option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
option(LLAMA_CUDA_FORCE_MMQ "llama: use mmq kernels instead of cuBLAS" OFF)
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF)
set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
"llama: max. batch size for using peer access")
option(LLAMA_CUDA_NO_PEER_COPY "llama: do not use peer to peer copies" OFF)
#option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF)
#option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
#option(LLAMA_VULKAN "llama: use Vulkan" OFF)
option(LLAMA_VULKAN_CHECK_RESULTS "llama: run Vulkan op checks" OFF)
option(LLAMA_VULKAN_DEBUG "llama: enable Vulkan debug output" OFF)
option(LLAMA_VULKAN_VALIDATE "llama: enable Vulkan validation" OFF)
option(LLAMA_VULKAN_RUN_TESTS "llama: run Vulkan tests" OFF)
#option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})
option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF)
option(LLAMA_METAL_SHADER_DEBUG "llama: compile Metal with -fno-fast-math" OFF)
set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING
"llama: metal minimum macOS version")
set(LLAMA_METAL_STD "" CACHE STRING "llama: metal standard version (-std flag)")
#option(LLAMA_KOMPUTE "llama: use Kompute" OFF)
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
set(LLAMA_SCHED_MAX_COPIES "4" CACHE STRING "llama: max input copies for pipeline parallelism")
option(GGML_ACCELERATE "ggml: enable Accelerate framework" ON)
option(GGML_BLAS "ggml: use BLAS" OFF)
option(GGML_LLAMAFILE "ggml: use llamafile SGEMM" ${GGML_LLAMAFILE_DEFAULT})
set(GGML_BLAS_VENDOR "Generic" CACHE STRING "ggml: BLAS library vendor")

#option(GGML_CUDA "ggml: use CUDA" OFF)
option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF)
option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF)
option(GGML_CUDA_FORCE_CUBLAS "ggml: always use cuBLAS instead of mmq kernels" OFF)
set (GGML_CUDA_DMMV_X "32" CACHE STRING "ggml: x stride for dmmv CUDA kernels")
set (GGML_CUDA_MMV_Y "1" CACHE STRING "ggml: y block size for mmv CUDA kernels")
option(GGML_CUDA_F16 "ggml: use 16 bit floats for some calculations" OFF)
set (GGML_CUDA_KQUANTS_ITER "2" CACHE STRING
"ggml: iters./thread per block for Q2_K/Q6_K")
set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
"ggml: max. batch size for using peer access")
option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF)
option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM" OFF)
option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
option(GGML_CUDA_USE_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" OFF)

#option(GGML_HIPBLAS "ggml: use hipBLAS" OFF)
option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF)
#option(GGML_VULKAN "ggml: use Vulkan" OFF)
option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF)
option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
#option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
option(GGML_METAL_SHADER_DEBUG "ggml: compile Metal with -fno-fast-math" OFF)
set(GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING
"ggml: metal minimum macOS version")
set(GGML_METAL_STD "" CACHE STRING "ggml: metal standard version (-std flag)")
#option(GGML_KOMPUTE "ggml: use Kompute" OFF)
option(GGML_QKK_64 "ggml: use super-block size of 64 for k-quants" OFF)
set(GGML_SCHED_MAX_COPIES "4" CACHE STRING "ggml: max input copies for pipeline parallelism")

# add perf arguments
option(LLAMA_PERF "llama: enable perf" OFF)

#
# Compile flags
@@ -80,14 +86,14 @@ option(LLAMA_PERF "llama: enable perf"
set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads REQUIRED)

list(APPEND GGML_COMPILE_DEFS GGML_SCHED_MAX_COPIES=${LLAMA_SCHED_MAX_COPIES})
list(APPEND GGML_COMPILE_DEFS GGML_SCHED_MAX_COPIES=${GGML_SCHED_MAX_COPIES})

# enable libstdc++ assertions for debug builds
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
list(APPEND GGML_COMPILE_DEFS $<$<CONFIG:Debug>:_GLIBCXX_ASSERTIONS>)
endif()

if (APPLE AND LLAMA_ACCELERATE)
if (APPLE AND GGML_ACCELERATE)
find_library(ACCELERATE_FRAMEWORK Accelerate)
if (ACCELERATE_FRAMEWORK)
message(STATUS "Accelerate framework found")
@@ -101,7 +107,7 @@ if (APPLE AND LLAMA_ACCELERATE)
endif()
endif()

if (LLAMA_BLAS)
if (GGML_BLAS)
if (LLAMA_STATIC)
set(BLA_STATIC ON)
endif()
@@ -109,7 +115,7 @@ if (LLAMA_BLAS)
set(BLA_SIZEOF_INTEGER 8)
endif()

set(BLA_VENDOR ${LLAMA_BLAS_VENDOR})
set(BLA_VENDOR ${GGML_BLAS_VENDOR})
find_package(BLAS)

if (BLAS_FOUND)
@@ -119,24 +125,24 @@ if (LLAMA_BLAS)
# BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake.
# see https://gitlab.kitware.com/cmake/cmake/-/issues/20268
find_package(PkgConfig REQUIRED)
if (${LLAMA_BLAS_VENDOR} MATCHES "Generic")
if (${GGML_BLAS_VENDOR} MATCHES "Generic")
pkg_check_modules(DepBLAS REQUIRED blas)
elseif (${LLAMA_BLAS_VENDOR} MATCHES "OpenBLAS")
elseif (${GGML_BLAS_VENDOR} MATCHES "OpenBLAS")
# As of openblas v0.3.22, the 64-bit is named openblas64.pc
pkg_check_modules(DepBLAS openblas64)
if (NOT DepBLAS_FOUND)
pkg_check_modules(DepBLAS REQUIRED openblas)
endif()
elseif (${LLAMA_BLAS_VENDOR} MATCHES "FLAME")
elseif (${GGML_BLAS_VENDOR} MATCHES "FLAME")
pkg_check_modules(DepBLAS REQUIRED blis)
elseif (${LLAMA_BLAS_VENDOR} MATCHES "ATLAS")
elseif (${GGML_BLAS_VENDOR} MATCHES "ATLAS")
pkg_check_modules(DepBLAS REQUIRED blas-atlas)
elseif (${LLAMA_BLAS_VENDOR} MATCHES "FlexiBLAS")
elseif (${GGML_BLAS_VENDOR} MATCHES "FlexiBLAS")
pkg_check_modules(DepBLAS REQUIRED flexiblas_api)
elseif (${LLAMA_BLAS_VENDOR} MATCHES "Intel")
elseif (${GGML_BLAS_VENDOR} MATCHES "Intel")
# all Intel* libraries share the same include path
pkg_check_modules(DepBLAS REQUIRED mkl-sdl)
elseif (${LLAMA_BLAS_VENDOR} MATCHES "NVHPC")
elseif (${GGML_BLAS_VENDOR} MATCHES "NVHPC")
# this doesn't provide pkg-config
# suggest to assign BLAS_INCLUDE_DIRS on your own
if ("${NVHPC_VERSION}" STREQUAL "")
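For reference, the vendor strings matched above are the values a consumer would place in GGML_BLAS_VENDOR before this file runs. An illustrative sketch (values are examples, not from this commit):

set(GGML_BLAS ON)                 # enables the find_package(BLAS) path above
set(GGML_BLAS_VENDOR "OpenBLAS")  # copied into BLA_VENDOR; pkg-config then probes openblas64, then openblas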
@@ -170,7 +176,7 @@ if (LLAMA_BLAS)

list(APPEND GGML_COMPILE_DEFS GGML_USE_OPENBLAS)

if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${LLAMA_BLAS_VENDOR} MATCHES "Generic" OR ${LLAMA_BLAS_VENDOR} MATCHES "Intel"))
if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${GGML_BLAS_VENDOR} MATCHES "Generic" OR ${GGML_BLAS_VENDOR} MATCHES "Intel"))
list(APPEND GGML_COMPILE_DEFS GGML_BLAS_USE_MKL)
endif()
@@ -179,18 +185,18 @@ if (LLAMA_BLAS)
else()
message(WARNING "BLAS not found, please refer to "
"https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
" to set correct LLAMA_BLAS_VENDOR")
" to set correct GGML_BLAS_VENDOR")
endif()
endif()

if (LLAMA_LLAMAFILE)
if (GGML_LLAMAFILE)
list(APPEND GGML_COMPILE_DEFS GGML_USE_LLAMAFILE)

set(GGML_HEADERS_LLAMAFILE ${DIRECTORY}/sgemm.h)
set(GGML_SOURCES_LLAMAFILE ${DIRECTORY}/sgemm.cpp)
set(GGML_HEADERS_LLAMAFILE ${DIRECTORY}/ggml/src/llamafile/sgemm.h)
set(GGML_SOURCES_LLAMAFILE ${DIRECTORY}/ggml/src/llamafile/sgemm.cpp)
endif()

if (LLAMA_QKK_64)
if (GGML_QKK_64)
list(APPEND GGML_COMPILE_DEFS GGML_QKK_64)
endif()
@@ -361,8 +367,9 @@ function(include_ggml SUFFIX)
# libraries
#

if (LLAMA_CUDA)
cmake_minimum_required(VERSION 3.17)
if (GGML_CUDA)
cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES

get_property(LANGS GLOBAL PROPERTY ENABLED_LANGUAGES)
if (NOT CUDA IN_LIST LANGS)
message(FATAL_ERROR "The CUDA language must be enabled.")
@@ -376,35 +383,71 @@ function(include_ggml SUFFIX)
# 60 == f16 CUDA intrinsics
# 61 == integer CUDA intrinsics
# 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
set(GGML_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
set(GGML_CUDA_ARCHITECTURES "60;61;70;75") # needed for f16 CUDA intrinsics
else()
set(GGML_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
set(GGML_CUDA_ARCHITECTURES "52;61;70;75") # lowest CUDA 12 standard + lowest for integer intrinsics
#set(GGML_CUDA_ARCHITECTURES "OFF") # use this to compile much faster, but only F16 models work
endif()
endif()
message(STATUS "Using CUDA architectures: ${GGML_CUDA_ARCHITECTURES}")

set(GGML_HEADERS_CUDA ${DIRECTORY}/ggml-cuda.h)
set(GGML_HEADERS_CUDA ${DIRECTORY}/ggml/include/ggml-cuda.h)
file(GLOB GGML_HEADERS_CUDA "${DIRECTORY}/ggml/src/ggml-cuda/*.cuh")
list(APPEND GGML_HEADERS_CUDA "${DIRECTORY}/ggml/include/ggml-cuda.h")

file(GLOB GGML_SOURCES_CUDA "${DIRECTORY}/ggml-cuda/*.cu")
list(APPEND GGML_SOURCES_CUDA "${DIRECTORY}/ggml-cuda.cu")
file(GLOB GGML_SOURCES_CUDA "${DIRECTORY}/ggml/src/ggml-cuda/*.cu")
list(APPEND GGML_SOURCES_CUDA "${DIRECTORY}/ggml/src/ggml-cuda.cu")
file(GLOB SRCS "${DIRECTORY}/ggml/src/ggml-cuda/template-instances/fattn-wmma*.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
file(GLOB SRCS "${DIRECTORY}/ggml/src/ggml-cuda/template-instances/mmq*.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})

if (GGML_CUDA_FA_ALL_QUANTS)
file(GLOB SRCS "${DIRECTORY}/ggml/src/ggml-cuda/template-instances/fattn-vec*.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
else()
file(GLOB SRCS "${DIRECTORY}/ggml/src/ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
file(GLOB SRCS "${DIRECTORY}/ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
file(GLOB SRCS "${DIRECTORY}/ggml/src/ggml-cuda/template-instances/fattn-vec*f16-f16.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
endif()

list(APPEND GGML_COMPILE_DEFS_PUBLIC GGML_USE_CUDA)
if (LLAMA_CUDA_FORCE_DMMV)

list(APPEND GGML_COMPILE_DEFS GGML_CUDA_DMMV_X=${GGML_CUDA_DMMV_X})
list(APPEND GGML_COMPILE_DEFS GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y})
list(APPEND GGML_COMPILE_DEFS K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER})
list(APPEND GGML_COMPILE_DEFS GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})

if (GGML_CUDA_USE_GRAPHS)
list(APPEND GGML_COMPILE_DEFS GGML_CUDA_USE_GRAPHS)
endif()

if (GGML_CUDA_FORCE_DMMV)
list(APPEND GGML_COMPILE_DEFS GGML_CUDA_FORCE_DMMV)
endif()
if (LLAMA_CUDA_FORCE_MMQ)

if (GGML_CUDA_FORCE_MMQ)
list(APPEND GGML_COMPILE_DEFS GGML_CUDA_FORCE_MMQ)
endif()
list(APPEND GGML_COMPILE_DEFS GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
list(APPEND GGML_COMPILE_DEFS GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
if (LLAMA_CUDA_F16)

if (GGML_CUDA_FORCE_CUBLAS)
list(APPEND GGML_COMPILE_DEFS GGML_CUDA_FORCE_CUBLAS)
endif()

if (GGML_CUDA_NO_VMM)
list(APPEND GGML_COMPILE_DEFS GGML_CUDA_NO_VMM)
endif()

if (GGML_CUDA_F16)
list(APPEND GGML_COMPILE_DEFS GGML_CUDA_F16)
endif()
list(APPEND GGML_COMPILE_DEFS K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
list(APPEND GGML_COMPILE_DEFS GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE})
if (LLAMA_CUDA_NO_PEER_COPY)

if (GGML_CUDA_NO_PEER_COPY)
list(APPEND GGML_COMPILE_DEFS GGML_CUDA_NO_PEER_COPY)
endif()
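A short sketch of how the CUDA options above interact when set by a caller (illustrative values only): GGML_CUDA_F16 switches GGML_CUDA_ARCHITECTURES to the list that starts at compute capability 60 for the f16 intrinsics, and leaving GGML_CUDA_FA_ALL_QUANTS off keeps the FlashAttention template instances at the reduced q4_0/q8_0/f16 set globbed above.

set(GGML_CUDA ON)
set(GGML_CUDA_F16 ON)             # selects the "60;61;70;75" architecture list above
set(GGML_CUDA_FA_ALL_QUANTS OFF)  # compile only the default fattn-vec template instances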
@@ -422,45 +465,34 @@ function(include_ggml SUFFIX)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver)
endif()

if (LLAMA_CLBLAST)
find_package(CLBlast REQUIRED)

set(GGML_HEADERS_OPENCL ${DIRECTORY}/ggml-opencl.h)
set(GGML_SOURCES_OPENCL ${DIRECTORY}/ggml-opencl.cpp)

list(APPEND GGML_COMPILE_DEFS_PUBLIC GGML_USE_CLBLAST)

set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} clblast)
endif()

if (LLAMA_VULKAN)
if (GGML_VULKAN)
find_package(Vulkan REQUIRED)

set(GGML_HEADERS_VULKAN ${DIRECTORY}/ggml-vulkan.h)
set(GGML_SOURCES_VULKAN ${DIRECTORY}/ggml-vulkan.cpp)
set(GGML_HEADERS_VULKAN ${DIRECTORY}/ggml/include/ggml-vulkan.h)
set(GGML_SOURCES_VULKAN ${DIRECTORY}/ggml/src/ggml-vulkan.cpp)

list(APPEND GGML_COMPILE_DEFS_PUBLIC GGML_USE_VULKAN)

if (LLAMA_VULKAN_CHECK_RESULTS)
if (GGML_VULKAN_CHECK_RESULTS)
list(APPEND GGML_COMPILE_DEFS GGML_VULKAN_CHECK_RESULTS)
endif()

if (LLAMA_VULKAN_DEBUG)
if (GGML_VULKAN_DEBUG)
list(APPEND GGML_COMPILE_DEFS GGML_VULKAN_DEBUG)
endif()

if (LLAMA_VULKAN_VALIDATE)
if (GGML_VULKAN_VALIDATE)
list(APPEND GGML_COMPILE_DEFS GGML_VULKAN_VALIDATE)
endif()

if (LLAMA_VULKAN_RUN_TESTS)
if (GGML_VULKAN_RUN_TESTS)
list(APPEND GGML_COMPILE_DEFS GGML_VULKAN_RUN_TESTS)
endif()

set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} Vulkan::Vulkan)
endif()

if (LLAMA_HIPBLAS)
if (GGML_HIPBLAS)
if ($ENV{ROCM_PATH})
set(ROCM_PATH $ENV{ROCM_PATH})
else()
@@ -490,32 +522,32 @@ function(include_ggml SUFFIX)

message(STATUS "HIP and hipBLAS found")

set(GGML_HEADERS_ROCM ${DIRECTORY}/ggml-cuda.h)
set(GGML_HEADERS_ROCM ${DIRECTORY}/ggml/include/ggml-cuda.h)

file(GLOB GGML_SOURCES_ROCM "${DIRECTORY}/ggml-rocm/*.cu")
list(APPEND GGML_SOURCES_ROCM "${DIRECTORY}/ggml-rocm.cu")
file(GLOB GGML_SOURCES_ROCM "${DIRECTORY}/ggml/src/ggml-rocm/*.cu")
list(APPEND GGML_SOURCES_ROCM "${DIRECTORY}/ggml/src/ggml-rocm.cu")

list(APPEND GGML_COMPILE_DEFS_PUBLIC GGML_USE_HIPBLAS GGML_USE_CUDA)

if (LLAMA_HIP_UMA)
if (GGML_HIP_UMA)
list(APPEND GGML_COMPILE_DEFS GGML_HIP_UMA)
endif()

if (LLAMA_CUDA_FORCE_DMMV)
if (GGML_CUDA_FORCE_DMMV)
list(APPEND GGML_COMPILE_DEFS GGML_CUDA_FORCE_DMMV)
endif()

if (LLAMA_CUDA_FORCE_MMQ)
if (GGML_CUDA_FORCE_MMQ)
list(APPEND GGML_COMPILE_DEFS GGML_CUDA_FORCE_MMQ)
endif()

if (LLAMA_CUDA_NO_PEER_COPY)
if (GGML_CUDA_NO_PEER_COPY)
list(APPEND GGML_COMPILE_DEFS GGML_CUDA_NO_PEER_COPY)
endif()

list(APPEND GGML_COMPILE_DEFS GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
list(APPEND GGML_COMPILE_DEFS GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
list(APPEND GGML_COMPILE_DEFS K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
list(APPEND GGML_COMPILE_DEFS GGML_CUDA_DMMV_X=${GGML_CUDA_DMMV_X})
list(APPEND GGML_COMPILE_DEFS GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y})
list(APPEND GGML_COMPILE_DEFS K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER})

if (CXX_IS_HIPCC)
set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
@@ -533,9 +565,9 @@ function(include_ggml SUFFIX)

set(LLAMA_DIR ${CMAKE_CURRENT_SOURCE_DIR}/${DIRECTORY})

if (LLAMA_KOMPUTE AND NOT GGML_KOMPUTE_ONCE)
if (GGML_KOMPUTE AND NOT GGML_KOMPUTE_ONCE)
set(GGML_KOMPUTE_ONCE ON PARENT_SCOPE)
if (NOT EXISTS "${LLAMA_DIR}/kompute/CMakeLists.txt")
if (NOT EXISTS "${LLAMA_DIR}/ggml/src/kompute/CMakeLists.txt")
message(FATAL_ERROR "Kompute not found")
endif()
message(STATUS "Kompute found")
@@ -559,12 +591,12 @@ function(include_ggml SUFFIX)
set(spv_file ${CMAKE_CURRENT_BINARY_DIR}/${OP_FILE}.spv)
add_custom_command(
OUTPUT ${spv_file}
DEPENDS ${LLAMA_DIR}/${source}
${LLAMA_DIR}/kompute-shaders/common.comp
${LLAMA_DIR}/kompute-shaders/op_getrows.comp
${LLAMA_DIR}/kompute-shaders/op_mul_mv_q_n_pre.comp
${LLAMA_DIR}/kompute-shaders/op_mul_mv_q_n.comp
COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${LLAMA_DIR}/${source}
DEPENDS ${LLAMA_DIR}/ggml/src/kompute-shaders/${source}
${LLAMA_DIR}/ggml/src/kompute-shaders/common.comp
${LLAMA_DIR}/ggml/src/kompute-shaders/op_getrows.comp
${LLAMA_DIR}/ggml/src/kompute-shaders/op_mul_mv_q_n_pre.comp
${LLAMA_DIR}/ggml/src/kompute-shaders/op_mul_mv_q_n.comp
COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${LLAMA_DIR}/ggml/src/kompute-shaders/${source}
COMMENT "Compiling ${source} to ${source}.spv"
)
@@ -610,39 +642,39 @@ function(include_ggml SUFFIX)
set(KOMPUTE_OPT_BUILT_IN_VULKAN_HEADER_TAG "v1.3.239" CACHE STRING "Kompute Vulkan headers tag")
set(KOMPUTE_OPT_LOG_LEVEL Critical CACHE STRING "Kompute log level")
set(FMT_INSTALL OFF)
add_subdirectory(${LLAMA_DIR}/kompute)
add_subdirectory(${LLAMA_DIR}/ggml/src/kompute)

# Compile our shaders
compile_shader(SOURCES
kompute-shaders/op_scale.comp
kompute-shaders/op_scale_8.comp
kompute-shaders/op_add.comp
kompute-shaders/op_addrow.comp
kompute-shaders/op_mul.comp
kompute-shaders/op_silu.comp
kompute-shaders/op_relu.comp
kompute-shaders/op_gelu.comp
kompute-shaders/op_softmax.comp
kompute-shaders/op_norm.comp
kompute-shaders/op_rmsnorm.comp
kompute-shaders/op_diagmask.comp
kompute-shaders/op_mul_mat_mat_f32.comp
kompute-shaders/op_mul_mat_f16.comp
kompute-shaders/op_mul_mat_q8_0.comp
kompute-shaders/op_mul_mat_q4_0.comp
kompute-shaders/op_mul_mat_q4_1.comp
kompute-shaders/op_mul_mat_q6_k.comp
kompute-shaders/op_getrows_f32.comp
kompute-shaders/op_getrows_f16.comp
kompute-shaders/op_getrows_q4_0.comp
kompute-shaders/op_getrows_q4_1.comp
kompute-shaders/op_getrows_q6_k.comp
kompute-shaders/op_rope_f16.comp
kompute-shaders/op_rope_f32.comp
kompute-shaders/op_cpy_f16_f16.comp
kompute-shaders/op_cpy_f16_f32.comp
kompute-shaders/op_cpy_f32_f16.comp
kompute-shaders/op_cpy_f32_f32.comp
op_scale.comp
op_scale_8.comp
op_add.comp
op_addrow.comp
op_mul.comp
op_silu.comp
op_relu.comp
op_gelu.comp
op_softmax.comp
op_norm.comp
op_rmsnorm.comp
op_diagmask.comp
op_mul_mat_mat_f32.comp
op_mul_mat_f16.comp
op_mul_mat_q8_0.comp
op_mul_mat_q4_0.comp
op_mul_mat_q4_1.comp
op_mul_mat_q6_k.comp
op_getrows_f32.comp
op_getrows_f16.comp
op_getrows_q4_0.comp
op_getrows_q4_1.comp
op_getrows_q6_k.comp
op_rope_f16.comp
op_rope_f32.comp
op_cpy_f16_f16.comp
op_cpy_f16_f32.comp
op_cpy_f32_f16.comp
op_cpy_f32_f32.comp
)

# Create a custom target for our generated shaders
@@ -687,12 +719,12 @@ function(include_ggml SUFFIX)
)
endif()

if (LLAMA_KOMPUTE)
if (GGML_KOMPUTE)
list(APPEND GGML_COMPILE_DEFS VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1)

# Add the stamp to the main sources to ensure dependency tracking
set(GGML_SOURCES_KOMPUTE ${LLAMA_DIR}/ggml-kompute.cpp ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp)
set(GGML_HEADERS_KOMPUTE ${LLAMA_DIR}/ggml-kompute.h)
set(GGML_SOURCES_KOMPUTE ${LLAMA_DIR}/ggml/src/ggml-kompute.cpp ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp)
set(GGML_HEADERS_KOMPUTE ${LLAMA_DIR}/ggml/include/ggml-kompute.h)

list(APPEND GGML_COMPILE_DEFS_PUBLIC GGML_USE_KOMPUTE)
@@ -701,7 +733,7 @@ function(include_ggml SUFFIX)

set(CUDA_CXX_FLAGS "")

if (LLAMA_CUDA)
if (GGML_CUDA)
set(CUDA_FLAGS -use_fast_math)

if (LLAMA_FATAL_WARNINGS)
@@ -748,25 +780,25 @@ function(include_ggml SUFFIX)
endif()
endif()

if (LLAMA_METAL)
if (GGML_METAL)
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
find_library(METAL_FRAMEWORK Metal REQUIRED)
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)

message(STATUS "Metal framework found")
set(GGML_HEADERS_METAL ${DIRECTORY}/ggml-metal.h)
set(GGML_SOURCES_METAL ${DIRECTORY}/ggml-metal.m)
set(GGML_HEADERS_METAL ${DIRECTORY}/ggml/include/ggml-metal.h)
set(GGML_SOURCES_METAL ${DIRECTORY}/ggml/src/ggml-metal.m)

list(APPEND GGML_COMPILE_DEFS_PUBLIC GGML_USE_METAL)
if (LLAMA_METAL_NDEBUG)
if (GGML_METAL_NDEBUG)
list(APPEND GGML_COMPILE_DEFS GGML_METAL_NDEBUG)
endif()

# copy ggml-common.h and ggml-metal.metal to bin directory
configure_file(${DIRECTORY}/ggml-common.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h COPYONLY)
configure_file(${DIRECTORY}/ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
configure_file(${DIRECTORY}/ggml/src/ggml-common.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h COPYONLY)
configure_file(${DIRECTORY}/ggml/src/ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)

if (LLAMA_METAL_SHADER_DEBUG)
if (GGML_METAL_SHADER_DEBUG)
# custom command to do the following:
# xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
# xcrun -sdk macosx metallib ggml-metal.air -o default.metallib
@@ -782,13 +814,13 @@ function(include_ggml SUFFIX)
endif()

# Append macOS metal versioning flags
if (LLAMA_METAL_MACOSX_VERSION_MIN)
message(STATUS "Adding -mmacosx-version-min=${LLAMA_METAL_MACOSX_VERSION_MIN} flag to metal compilation")
list(APPEND XC_FLAGS -mmacosx-version-min=${LLAMA_METAL_MACOSX_VERSION_MIN})
if (GGML_METAL_MACOSX_VERSION_MIN)
message(STATUS "Adding -mmacosx-version-min=${GGML_METAL_MACOSX_VERSION_MIN} flag to metal compilation")
list(APPEND XC_FLAGS -mmacosx-version-min=${GGML_METAL_MACOSX_VERSION_MIN})
endif()
if (LLAMA_METAL_STD)
message(STATUS "Adding -std=${LLAMA_METAL_STD} flag to metal compilation")
list(APPEND XC_FLAGS -std=${LLAMA_METAL_STD})
if (GGML_METAL_STD)
message(STATUS "Adding -std=${GGML_METAL_STD} flag to metal compilation")
list(APPEND XC_FLAGS -std=${GGML_METAL_STD})
endif()

set(GGML_METALLIB ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib)
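The two blocks above turn the Metal cache variables into xcrun flags (XC_FLAGS). A sketch of a caller setting them (the version and standard values are made-up examples):

set(GGML_METAL ON)
set(GGML_METAL_MACOSX_VERSION_MIN "12.3")  # adds -mmacosx-version-min=12.3 to the metal compile
set(GGML_METAL_STD "metal3.0")             # adds -std=metal3.0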
@@ -799,7 +831,7 @@ function(include_ggml SUFFIX)
COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air
COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h
COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal
DEPENDS ${DIRECTORY}/ggml-metal.metal ${DIRECTORY}/ggml-common.h
DEPENDS ${DIRECTORY}/ggml/src/ggml-metal.metal ${DIRECTORY}/ggml/src/ggml-common.h
COMMENT "Compiling Metal kernels"
)
set_source_files_properties(${GGML_METALLIB} DIRECTORY ${CMAKE_SOURCE_DIR} PROPERTIES GENERATED ON)
@@ -853,49 +885,49 @@ function(include_ggml SUFFIX)
CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$"))
message(STATUS "x86 detected")
if (MSVC)
if (LLAMA_AVX512)
if (GGML_AVX512)
list(APPEND ARCH_FLAGS /arch:AVX512)
# MSVC has no compile-time flags enabling specific
# AVX512 extensions, neither it defines the
# macros corresponding to the extensions.
# Do it manually.
if (LLAMA_AVX512_VBMI)
if (GGML_AVX512_VBMI)
list(APPEND GGML_COMPILE_DEFS $<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
list(APPEND GGML_COMPILE_DEFS $<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
endif()
if (LLAMA_AVX512_VNNI)
if (GGML_AVX512_VNNI)
list(APPEND GGML_COMPILE_DEFS $<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
list(APPEND GGML_COMPILE_DEFS $<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
endif()
elseif (LLAMA_AVX2)
elseif (GGML_AVX2)
list(APPEND ARCH_FLAGS /arch:AVX2)
elseif (LLAMA_AVX)
elseif (GGML_AVX)
list(APPEND ARCH_FLAGS /arch:AVX)
endif()
else()
if (LLAMA_NATIVE)
if (GGML_NATIVE)
list(APPEND ARCH_FLAGS -march=native)
endif()
if (LLAMA_F16C)
if (GGML_F16C)
list(APPEND ARCH_FLAGS -mf16c)
endif()
if (LLAMA_FMA)
if (GGML_FMA)
list(APPEND ARCH_FLAGS -mfma)
endif()
if (LLAMA_AVX)
if (GGML_AVX)
list(APPEND ARCH_FLAGS -mavx)
endif()
if (LLAMA_AVX2)
if (GGML_AVX2)
list(APPEND ARCH_FLAGS -mavx2)
endif()
if (LLAMA_AVX512)
if (GGML_AVX512)
list(APPEND ARCH_FLAGS -mavx512f)
list(APPEND ARCH_FLAGS -mavx512bw)
endif()
if (LLAMA_AVX512_VBMI)
if (GGML_AVX512_VBMI)
list(APPEND ARCH_FLAGS -mavx512vbmi)
endif()
if (LLAMA_AVX512_VNNI)
if (GGML_AVX512_VNNI)
list(APPEND ARCH_FLAGS -mavx512vnni)
endif()
endif()
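Since these ISA toggles are likewise meant to be set by the caller, a sketch of a portable (non -march=native) x86 configuration (illustrative values only):

set(GGML_NATIVE OFF)  # skip -march=native
set(GGML_AVX ON)
set(GGML_AVX2 ON)
set(GGML_F16C ON)
set(GGML_FMA ON)
# GCC/Clang: -mavx -mavx2 -mf16c -mfma as shown above; MSVC collapses this to /arch:AVX2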
@@ -914,7 +946,7 @@ function(include_ggml SUFFIX)
list(APPEND GGML_COMPILE_OPTS "$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
list(APPEND GGML_COMPILE_OPTS "$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")

if (LLAMA_CUDA)
if (GGML_CUDA)
list(APPEND CUDA_CXX_FLAGS ${ARCH_FLAGS})
list(JOIN CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED) # pass host compiler flags as a single argument
if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "")
@@ -926,24 +958,26 @@ function(include_ggml SUFFIX)
# ggml

add_library(ggml${SUFFIX} OBJECT
${DIRECTORY}/ggml.c
${DIRECTORY}/ggml.h
${DIRECTORY}/ggml-alloc.c
${DIRECTORY}/ggml-alloc.h
${DIRECTORY}/ggml-backend.c
${DIRECTORY}/ggml-backend.h
${DIRECTORY}/ggml-quants.c
${DIRECTORY}/ggml-quants.h
${DIRECTORY}/ggml/include/ggml.h
${DIRECTORY}/ggml/include/ggml-alloc.h
${DIRECTORY}/ggml/include/ggml-backend.h
${DIRECTORY}/ggml/src/ggml.c
${DIRECTORY}/ggml/src/ggml-alloc.c
${DIRECTORY}/ggml/src/ggml-backend.c
${DIRECTORY}/ggml/src/ggml-quants.c
${DIRECTORY}/ggml/src/ggml-quants.h
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE}
${GGML_SOURCES_VULKAN} ${GGML_HEADERS_VULKAN}
${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM}
${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
${DIRECTORY}/ggml/src/ggml-aarch64.c
${DIRECTORY}/ggml/src/ggml-aarch64.h
)

target_include_directories(ggml${SUFFIX} PUBLIC ${DIRECTORY} ${LLAMA_EXTRA_INCLUDES})
target_include_directories(ggml${SUFFIX} PUBLIC ${DIRECTORY}/ggml/include ${LLAMA_EXTRA_INCLUDES})
target_include_directories(ggml${SUFFIX} PRIVATE ${DIRECTORY}/ggml/src)
target_compile_features(ggml${SUFFIX} PUBLIC c_std_11) # don't bump

target_link_libraries(ggml${SUFFIX} PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
@@ -955,14 +989,15 @@ function(include_ggml SUFFIX)
# llama

add_library(llama${SUFFIX} STATIC
${DIRECTORY}/llama.cpp
${DIRECTORY}/llama.h
${DIRECTORY}/unicode.h
${DIRECTORY}/unicode.cpp
${DIRECTORY}/unicode-data.cpp
${DIRECTORY}/include/llama.h
${DIRECTORY}/src/llama.cpp
${DIRECTORY}/src/unicode.h
${DIRECTORY}/src/unicode.cpp
${DIRECTORY}/src/unicode-data.cpp
)

target_include_directories(llama${SUFFIX} PUBLIC ${DIRECTORY})
target_include_directories(llama${SUFFIX} PUBLIC ${DIRECTORY}/include ${DIRECTORY}/ggml/include)
target_include_directories(llama${SUFFIX} PRIVATE ${DIRECTORY}/src)
target_compile_features (llama${SUFFIX} PUBLIC cxx_std_11) # don't bump

target_link_libraries(llama${SUFFIX} PRIVATE
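The targets defined in these last hunks, ggml${SUFFIX} (an OBJECT library) and llama${SUFFIX} (a STATIC library), are what a caller of include_ggml() ultimately links. A hypothetical consumer, with made-up target and suffix names:

add_library(llmodel SHARED llamamodel.cpp)          # hypothetical wrapper library
target_link_libraries(llmodel PRIVATE llama-cuda)   # llama${SUFFIX} from include_ggml(-cuda)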
@@ -30,9 +30,9 @@

#ifdef GGML_USE_KOMPUTE
# include <ggml-kompute.h>
#elif GGML_USE_VULKAN
#elif defined(GGML_USE_VULKAN)
# include <ggml-vulkan.h>
#elif GGML_USE_CUDA
#elif defined(GGML_USE_CUDA)
# include <ggml-cuda.h>
#endif
@@ -51,14 +51,14 @@ static const std::vector<const char *> KNOWN_ARCHES {
// "grok", -- 314B parameters
"gpt2",
// "gptj", -- no inference code
// "gptneox", -- no inference code
"gptneox",
"mpt",
"baichuan",
"starcoder",
// "persimmon", -- CUDA generates garbage
"refact",
"bert",
"nomic-bert",
// "jina-bert-v2", -- Assertion `i01 >= 0 && i01 < ne01' failed.
"bloom",
"stablelm",
"qwen",
@@ -72,12 +72,20 @@ static const std::vector<const char *> KNOWN_ARCHES {
"internlm2",
// "minicpm", -- CUDA generates garbage
"gemma",
"gemma2",
"starcoder2",
// "mamba", -- CUDA missing SSM_CONV
"xverse",
"command-r",
// "dbrx", -- 16x12B parameters
"olmo",
"openelm",
// "arctic", -- 10B+128x3.66B parameters
// "deepseek2", -- excessive VRAM requirements
"chatglm",
// "bitnet", -- tensor not within file bounds?
// "t5", -- seq2seq model
"jais",
};

static const std::vector<const char *> EMBEDDING_ARCHES {
@@ -103,6 +111,16 @@ static void llama_log_callback(enum ggml_log_level level, const char *text, void
}
}

#ifdef GGML_USE_CUDA
static void cuda_log_callback(enum ggml_log_level level, const char *text, void *userdata)
{
(void)userdata;
if (llama_verbose() || level <= GGML_LOG_LEVEL_WARN) {
fputs(text, stderr);
}
}
#endif

struct gpt_params {
int32_t seed = -1; // RNG seed
int32_t n_keep = 0; // number of tokens to keep from initial prompt
@@ -515,9 +533,8 @@ std::vector<LLModel::Token> LLamaModel::tokenize(PromptContext &ctx, const std::
{
const bool wantBOS = ctx.n_past == 0 && ctx.tokens.empty();
const bool useBOS = wantBOS && shouldAddBOS();
auto strCat = wantBOS && !special ? " " + str : str; // insert leading space ourselves, llama.cpp fork doesn't anymore
std::vector<LLModel::Token> fres(strCat.size()+4);
auto fres_len = llama_tokenize(d_ptr->model, strCat.c_str(), strCat.length(), fres.data(), fres.size(), useBOS, special);
std::vector<LLModel::Token> fres(str.length() + 4);
auto fres_len = llama_tokenize(d_ptr->model, str.c_str(), str.length(), fres.data(), fres.size(), useBOS, special);
fres.resize(fres_len);
return fres;
}
@@ -525,10 +542,10 @@ std::vector<LLModel::Token> LLamaModel::tokenize(PromptContext &ctx, const std::
std::string LLamaModel::tokenToString(Token id) const
{
std::vector<char> result(8, 0);
const int n_tokens = llama_token_to_piece(d_ptr->model, id, result.data(), result.size(), false);
const int n_tokens = llama_token_to_piece(d_ptr->model, id, result.data(), result.size(), 0, false);
if (n_tokens < 0) {
result.resize(-n_tokens);
int check = llama_token_to_piece(d_ptr->model, id, result.data(), result.size(), false);
int check = llama_token_to_piece(d_ptr->model, id, result.data(), result.size(), 0, false);
GGML_ASSERT(check == -n_tokens);
}
else {
@@ -1170,6 +1187,9 @@ DLL_EXPORT bool is_arch_supported(const char *arch)
DLL_EXPORT LLModel *construct()
{
llama_log_set(llama_log_callback, nullptr);
#ifdef GGML_USE_CUDA
ggml_backend_cuda_log_set_callback(cuda_log_callback, nullptr);
#endif
return new LLamaModel;
}
}