backend: rebase llama.cpp submodule on latest upstream (#2694)

* Adds support for GPT-NeoX, Gemma 2, OpenELM, ChatGLM, and Jais architectures (all with Kompute support)
* Also enables Kompute support for StarCoder2, XVERSE, Command R, and OLMo
* Includes a number of Kompute resource management fixes

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Author: Jared Van Bortel <jared@nomic.ai>
Date:   2024-07-19 14:52:58 -04:00 (committed by GitHub)
Commit: 290c629442 (parent 398ef34a87)
4 changed files with 266 additions and 211 deletions


@@ -90,25 +90,25 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
else()
set(GPT4ALL_ALLOW_NON_AVX ON)
endif()
set(LLAMA_AVX2 ${GPT4ALL_ALLOW_NON_AVX})
set(LLAMA_F16C ${GPT4ALL_ALLOW_NON_AVX})
set(LLAMA_FMA ${GPT4ALL_ALLOW_NON_AVX})
set(GGML_AVX2 ${GPT4ALL_ALLOW_NON_AVX})
set(GGML_F16C ${GPT4ALL_ALLOW_NON_AVX})
set(GGML_FMA ${GPT4ALL_ALLOW_NON_AVX})
set(LLAMA_METAL OFF)
set(LLAMA_KOMPUTE OFF)
set(LLAMA_VULKAN OFF)
set(LLAMA_CUDA OFF)
set(LLAMA_ROCM OFF)
set(GGML_METAL OFF)
set(GGML_KOMPUTE OFF)
set(GGML_VULKAN OFF)
set(GGML_CUDA OFF)
set(GGML_ROCM OFF)
if (BUILD_VARIANT MATCHES metal)
set(LLAMA_METAL ON)
set(GGML_METAL ON)
elseif (BUILD_VARIANT MATCHES kompute)
set(LLAMA_KOMPUTE ON)
set(GGML_KOMPUTE ON)
elseif (BUILD_VARIANT MATCHES vulkan)
set(LLAMA_VULKAN ON)
set(GGML_VULKAN ON)
elseif (BUILD_VARIANT MATCHES cuda)
set(LLAMA_CUDA ON)
set(GGML_CUDA ON)
elseif (BUILD_VARIANT MATCHES rocm)
set(LLAMA_HIPBLAS ON)
set(GGML_HIPBLAS ON)
endif()
# Include GGML
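
This hunk tracks upstream llama.cpp's rename of the backend toggles from LLAMA_* to GGML_*. As an illustration only (not part of this commit), a downstream CMake script that still sets the old names could forward them to the new ones; the backend list below is taken from the toggles shown above.

# Hypothetical compatibility shim (illustration, not part of this commit):
# forward any legacy LLAMA_* backend toggles to the GGML_* names that the
# rebased submodule now expects.
foreach(backend METAL KOMPUTE VULKAN CUDA HIPBLAS)
    if(DEFINED LLAMA_${backend} AND NOT DEFINED GGML_${backend})
        set(GGML_${backend} "${LLAMA_${backend}}")
    endif()
endforeach()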

@@ -1 +1 @@
Subproject commit dc51763303bd2dae0a2aecf0f205f3eee3f59620
Subproject commit 2bae44a07fddf10512005c9475b73c09d38364a2


@@ -7,7 +7,7 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
#
# some of the options here are commented out so they can be set "dynamically" before calling include_ggml()
set(LLAMA_LLAMAFILE_DEFAULT ON)
set(GGML_LLAMAFILE_DEFAULT ON)
# general
option(LLAMA_STATIC "llama: static link libraries" OFF)
@@ -22,15 +22,15 @@ option(LLAMA_GPROF "llama: enable gprof"
option(LLAMA_FATAL_WARNINGS "llama: enable -Werror flag" OFF)
# instruction set specific
#option(LLAMA_AVX "llama: enable AVX" ON)
#option(LLAMA_AVX2 "llama: enable AVX2" ON)
#option(LLAMA_AVX512 "llama: enable AVX512" OFF)
#option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF)
#option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF)
#option(LLAMA_FMA "llama: enable FMA" ON)
#option(GGML_AVX "ggml: enable AVX" ON)
#option(GGML_AVX2 "ggml: enable AVX2" ON)
#option(GGML_AVX512 "ggml: enable AVX512" OFF)
#option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF)
#option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI" OFF)
#option(GGML_FMA "ggml: enable FMA" ON)
# in MSVC F16C is implied with AVX2/AVX512
#if (NOT MSVC)
# option(LLAMA_F16C "llama: enable F16C" ON)
# option(GGML_F16C "ggml: enable F16C" ON)
#endif()
if (WIN32)
@@ -38,37 +38,43 @@ if (WIN32)
endif()
# 3rd party libs
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
option(LLAMA_BLAS "llama: use BLAS" OFF)
option(LLAMA_LLAMAFILE "llama: use llamafile SGEMM" ${LLAMA_LLAMAFILE_DEFAULT})
set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
#option(LLAMA_CUDA "llama: use CUDA" OFF)
option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
option(LLAMA_CUDA_FORCE_MMQ "llama: use mmq kernels instead of cuBLAS" OFF)
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF)
set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
"llama: max. batch size for using peer access")
option(LLAMA_CUDA_NO_PEER_COPY "llama: do not use peer to peer copies" OFF)
#option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF)
#option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
#option(LLAMA_VULKAN "llama: use Vulkan" OFF)
option(LLAMA_VULKAN_CHECK_RESULTS "llama: run Vulkan op checks" OFF)
option(LLAMA_VULKAN_DEBUG "llama: enable Vulkan debug output" OFF)
option(LLAMA_VULKAN_VALIDATE "llama: enable Vulkan validation" OFF)
option(LLAMA_VULKAN_RUN_TESTS "llama: run Vulkan tests" OFF)
#option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})
option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF)
option(LLAMA_METAL_SHADER_DEBUG "llama: compile Metal with -fno-fast-math" OFF)
set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING
"llama: metal minimum macOS version")
set(LLAMA_METAL_STD "" CACHE STRING "llama: metal standard version (-std flag)")
#option(LLAMA_KOMPUTE "llama: use Kompute" OFF)
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
set(LLAMA_SCHED_MAX_COPIES "4" CACHE STRING "llama: max input copies for pipeline parallelism")
option(GGML_ACCELERATE "ggml: enable Accelerate framework" ON)
option(GGML_BLAS "ggml: use BLAS" OFF)
option(GGML_LLAMAFILE "ggml: use llamafile SGEMM" ${GGML_LLAMAFILE_DEFAULT})
set(GGML_BLAS_VENDOR "Generic" CACHE STRING "ggml: BLAS library vendor")
#option(GGML_CUDA "ggml: use CUDA" OFF)
option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF)
option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF)
option(GGML_CUDA_FORCE_CUBLAS "ggml: always use cuBLAS instead of mmq kernels" OFF)
set (GGML_CUDA_DMMV_X "32" CACHE STRING "ggml: x stride for dmmv CUDA kernels")
set (GGML_CUDA_MMV_Y "1" CACHE STRING "ggml: y block size for mmv CUDA kernels")
option(GGML_CUDA_F16 "ggml: use 16 bit floats for some calculations" OFF)
set (GGML_CUDA_KQUANTS_ITER "2" CACHE STRING
"ggml: iters./thread per block for Q2_K/Q6_K")
set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
"ggml: max. batch size for using peer access")
option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF)
option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM" OFF)
option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
option(GGML_CUDA_USE_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" OFF)
#option(GGML_HIPBLAS "ggml: use hipBLAS" OFF)
option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF)
#option(GGML_VULKAN "ggml: use Vulkan" OFF)
option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF)
option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
#option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
option(GGML_METAL_SHADER_DEBUG "ggml: compile Metal with -fno-fast-math" OFF)
set(GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING
"ggml: metal minimum macOS version")
set(GGML_METAL_STD "" CACHE STRING "ggml: metal standard version (-std flag)")
#option(GGML_KOMPUTE "ggml: use Kompute" OFF)
option(GGML_QKK_64 "ggml: use super-block size of 64 for k-quants" OFF)
set(GGML_SCHED_MAX_COPIES "4" CACHE STRING "ggml: max input copies for pipeline parallelism")
# add perf arguments
option(LLAMA_PERF "llama: enable perf" OFF)
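
As the comment at the top of this file notes, the backend options are left commented out so the caller can set them as plain variables per build variant before GGML is included. Below is a minimal usage sketch, assuming DIRECTORY already points at the llama.cpp submodule and using an illustrative "-vulkan" suffix.

# Usage sketch (assumptions: DIRECTORY is already set to the submodule path,
# the "-vulkan" suffix is illustrative). Each include_ggml() call builds one
# ggml/llama variant with a single GPU backend enabled.
set(GGML_VULKAN ON)
set(GGML_CUDA OFF)
set(GGML_KOMPUTE OFF)
set(GGML_METAL OFF)
include_ggml(-vulkan)  # defines the ggml-vulkan and llama-vulkan targets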
@@ -80,14 +86,14 @@ option(LLAMA_PERF "llama: enable perf"
set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads REQUIRED)
list(APPEND GGML_COMPILE_DEFS GGML_SCHED_MAX_COPIES=${LLAMA_SCHED_MAX_COPIES})
list(APPEND GGML_COMPILE_DEFS GGML_SCHED_MAX_COPIES=${GGML_SCHED_MAX_COPIES})
# enable libstdc++ assertions for debug builds
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
list(APPEND GGML_COMPILE_DEFS $<$<CONFIG:Debug>:_GLIBCXX_ASSERTIONS>)
endif()
if (APPLE AND LLAMA_ACCELERATE)
if (APPLE AND GGML_ACCELERATE)
find_library(ACCELERATE_FRAMEWORK Accelerate)
if (ACCELERATE_FRAMEWORK)
message(STATUS "Accelerate framework found")
@@ -101,7 +107,7 @@ if (APPLE AND LLAMA_ACCELERATE)
endif()
endif()
if (LLAMA_BLAS)
if (GGML_BLAS)
if (LLAMA_STATIC)
set(BLA_STATIC ON)
endif()
@@ -109,7 +115,7 @@ if (LLAMA_BLAS)
set(BLA_SIZEOF_INTEGER 8)
endif()
set(BLA_VENDOR ${LLAMA_BLAS_VENDOR})
set(BLA_VENDOR ${GGML_BLAS_VENDOR})
find_package(BLAS)
if (BLAS_FOUND)
@@ -119,24 +125,24 @@ if (LLAMA_BLAS)
# BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake.
# see https://gitlab.kitware.com/cmake/cmake/-/issues/20268
find_package(PkgConfig REQUIRED)
if (${LLAMA_BLAS_VENDOR} MATCHES "Generic")
if (${GGML_BLAS_VENDOR} MATCHES "Generic")
pkg_check_modules(DepBLAS REQUIRED blas)
elseif (${LLAMA_BLAS_VENDOR} MATCHES "OpenBLAS")
elseif (${GGML_BLAS_VENDOR} MATCHES "OpenBLAS")
# As of openblas v0.3.22, the 64-bit is named openblas64.pc
pkg_check_modules(DepBLAS openblas64)
if (NOT DepBLAS_FOUND)
pkg_check_modules(DepBLAS REQUIRED openblas)
endif()
elseif (${LLAMA_BLAS_VENDOR} MATCHES "FLAME")
elseif (${GGML_BLAS_VENDOR} MATCHES "FLAME")
pkg_check_modules(DepBLAS REQUIRED blis)
elseif (${LLAMA_BLAS_VENDOR} MATCHES "ATLAS")
elseif (${GGML_BLAS_VENDOR} MATCHES "ATLAS")
pkg_check_modules(DepBLAS REQUIRED blas-atlas)
elseif (${LLAMA_BLAS_VENDOR} MATCHES "FlexiBLAS")
elseif (${GGML_BLAS_VENDOR} MATCHES "FlexiBLAS")
pkg_check_modules(DepBLAS REQUIRED flexiblas_api)
elseif (${LLAMA_BLAS_VENDOR} MATCHES "Intel")
elseif (${GGML_BLAS_VENDOR} MATCHES "Intel")
# all Intel* libraries share the same include path
pkg_check_modules(DepBLAS REQUIRED mkl-sdl)
elseif (${LLAMA_BLAS_VENDOR} MATCHES "NVHPC")
elseif (${GGML_BLAS_VENDOR} MATCHES "NVHPC")
# this doesn't provide pkg-config
# suggest to assign BLAS_INCLUDE_DIRS on your own
if ("${NVHPC_VERSION}" STREQUAL "")
@@ -170,7 +176,7 @@ if (LLAMA_BLAS)
list(APPEND GGML_COMPILE_DEFS GGML_USE_OPENBLAS)
if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${LLAMA_BLAS_VENDOR} MATCHES "Generic" OR ${LLAMA_BLAS_VENDOR} MATCHES "Intel"))
if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${GGML_BLAS_VENDOR} MATCHES "Generic" OR ${GGML_BLAS_VENDOR} MATCHES "Intel"))
list(APPEND GGML_COMPILE_DEFS GGML_BLAS_USE_MKL)
endif()
@@ -179,18 +185,18 @@ if (LLAMA_BLAS)
else()
message(WARNING "BLAS not found, please refer to "
"https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
" to set correct LLAMA_BLAS_VENDOR")
" to set correct GGML_BLAS_VENDOR")
endif()
endif()
if (LLAMA_LLAMAFILE)
if (GGML_LLAMAFILE)
list(APPEND GGML_COMPILE_DEFS GGML_USE_LLAMAFILE)
set(GGML_HEADERS_LLAMAFILE ${DIRECTORY}/sgemm.h)
set(GGML_SOURCES_LLAMAFILE ${DIRECTORY}/sgemm.cpp)
set(GGML_HEADERS_LLAMAFILE ${DIRECTORY}/ggml/src/llamafile/sgemm.h)
set(GGML_SOURCES_LLAMAFILE ${DIRECTORY}/ggml/src/llamafile/sgemm.cpp)
endif()
if (LLAMA_QKK_64)
if (GGML_QKK_64)
list(APPEND GGML_COMPILE_DEFS GGML_QKK_64)
endif()
@@ -361,8 +367,9 @@ function(include_ggml SUFFIX)
# libraries
#
if (LLAMA_CUDA)
cmake_minimum_required(VERSION 3.17)
if (GGML_CUDA)
cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES
get_property(LANGS GLOBAL PROPERTY ENABLED_LANGUAGES)
if (NOT CUDA IN_LIST LANGS)
message(FATAL_ERROR "The CUDA language must be enabled.")
@@ -376,35 +383,71 @@ function(include_ggml SUFFIX)
# 60 == f16 CUDA intrinsics
# 61 == integer CUDA intrinsics
# 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
set(GGML_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
set(GGML_CUDA_ARCHITECTURES "60;61;70;75") # needed for f16 CUDA intrinsics
else()
set(GGML_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
set(GGML_CUDA_ARCHITECTURES "52;61;70;75") # lowest CUDA 12 standard + lowest for integer intrinsics
#set(GGML_CUDA_ARCHITECTURES "OFF") # use this to compile much faster, but only F16 models work
endif()
endif()
message(STATUS "Using CUDA architectures: ${GGML_CUDA_ARCHITECTURES}")
set(GGML_HEADERS_CUDA ${DIRECTORY}/ggml-cuda.h)
set(GGML_HEADERS_CUDA ${DIRECTORY}/ggml/include/ggml-cuda.h)
file(GLOB GGML_HEADERS_CUDA "${DIRECTORY}/ggml/src/ggml-cuda/*.cuh")
list(APPEND GGML_HEADERS_CUDA "${DIRECTORY}/ggml/include/ggml-cuda.h")
file(GLOB GGML_SOURCES_CUDA "${DIRECTORY}/ggml-cuda/*.cu")
list(APPEND GGML_SOURCES_CUDA "${DIRECTORY}/ggml-cuda.cu")
file(GLOB GGML_SOURCES_CUDA "${DIRECTORY}/ggml/src/ggml-cuda/*.cu")
list(APPEND GGML_SOURCES_CUDA "${DIRECTORY}/ggml/src/ggml-cuda.cu")
file(GLOB SRCS "${DIRECTORY}/ggml/src/ggml-cuda/template-instances/fattn-wmma*.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
file(GLOB SRCS "${DIRECTORY}/ggml/src/ggml-cuda/template-instances/mmq*.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
if (GGML_CUDA_FA_ALL_QUANTS)
file(GLOB SRCS "${DIRECTORY}/ggml/src/ggml-cuda/template-instances/fattn-vec*.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
else()
file(GLOB SRCS "${DIRECTORY}/ggml/src/ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
file(GLOB SRCS "${DIRECTORY}/ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
file(GLOB SRCS "${DIRECTORY}/ggml/src/ggml-cuda/template-instances/fattn-vec*f16-f16.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
endif()
list(APPEND GGML_COMPILE_DEFS_PUBLIC GGML_USE_CUDA)
if (LLAMA_CUDA_FORCE_DMMV)
list(APPEND GGML_COMPILE_DEFS GGML_CUDA_DMMV_X=${GGML_CUDA_DMMV_X})
list(APPEND GGML_COMPILE_DEFS GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y})
list(APPEND GGML_COMPILE_DEFS K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER})
list(APPEND GGML_COMPILE_DEFS GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
if (GGML_CUDA_USE_GRAPHS)
list(APPEND GGML_COMPILE_DEFS GGML_CUDA_USE_GRAPHS)
endif()
if (GGML_CUDA_FORCE_DMMV)
list(APPEND GGML_COMPILE_DEFS GGML_CUDA_FORCE_DMMV)
endif()
if (LLAMA_CUDA_FORCE_MMQ)
if (GGML_CUDA_FORCE_MMQ)
list(APPEND GGML_COMPILE_DEFS GGML_CUDA_FORCE_MMQ)
endif()
list(APPEND GGML_COMPILE_DEFS GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
list(APPEND GGML_COMPILE_DEFS GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
if (LLAMA_CUDA_F16)
if (GGML_CUDA_FORCE_CUBLAS)
list(APPEND GGML_COMPILE_DEFS GGML_CUDA_FORCE_CUBLAS)
endif()
if (GGML_CUDA_NO_VMM)
list(APPEND GGML_COMPILE_DEFS GGML_CUDA_NO_VMM)
endif()
if (GGML_CUDA_F16)
list(APPEND GGML_COMPILE_DEFS GGML_CUDA_F16)
endif()
list(APPEND GGML_COMPILE_DEFS K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
list(APPEND GGML_COMPILE_DEFS GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE})
if (LLAMA_CUDA_NO_PEER_COPY)
if (GGML_CUDA_NO_PEER_COPY)
list(APPEND GGML_COMPILE_DEFS GGML_CUDA_NO_PEER_COPY)
endif()
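
The GGML_COMPILE_DEFS and GGML_COMPILE_DEFS_PUBLIC lists accumulated here are plain compile definitions rather than cache options. A hedged sketch of how such lists are typically attached to the ggml target follows; the actual application site lies outside the hunks shown in this diff.

# Sketch only: typical application of the accumulated definition lists.
# The real call sites are not part of the hunks shown above.
target_compile_definitions(ggml${SUFFIX} PRIVATE ${GGML_COMPILE_DEFS})
target_compile_definitions(ggml${SUFFIX} PUBLIC ${GGML_COMPILE_DEFS_PUBLIC})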
@@ -422,45 +465,34 @@ function(include_ggml SUFFIX)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver)
endif()
if (LLAMA_CLBLAST)
find_package(CLBlast REQUIRED)
set(GGML_HEADERS_OPENCL ${DIRECTORY}/ggml-opencl.h)
set(GGML_SOURCES_OPENCL ${DIRECTORY}/ggml-opencl.cpp)
list(APPEND GGML_COMPILE_DEFS_PUBLIC GGML_USE_CLBLAST)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} clblast)
endif()
if (LLAMA_VULKAN)
if (GGML_VULKAN)
find_package(Vulkan REQUIRED)
set(GGML_HEADERS_VULKAN ${DIRECTORY}/ggml-vulkan.h)
set(GGML_SOURCES_VULKAN ${DIRECTORY}/ggml-vulkan.cpp)
set(GGML_HEADERS_VULKAN ${DIRECTORY}/ggml/include/ggml-vulkan.h)
set(GGML_SOURCES_VULKAN ${DIRECTORY}/ggml/src/ggml-vulkan.cpp)
list(APPEND GGML_COMPILE_DEFS_PUBLIC GGML_USE_VULKAN)
if (LLAMA_VULKAN_CHECK_RESULTS)
if (GGML_VULKAN_CHECK_RESULTS)
list(APPEND GGML_COMPILE_DEFS GGML_VULKAN_CHECK_RESULTS)
endif()
if (LLAMA_VULKAN_DEBUG)
if (GGML_VULKAN_DEBUG)
list(APPEND GGML_COMPILE_DEFS GGML_VULKAN_DEBUG)
endif()
if (LLAMA_VULKAN_VALIDATE)
if (GGML_VULKAN_VALIDATE)
list(APPEND GGML_COMPILE_DEFS GGML_VULKAN_VALIDATE)
endif()
if (LLAMA_VULKAN_RUN_TESTS)
if (GGML_VULKAN_RUN_TESTS)
list(APPEND GGML_COMPILE_DEFS GGML_VULKAN_RUN_TESTS)
endif()
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} Vulkan::Vulkan)
endif()
if (LLAMA_HIPBLAS)
if (GGML_HIPBLAS)
if ($ENV{ROCM_PATH})
set(ROCM_PATH $ENV{ROCM_PATH})
else()
@@ -490,32 +522,32 @@ function(include_ggml SUFFIX)
message(STATUS "HIP and hipBLAS found")
set(GGML_HEADERS_ROCM ${DIRECTORY}/ggml-cuda.h)
set(GGML_HEADERS_ROCM ${DIRECTORY}/ggml/include/ggml-cuda.h)
file(GLOB GGML_SOURCES_ROCM "${DIRECTORY}/ggml-rocm/*.cu")
list(APPEND GGML_SOURCES_ROCM "${DIRECTORY}/ggml-rocm.cu")
file(GLOB GGML_SOURCES_ROCM "${DIRECTORY}/ggml/src/ggml-rocm/*.cu")
list(APPEND GGML_SOURCES_ROCM "${DIRECTORY}/ggml/src/ggml-rocm.cu")
list(APPEND GGML_COMPILE_DEFS_PUBLIC GGML_USE_HIPBLAS GGML_USE_CUDA)
if (LLAMA_HIP_UMA)
if (GGML_HIP_UMA)
list(APPEND GGML_COMPILE_DEFS GGML_HIP_UMA)
endif()
if (LLAMA_CUDA_FORCE_DMMV)
if (GGML_CUDA_FORCE_DMMV)
list(APPEND GGML_COMPILE_DEFS GGML_CUDA_FORCE_DMMV)
endif()
if (LLAMA_CUDA_FORCE_MMQ)
if (GGML_CUDA_FORCE_MMQ)
list(APPEND GGML_COMPILE_DEFS GGML_CUDA_FORCE_MMQ)
endif()
if (LLAMA_CUDA_NO_PEER_COPY)
if (GGML_CUDA_NO_PEER_COPY)
list(APPEND GGML_COMPILE_DEFS GGML_CUDA_NO_PEER_COPY)
endif()
list(APPEND GGML_COMPILE_DEFS GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
list(APPEND GGML_COMPILE_DEFS GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
list(APPEND GGML_COMPILE_DEFS K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
list(APPEND GGML_COMPILE_DEFS GGML_CUDA_DMMV_X=${GGML_CUDA_DMMV_X})
list(APPEND GGML_COMPILE_DEFS GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y})
list(APPEND GGML_COMPILE_DEFS K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER})
if (CXX_IS_HIPCC)
set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
@@ -533,9 +565,9 @@ function(include_ggml SUFFIX)
set(LLAMA_DIR ${CMAKE_CURRENT_SOURCE_DIR}/${DIRECTORY})
if (LLAMA_KOMPUTE AND NOT GGML_KOMPUTE_ONCE)
if (GGML_KOMPUTE AND NOT GGML_KOMPUTE_ONCE)
set(GGML_KOMPUTE_ONCE ON PARENT_SCOPE)
if (NOT EXISTS "${LLAMA_DIR}/kompute/CMakeLists.txt")
if (NOT EXISTS "${LLAMA_DIR}/ggml/src/kompute/CMakeLists.txt")
message(FATAL_ERROR "Kompute not found")
endif()
message(STATUS "Kompute found")
@@ -559,12 +591,12 @@ function(include_ggml SUFFIX)
set(spv_file ${CMAKE_CURRENT_BINARY_DIR}/${OP_FILE}.spv)
add_custom_command(
OUTPUT ${spv_file}
DEPENDS ${LLAMA_DIR}/${source}
${LLAMA_DIR}/kompute-shaders/common.comp
${LLAMA_DIR}/kompute-shaders/op_getrows.comp
${LLAMA_DIR}/kompute-shaders/op_mul_mv_q_n_pre.comp
${LLAMA_DIR}/kompute-shaders/op_mul_mv_q_n.comp
COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${LLAMA_DIR}/${source}
DEPENDS ${LLAMA_DIR}/ggml/src/kompute-shaders/${source}
${LLAMA_DIR}/ggml/src/kompute-shaders/common.comp
${LLAMA_DIR}/ggml/src/kompute-shaders/op_getrows.comp
${LLAMA_DIR}/ggml/src/kompute-shaders/op_mul_mv_q_n_pre.comp
${LLAMA_DIR}/ggml/src/kompute-shaders/op_mul_mv_q_n.comp
COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${LLAMA_DIR}/ggml/src/kompute-shaders/${source}
COMMENT "Compiling ${source} to ${source}.spv"
)
@@ -610,39 +642,39 @@ function(include_ggml SUFFIX)
set(KOMPUTE_OPT_BUILT_IN_VULKAN_HEADER_TAG "v1.3.239" CACHE STRING "Kompute Vulkan headers tag")
set(KOMPUTE_OPT_LOG_LEVEL Critical CACHE STRING "Kompute log level")
set(FMT_INSTALL OFF)
add_subdirectory(${LLAMA_DIR}/kompute)
add_subdirectory(${LLAMA_DIR}/ggml/src/kompute)
# Compile our shaders
compile_shader(SOURCES
kompute-shaders/op_scale.comp
kompute-shaders/op_scale_8.comp
kompute-shaders/op_add.comp
kompute-shaders/op_addrow.comp
kompute-shaders/op_mul.comp
kompute-shaders/op_silu.comp
kompute-shaders/op_relu.comp
kompute-shaders/op_gelu.comp
kompute-shaders/op_softmax.comp
kompute-shaders/op_norm.comp
kompute-shaders/op_rmsnorm.comp
kompute-shaders/op_diagmask.comp
kompute-shaders/op_mul_mat_mat_f32.comp
kompute-shaders/op_mul_mat_f16.comp
kompute-shaders/op_mul_mat_q8_0.comp
kompute-shaders/op_mul_mat_q4_0.comp
kompute-shaders/op_mul_mat_q4_1.comp
kompute-shaders/op_mul_mat_q6_k.comp
kompute-shaders/op_getrows_f32.comp
kompute-shaders/op_getrows_f16.comp
kompute-shaders/op_getrows_q4_0.comp
kompute-shaders/op_getrows_q4_1.comp
kompute-shaders/op_getrows_q6_k.comp
kompute-shaders/op_rope_f16.comp
kompute-shaders/op_rope_f32.comp
kompute-shaders/op_cpy_f16_f16.comp
kompute-shaders/op_cpy_f16_f32.comp
kompute-shaders/op_cpy_f32_f16.comp
kompute-shaders/op_cpy_f32_f32.comp
op_scale.comp
op_scale_8.comp
op_add.comp
op_addrow.comp
op_mul.comp
op_silu.comp
op_relu.comp
op_gelu.comp
op_softmax.comp
op_norm.comp
op_rmsnorm.comp
op_diagmask.comp
op_mul_mat_mat_f32.comp
op_mul_mat_f16.comp
op_mul_mat_q8_0.comp
op_mul_mat_q4_0.comp
op_mul_mat_q4_1.comp
op_mul_mat_q6_k.comp
op_getrows_f32.comp
op_getrows_f16.comp
op_getrows_q4_0.comp
op_getrows_q4_1.comp
op_getrows_q6_k.comp
op_rope_f16.comp
op_rope_f32.comp
op_cpy_f16_f16.comp
op_cpy_f16_f32.comp
op_cpy_f32_f16.comp
op_cpy_f32_f32.comp
)
# Create a custom target for our generated shaders
@@ -687,12 +719,12 @@ function(include_ggml SUFFIX)
)
endif()
if (LLAMA_KOMPUTE)
if (GGML_KOMPUTE)
list(APPEND GGML_COMPILE_DEFS VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1)
# Add the stamp to the main sources to ensure dependency tracking
set(GGML_SOURCES_KOMPUTE ${LLAMA_DIR}/ggml-kompute.cpp ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp)
set(GGML_HEADERS_KOMPUTE ${LLAMA_DIR}/ggml-kompute.h)
set(GGML_SOURCES_KOMPUTE ${LLAMA_DIR}/ggml/src/ggml-kompute.cpp ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp)
set(GGML_HEADERS_KOMPUTE ${LLAMA_DIR}/ggml/include/ggml-kompute.h)
list(APPEND GGML_COMPILE_DEFS_PUBLIC GGML_USE_KOMPUTE)
@@ -701,7 +733,7 @@ function(include_ggml SUFFIX)
set(CUDA_CXX_FLAGS "")
if (LLAMA_CUDA)
if (GGML_CUDA)
set(CUDA_FLAGS -use_fast_math)
if (LLAMA_FATAL_WARNINGS)
@@ -748,25 +780,25 @@ function(include_ggml SUFFIX)
endif()
endif()
if (LLAMA_METAL)
if (GGML_METAL)
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
find_library(METAL_FRAMEWORK Metal REQUIRED)
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
message(STATUS "Metal framework found")
set(GGML_HEADERS_METAL ${DIRECTORY}/ggml-metal.h)
set(GGML_SOURCES_METAL ${DIRECTORY}/ggml-metal.m)
set(GGML_HEADERS_METAL ${DIRECTORY}/ggml/include/ggml-metal.h)
set(GGML_SOURCES_METAL ${DIRECTORY}/ggml/src/ggml-metal.m)
list(APPEND GGML_COMPILE_DEFS_PUBLIC GGML_USE_METAL)
if (LLAMA_METAL_NDEBUG)
if (GGML_METAL_NDEBUG)
list(APPEND GGML_COMPILE_DEFS GGML_METAL_NDEBUG)
endif()
# copy ggml-common.h and ggml-metal.metal to bin directory
configure_file(${DIRECTORY}/ggml-common.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h COPYONLY)
configure_file(${DIRECTORY}/ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
configure_file(${DIRECTORY}/ggml/src/ggml-common.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h COPYONLY)
configure_file(${DIRECTORY}/ggml/src/ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
if (LLAMA_METAL_SHADER_DEBUG)
if (GGML_METAL_SHADER_DEBUG)
# custom command to do the following:
# xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
# xcrun -sdk macosx metallib ggml-metal.air -o default.metallib
@@ -782,13 +814,13 @@ function(include_ggml SUFFIX)
endif()
# Append macOS metal versioning flags
if (LLAMA_METAL_MACOSX_VERSION_MIN)
message(STATUS "Adding -mmacosx-version-min=${LLAMA_METAL_MACOSX_VERSION_MIN} flag to metal compilation")
list(APPEND XC_FLAGS -mmacosx-version-min=${LLAMA_METAL_MACOSX_VERSION_MIN})
if (GGML_METAL_MACOSX_VERSION_MIN)
message(STATUS "Adding -mmacosx-version-min=${GGML_METAL_MACOSX_VERSION_MIN} flag to metal compilation")
list(APPEND XC_FLAGS -mmacosx-version-min=${GGML_METAL_MACOSX_VERSION_MIN})
endif()
if (LLAMA_METAL_STD)
message(STATUS "Adding -std=${LLAMA_METAL_STD} flag to metal compilation")
list(APPEND XC_FLAGS -std=${LLAMA_METAL_STD})
if (GGML_METAL_STD)
message(STATUS "Adding -std=${GGML_METAL_STD} flag to metal compilation")
list(APPEND XC_FLAGS -std=${GGML_METAL_STD})
endif()
set(GGML_METALLIB ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib)
@@ -799,7 +831,7 @@ function(include_ggml SUFFIX)
COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air
COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h
COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal
DEPENDS ${DIRECTORY}/ggml-metal.metal ${DIRECTORY}/ggml-common.h
DEPENDS ${DIRECTORY}/ggml/src/ggml-metal.metal ${DIRECTORY}/ggml/src/ggml-common.h
COMMENT "Compiling Metal kernels"
)
set_source_files_properties(${GGML_METALLIB} DIRECTORY ${CMAKE_SOURCE_DIR} PROPERTIES GENERATED ON)
@@ -853,49 +885,49 @@ function(include_ggml SUFFIX)
CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$"))
message(STATUS "x86 detected")
if (MSVC)
if (LLAMA_AVX512)
if (GGML_AVX512)
list(APPEND ARCH_FLAGS /arch:AVX512)
# MSVC has no compile-time flags enabling specific
# AVX512 extensions, neither it defines the
# macros corresponding to the extensions.
# Do it manually.
if (LLAMA_AVX512_VBMI)
if (GGML_AVX512_VBMI)
list(APPEND GGML_COMPILE_DEFS $<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
list(APPEND GGML_COMPILE_DEFS $<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
endif()
if (LLAMA_AVX512_VNNI)
if (GGML_AVX512_VNNI)
list(APPEND GGML_COMPILE_DEFS $<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
list(APPEND GGML_COMPILE_DEFS $<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
endif()
elseif (LLAMA_AVX2)
elseif (GGML_AVX2)
list(APPEND ARCH_FLAGS /arch:AVX2)
elseif (LLAMA_AVX)
elseif (GGML_AVX)
list(APPEND ARCH_FLAGS /arch:AVX)
endif()
else()
if (LLAMA_NATIVE)
if (GGML_NATIVE)
list(APPEND ARCH_FLAGS -march=native)
endif()
if (LLAMA_F16C)
if (GGML_F16C)
list(APPEND ARCH_FLAGS -mf16c)
endif()
if (LLAMA_FMA)
if (GGML_FMA)
list(APPEND ARCH_FLAGS -mfma)
endif()
if (LLAMA_AVX)
if (GGML_AVX)
list(APPEND ARCH_FLAGS -mavx)
endif()
if (LLAMA_AVX2)
if (GGML_AVX2)
list(APPEND ARCH_FLAGS -mavx2)
endif()
if (LLAMA_AVX512)
if (GGML_AVX512)
list(APPEND ARCH_FLAGS -mavx512f)
list(APPEND ARCH_FLAGS -mavx512bw)
endif()
if (LLAMA_AVX512_VBMI)
if (GGML_AVX512_VBMI)
list(APPEND ARCH_FLAGS -mavx512vbmi)
endif()
if (LLAMA_AVX512_VNNI)
if (GGML_AVX512_VNNI)
list(APPEND ARCH_FLAGS -mavx512vnni)
endif()
endif()
@@ -914,7 +946,7 @@ function(include_ggml SUFFIX)
list(APPEND GGML_COMPILE_OPTS "$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
list(APPEND GGML_COMPILE_OPTS "$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")
if (LLAMA_CUDA)
if (GGML_CUDA)
list(APPEND CUDA_CXX_FLAGS ${ARCH_FLAGS})
list(JOIN CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED) # pass host compiler flags as a single argument
if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "")
@@ -926,24 +958,26 @@ function(include_ggml SUFFIX)
# ggml
add_library(ggml${SUFFIX} OBJECT
${DIRECTORY}/ggml.c
${DIRECTORY}/ggml.h
${DIRECTORY}/ggml-alloc.c
${DIRECTORY}/ggml-alloc.h
${DIRECTORY}/ggml-backend.c
${DIRECTORY}/ggml-backend.h
${DIRECTORY}/ggml-quants.c
${DIRECTORY}/ggml-quants.h
${DIRECTORY}/ggml/include/ggml.h
${DIRECTORY}/ggml/include/ggml-alloc.h
${DIRECTORY}/ggml/include/ggml-backend.h
${DIRECTORY}/ggml/src/ggml.c
${DIRECTORY}/ggml/src/ggml-alloc.c
${DIRECTORY}/ggml/src/ggml-backend.c
${DIRECTORY}/ggml/src/ggml-quants.c
${DIRECTORY}/ggml/src/ggml-quants.h
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE}
${GGML_SOURCES_VULKAN} ${GGML_HEADERS_VULKAN}
${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM}
${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
${DIRECTORY}/ggml/src/ggml-aarch64.c
${DIRECTORY}/ggml/src/ggml-aarch64.h
)
target_include_directories(ggml${SUFFIX} PUBLIC ${DIRECTORY} ${LLAMA_EXTRA_INCLUDES})
target_include_directories(ggml${SUFFIX} PUBLIC ${DIRECTORY}/ggml/include ${LLAMA_EXTRA_INCLUDES})
target_include_directories(ggml${SUFFIX} PRIVATE ${DIRECTORY}/ggml/src)
target_compile_features(ggml${SUFFIX} PUBLIC c_std_11) # don't bump
target_link_libraries(ggml${SUFFIX} PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
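
The new include paths reflect upstream's source reorganization: public headers now live under ggml/include (PUBLIC), while implementation headers under ggml/src stay PRIVATE. A consumer-side sketch with a hypothetical target name and an illustrative "-vulkan" suffix:

# Illustrative consumer (hypothetical): linking the object library propagates
# ggml/include, so <ggml.h> resolves, while headers under ggml/src remain
# private to the ggml build itself.
add_executable(ggml_smoke smoke.c)
target_link_libraries(ggml_smoke PRIVATE ggml-vulkan)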
@@ -955,14 +989,15 @@ function(include_ggml SUFFIX)
# llama
add_library(llama${SUFFIX} STATIC
${DIRECTORY}/llama.cpp
${DIRECTORY}/llama.h
${DIRECTORY}/unicode.h
${DIRECTORY}/unicode.cpp
${DIRECTORY}/unicode-data.cpp
${DIRECTORY}/include/llama.h
${DIRECTORY}/src/llama.cpp
${DIRECTORY}/src/unicode.h
${DIRECTORY}/src/unicode.cpp
${DIRECTORY}/src/unicode-data.cpp
)
target_include_directories(llama${SUFFIX} PUBLIC ${DIRECTORY})
target_include_directories(llama${SUFFIX} PUBLIC ${DIRECTORY}/include ${DIRECTORY}/ggml/include)
target_include_directories(llama${SUFFIX} PRIVATE ${DIRECTORY}/src)
target_compile_features (llama${SUFFIX} PUBLIC cxx_std_11) # don't bump
target_link_libraries(llama${SUFFIX} PRIVATE


@@ -30,9 +30,9 @@
#ifdef GGML_USE_KOMPUTE
# include <ggml-kompute.h>
#elif GGML_USE_VULKAN
#elif defined(GGML_USE_VULKAN)
# include <ggml-vulkan.h>
#elif GGML_USE_CUDA
#elif defined(GGML_USE_CUDA)
# include <ggml-cuda.h>
#endif
@@ -51,14 +51,14 @@ static const std::vector<const char *> KNOWN_ARCHES {
// "grok", -- 314B parameters
"gpt2",
// "gptj", -- no inference code
// "gptneox", -- no inference code
"gptneox",
"mpt",
"baichuan",
"starcoder",
// "persimmon", -- CUDA generates garbage
"refact",
"bert",
"nomic-bert",
// "jina-bert-v2", -- Assertion `i01 >= 0 && i01 < ne01' failed.
"bloom",
"stablelm",
"qwen",
@@ -72,12 +72,20 @@ static const std::vector<const char *> KNOWN_ARCHES {
"internlm2",
// "minicpm", -- CUDA generates garbage
"gemma",
"gemma2",
"starcoder2",
// "mamba", -- CUDA missing SSM_CONV
"xverse",
"command-r",
// "dbrx", -- 16x12B parameters
"olmo",
"openelm",
// "arctic", -- 10B+128x3.66B parameters
// "deepseek2", -- excessive VRAM requirements
"chatglm",
// "bitnet", -- tensor not within file bounds?
// "t5", -- seq2seq model
"jais",
};
static const std::vector<const char *> EMBEDDING_ARCHES {
@@ -103,6 +111,16 @@ static void llama_log_callback(enum ggml_log_level level, const char *text, void
}
}
#ifdef GGML_USE_CUDA
static void cuda_log_callback(enum ggml_log_level level, const char *text, void *userdata)
{
(void)userdata;
if (llama_verbose() || level <= GGML_LOG_LEVEL_WARN) {
fputs(text, stderr);
}
}
#endif
struct gpt_params {
int32_t seed = -1; // RNG seed
int32_t n_keep = 0; // number of tokens to keep from initial prompt
@@ -515,9 +533,8 @@ std::vector<LLModel::Token> LLamaModel::tokenize(PromptContext &ctx, const std::
{
const bool wantBOS = ctx.n_past == 0 && ctx.tokens.empty();
const bool useBOS = wantBOS && shouldAddBOS();
auto strCat = wantBOS && !special ? " " + str : str; // insert leading space ourselves, llama.cpp fork doesn't anymore
std::vector<LLModel::Token> fres(strCat.size()+4);
auto fres_len = llama_tokenize(d_ptr->model, strCat.c_str(), strCat.length(), fres.data(), fres.size(), useBOS, special);
std::vector<LLModel::Token> fres(str.length() + 4);
auto fres_len = llama_tokenize(d_ptr->model, str.c_str(), str.length(), fres.data(), fres.size(), useBOS, special);
fres.resize(fres_len);
return fres;
}
@@ -525,10 +542,10 @@ std::vector<LLModel::Token> LLamaModel::tokenize(PromptContext &ctx, const std::
std::string LLamaModel::tokenToString(Token id) const
{
std::vector<char> result(8, 0);
const int n_tokens = llama_token_to_piece(d_ptr->model, id, result.data(), result.size(), false);
const int n_tokens = llama_token_to_piece(d_ptr->model, id, result.data(), result.size(), 0, false);
if (n_tokens < 0) {
result.resize(-n_tokens);
int check = llama_token_to_piece(d_ptr->model, id, result.data(), result.size(), false);
int check = llama_token_to_piece(d_ptr->model, id, result.data(), result.size(), 0, false);
GGML_ASSERT(check == -n_tokens);
}
else {
@@ -1170,6 +1187,9 @@ DLL_EXPORT bool is_arch_supported(const char *arch)
DLL_EXPORT LLModel *construct()
{
llama_log_set(llama_log_callback, nullptr);
#ifdef GGML_USE_CUDA
ggml_backend_cuda_log_set_callback(cuda_log_callback, nullptr);
#endif
return new LLamaModel;
}
}