#ifndef LLMODEL_C_H
#define LLMODEL_C_H

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#ifdef __GNUC__
#define DEPRECATED __attribute__ ((deprecated))
#elif defined(_MSC_VER)
#define DEPRECATED __declspec(deprecated)
#else
#pragma message("WARNING: You need to implement DEPRECATED for this compiler")
#define DEPRECATED
#endif

#ifdef __cplusplus
extern "C" {
#endif

/**
 * Opaque pointer to the underlying model.
 */
typedef void *llmodel_model;

/**
 * llmodel_prompt_context structure for holding the prompt context.
 * NOTE: The implementation takes care of all the memory handling of the raw logits pointer and the
 * raw tokens pointer. Attempting to resize them or modify them in any way can lead to undefined
 * behavior.
 */
struct llmodel_prompt_context {
    int32_t *tokens;       // current tokens in the context window
    size_t tokens_size;    // the size of the raw tokens vector
    int32_t n_past;        // number of tokens in past conversation
    int32_t n_ctx;         // number of tokens possible in context window
    int32_t n_predict;     // number of tokens to predict
    int32_t top_k;         // top k logits to sample from
    float top_p;           // nucleus sampling probability threshold
    float min_p;           // min p sampling probability threshold
    float temp;            // temperature to adjust model's output distribution
    int32_t n_batch;       // number of predictions to generate in parallel
    float repeat_penalty;  // penalty factor for repeated tokens
    int32_t repeat_last_n; // last n tokens to penalize
    float context_erase;   // percent of context to erase if we exceed the context window
};
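
/*
 * Usage sketch (illustrative, not part of the API): zero-initialize a context
 * and fill in the sampling parameters. The values below are arbitrary
 * examples, not recommended defaults; tokens and tokens_size are managed by
 * the implementation and should be left zeroed.
 *
 *     struct llmodel_prompt_context ctx = {0};
 *     ctx.n_ctx = 2048;
 *     ctx.n_predict = 128;
 *     ctx.top_k = 40;
 *     ctx.top_p = 0.9f;
 *     ctx.min_p = 0.0f;
 *     ctx.temp = 0.7f;
 *     ctx.n_batch = 8;
 *     ctx.repeat_penalty = 1.1f;
 *     ctx.repeat_last_n = 64;
 *     ctx.context_erase = 0.5f;
 */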

struct llmodel_gpu_device {
    const char * backend;
    int index;
    int type;              // same as VkPhysicalDeviceType
    size_t heapSize;
    const char * name;
    const char * vendor;
};

#ifndef __cplusplus
typedef struct llmodel_prompt_context llmodel_prompt_context;
typedef struct llmodel_gpu_device llmodel_gpu_device;
#endif

/**
 * Callback type for prompt processing.
 * @param token_id The token id of the prompt.
 * @return a bool indicating whether the model should keep processing.
 */
typedef bool (*llmodel_prompt_callback)(int32_t token_id);

/**
 * Callback type for response.
 * @param token_id The token id of the response.
 * @param response The response string. NOTE: a token_id of -1 indicates the string is an error string.
 * @return a bool indicating whether the model should keep generating.
 */
typedef bool (*llmodel_response_callback)(int32_t token_id, const char *response);
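
/*
 * Sketch of minimal callback implementations (illustrative; the function names
 * are hypothetical). Assumes <stdio.h>.
 *
 *     static bool on_prompt_token(int32_t token_id) {
 *         (void)token_id;
 *         return true; // keep processing the prompt
 *     }
 *
 *     static bool on_response(int32_t token_id, const char *response) {
 *         if (token_id == -1) {
 *             fprintf(stderr, "error: %s\n", response);
 *             return false; // stop generating
 *         }
 *         fputs(response, stdout);
 *         return true; // keep generating
 *     }
 */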

/**
 * Embedding cancellation callback for use with llmodel_embed.
 * @param batch_sizes The number of tokens in each batch that will be embedded.
 * @param n_batch The number of batches that will be embedded.
 * @param backend The backend that will be used for embedding. One of "cpu", "kompute", "cuda", or "metal".
 * @return True to cancel llmodel_embed, false to continue.
 */
typedef bool (*llmodel_emb_cancel_callback)(unsigned *batch_sizes, unsigned n_batch, const char *backend);

/**
 * Create a llmodel instance.
 * Recognises correct model type from file at model_path.
 * DEPRECATED: use llmodel_model_create2 instead, which reports errors.
 * @param model_path A string representing the path to the model file.
 * @return A pointer to the llmodel_model instance; NULL on error.
 */
DEPRECATED llmodel_model llmodel_model_create(const char *model_path);

/**
 * Create a llmodel instance.
 * Recognises correct model type from file at model_path.
 * @param model_path A string representing the path to the model file; will only be used to detect model type.
 * @param backend A string representing the implementation to use. One of 'auto', 'cpu', 'metal', 'kompute', or 'cuda'.
 * @param error A pointer to a string; will only be set on error.
 * @return A pointer to the llmodel_model instance; NULL on error.
 */
llmodel_model llmodel_model_create2(const char *model_path, const char *backend, const char **error);
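
/*
 * Usage sketch (illustrative): create a model and handle failure. The model
 * path is hypothetical; "auto" lets the implementation pick a backend.
 *
 *     const char *err = NULL;
 *     llmodel_model model = llmodel_model_create2("/path/to/model.gguf", "auto", &err);
 *     if (model == NULL) {
 *         fprintf(stderr, "failed to create model: %s\n", err ? err : "(unknown)");
 *         return 1;
 *     }
 */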

/**
 * Destroy a llmodel instance.
 * Recognises correct model type using type info.
 * @param model A pointer to a llmodel_model instance.
 */
void llmodel_model_destroy(llmodel_model model);

/**
 * Estimate RAM requirement for a model file.
 * @param model A pointer to the llmodel_model instance.
 * @param model_path A string representing the path to the model file.
 * @param n_ctx Maximum size of context window
 * @param ngl Number of GPU layers to use (Vulkan)
 * @return size greater than 0 if the model was parsed successfully, 0 if file could not be parsed.
 */
size_t llmodel_required_mem(llmodel_model model, const char *model_path, int n_ctx, int ngl);

/**
 * Load a model from a file.
 * @param model A pointer to the llmodel_model instance.
 * @param model_path A string representing the path to the model file.
 * @param n_ctx Maximum size of context window
 * @param ngl Number of GPU layers to use (Vulkan)
 * @return true if the model was loaded successfully, false otherwise.
 */
bool llmodel_loadModel(llmodel_model model, const char *model_path, int n_ctx, int ngl);
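
/*
 * Usage sketch (illustrative, continuing the create sketch above): estimate
 * memory, then load. The n_ctx and ngl values are arbitrary examples.
 *
 *     size_t mem = llmodel_required_mem(model, model_path, 2048, 100);
 *     if (mem == 0)
 *         fprintf(stderr, "warning: model file could not be parsed\n");
 *     if (!llmodel_loadModel(model, model_path, 2048, 100)) {
 *         llmodel_model_destroy(model);
 *         return 1;
 *     }
 *     // llmodel_isModelLoaded(model) is now true
 */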

/**
 * Check if a model is loaded.
 * @param model A pointer to the llmodel_model instance.
 * @return true if the model is loaded, false otherwise.
 */
bool llmodel_isModelLoaded(llmodel_model model);

/**
 * Get the size of the internal state of the model.
 * NOTE: This state data is specific to the type of model you have created.
 * @param model A pointer to the llmodel_model instance.
 * @return the size in bytes of the internal state of the model
 */
uint64_t llmodel_get_state_size(llmodel_model model);

/**
 * Saves the internal state of the model to the specified destination address.
 * NOTE: This state data is specific to the type of model you have created.
 * @param model A pointer to the llmodel_model instance.
 * @param dest A pointer to the destination buffer; must be at least llmodel_get_state_size() bytes.
 * @return the number of bytes copied
 */
uint64_t llmodel_save_state_data(llmodel_model model, uint8_t *dest);

/**
 * Restores the internal state of the model using data from the specified address.
 * NOTE: This state data is specific to the type of model you have created.
 * @param model A pointer to the llmodel_model instance.
 * @param src A pointer to the source buffer, as written by llmodel_save_state_data().
 * @return the number of bytes read
 */
uint64_t llmodel_restore_state_data(llmodel_model model, const uint8_t *src);
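
/*
 * Usage sketch (illustrative): snapshot the state, generate, then roll back.
 * Assumes <stdlib.h> for malloc/free.
 *
 *     uint64_t state_size = llmodel_get_state_size(model);
 *     uint8_t *state = malloc(state_size);
 *     if (state != NULL) {
 *         llmodel_save_state_data(model, state);    // snapshot
 *         // ... prompt the model ...
 *         llmodel_restore_state_data(model, state); // roll back
 *         free(state);
 *     }
 */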

/**
 * Generate a response using the model.
 * @param model A pointer to the llmodel_model instance.
 * @param prompt A string representing the input prompt.
 * @param prompt_template A string representing the input prompt template.
 * @param prompt_callback A callback function for handling the processing of the prompt.
 * @param response_callback A callback function for handling the generated response.
 * @param allow_context_shift Whether to allow shifting of context to make room for more input.
 * @param ctx A pointer to the llmodel_prompt_context structure.
 * @param special True if special tokens in the prompt should be processed, false otherwise.
 * @param fake_reply A string to insert into context as the model's reply, or NULL to generate one.
 */
void llmodel_prompt(llmodel_model model, const char *prompt,
                    const char *prompt_template,
                    llmodel_prompt_callback prompt_callback,
                    llmodel_response_callback response_callback,
                    bool allow_context_shift,
                    llmodel_prompt_context *ctx,
                    bool special,
                    const char *fake_reply);
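
/*
 * Usage sketch (illustrative): generate a response with the callbacks sketched
 * above. The prompt template shown is a hypothetical example; real templates
 * are model-specific, with %1 standing in for the user prompt.
 *
 *     struct llmodel_prompt_context ctx = {0};
 *     ctx.n_ctx = 2048;
 *     ctx.n_predict = 128;
 *     ctx.temp = 0.7f;
 *     // allow_context_shift = true, special = false, fake_reply = NULL
 *     llmodel_prompt(model, "Why is the sky blue?",
 *                    "### Human:\n%1\n### Assistant:\n",
 *                    on_prompt_token, on_response,
 *                    true, &ctx, false, NULL);
 */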

/**
 * Generate an embedding using the model.
 * NOTE: If given NULL pointers for the model or texts, or an empty texts array, a NULL pointer will be
 * returned. Bindings should signal an error when NULL is the return value.
 * @param model A pointer to the llmodel_model instance.
 * @param texts A pointer to a NULL-terminated array of strings representing the texts to generate an
 * embedding for.
 * @param embedding_size A pointer to a size_t type that will be set by the call indicating the length
 * of the returned floating point array.
 * @param prefix The model-specific prefix representing the embedding task, without the trailing colon. NULL for no
 * prefix.
 * @param dimensionality The embedding dimension, for use with Matryoshka-capable models. Set to -1 for full-size.
 * @param token_count Return location for the number of prompt tokens processed, or NULL.
 * @param do_mean True to average multiple embeddings if the text is longer than the model can accept, False to
 * truncate.
 * @param atlas Try to be fully compatible with the Atlas API. Currently, this means texts longer than 8192 tokens with
 * long_text_mode="mean" will raise an error. Disabled by default.
 * @param cancel_cb Cancellation callback, or NULL. See the documentation of llmodel_emb_cancel_callback.
 * @param error Return location for a malloc()ed string that will be set on error, or NULL.
 * @return A pointer to an array of floating point values; the caller is responsible for the lifetime of
 * this memory. NULL if an error occurred.
 */
float *llmodel_embed(llmodel_model model, const char **texts, size_t *embedding_size, const char *prefix,
                     int dimensionality, size_t *token_count, bool do_mean, bool atlas,
                     llmodel_emb_cancel_callback cancel_cb, const char **error);

/**
 * Frees the memory allocated by the llmodel_embed function.
 * @param ptr A pointer to the embedding as returned from llmodel_embed.
 */
void llmodel_free_embedding(float *ptr);
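
/*
 * Usage sketch (illustrative): embed two texts and free the result. The
 * "search_document" prefix is only an example; valid prefixes are
 * model-specific.
 *
 *     const char *texts[] = { "first text", "second text", NULL };
 *     size_t embd_size = 0;
 *     const char *err = NULL;
 *     float *embd = llmodel_embed(model, texts, &embd_size, "search_document",
 *                                 -1, NULL, true, false, NULL, &err);
 *     if (embd == NULL) {
 *         fprintf(stderr, "embedding failed: %s\n", err);
 *         free((void *)err); // the error string is malloc()ed
 *     } else {
 *         // embd holds embd_size floats, one embedding per input text
 *         llmodel_free_embedding(embd);
 *     }
 */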

/**
 * Set the number of threads to be used by the model.
 * @param model A pointer to the llmodel_model instance.
 * @param n_threads The number of threads to be used.
 */
void llmodel_setThreadCount(llmodel_model model, int32_t n_threads);

/**
 * Get the number of threads currently being used by the model.
 * @param model A pointer to the llmodel_model instance.
 * @return The number of threads currently being used.
 */
int32_t llmodel_threadCount(llmodel_model model);

/**
 * Set llmodel implementation search path.
 * Default is ".".
 * @param path The path to the llmodel implementation shared objects. This can be a single path or
 * a list of paths separated by the ';' delimiter.
 */
void llmodel_set_implementation_search_path(const char *path);

/**
 * Get llmodel implementation search path.
 * @return The current search path; lifetime ends on the next call to llmodel_set_implementation_search_path().
 */
const char *llmodel_get_implementation_search_path(void);
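
/*
 * Usage sketch (illustrative): point the loader at two hypothetical
 * directories of implementation libraries, separated by ';'.
 *
 *     llmodel_set_implementation_search_path("/opt/gpt4all/lib;./build/lib");
 */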

/**
 * Get a list of available GPU devices given the memory required.
 * @param memoryRequired The minimum amount of VRAM, in bytes.
 * @param num_devices Return location for the number of devices in the returned array.
 * @return A pointer to an array of llmodel_gpu_device's whose number is given by num_devices.
 */
struct llmodel_gpu_device* llmodel_available_gpu_devices(size_t memoryRequired, int* num_devices);
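
/*
 * Usage sketch (illustrative): list devices with at least ~4 GiB of VRAM.
 *
 *     int n_devices = 0;
 *     struct llmodel_gpu_device *devices =
 *         llmodel_available_gpu_devices((size_t)4 * 1024 * 1024 * 1024, &n_devices);
 *     for (int i = 0; i < n_devices; i++)
 *         printf("%s: %s %s (%zu bytes VRAM)\n", devices[i].backend,
 *                devices[i].vendor, devices[i].name, devices[i].heapSize);
 */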

/**
 * Initializes a GPU device based on a specified string criterion.
 *
 * This function initializes a GPU device based on a string identifier provided. The function
 * allows initialization based on general device type ("gpu"), vendor name ("amd", "nvidia", "intel"),
 * or any specific device name.
 *
 * @param model A pointer to the llmodel_model instance.
 * @param memoryRequired The amount of memory (in bytes) required by the application or task
 * that will utilize the GPU device.
 * @param device A string specifying the desired criterion for GPU device selection. It can be:
 * - "gpu": To initialize the best available GPU.
 * - "amd", "nvidia", or "intel": To initialize the best available GPU from that vendor.
 * - A specific GPU device name: To initialize a GPU with that exact name.
 *
 * @return True if the GPU device is successfully initialized based on the provided string
 * criterion. Returns false if the desired GPU device could not be initialized.
 */
bool llmodel_gpu_init_gpu_device_by_string(llmodel_model model, size_t memoryRequired, const char *device);

/**
 * Initializes a GPU device by specifying a valid GPU device pointer.
 * @param model A pointer to the llmodel_model instance.
 * @param device A GPU device pointer, as returned by llmodel_available_gpu_devices().
 * @return True if the GPU device is successfully initialized, false otherwise.
 */
bool llmodel_gpu_init_gpu_device_by_struct(llmodel_model model, const llmodel_gpu_device *device);

/**
 * Initializes a GPU device by its index.
 * @param model A pointer to the llmodel_model instance.
 * @param device An integer representing the index of the GPU device to be initialized.
 * @return True if the GPU device is successfully initialized, false otherwise.
 */
bool llmodel_gpu_init_gpu_device_by_int(llmodel_model model, int device);
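
/*
 * Usage sketch (illustrative): prefer an NVIDIA GPU, fall back to any GPU,
 * else stay on CPU. mem_required would typically come from
 * llmodel_required_mem().
 *
 *     if (!llmodel_gpu_init_gpu_device_by_string(model, mem_required, "nvidia") &&
 *         !llmodel_gpu_init_gpu_device_by_string(model, mem_required, "gpu")) {
 *         fprintf(stderr, "no usable GPU; falling back to CPU\n");
 *     }
 */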

/**
 * @return The name of the llama.cpp backend currently in use. One of "cpu", "kompute", "cuda", or "metal".
 */
const char *llmodel_model_backend_name(llmodel_model model);

/**
 * @return The name of the GPU device currently in use, or NULL for backends other than Kompute.
 */
const char *llmodel_model_gpu_device_name(llmodel_model model);

#ifdef __cplusplus
}
#endif

#endif // LLMODEL_C_H