Mirror of https://github.com/nomic-ai/gpt4all.git (synced 2024-10-01 01:06:10 -04:00)

commit bf493bb048 (parent 1b524c4617)

Mixtral crash fix and python bindings v2.2.0 (#1931)

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
@@ -713,10 +713,16 @@ bool Bert::loadModel(const std::string &modelPath, int n_ctx, int ngl)
 {
     (void)n_ctx;
     (void)ngl;
-    d_ptr->ctx = bert_load_from_file(modelPath.c_str());
-    d_ptr->n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
-    d_ptr->modelLoaded = d_ptr->ctx != nullptr;
+    d_ptr->modelLoaded = false;
+
+    auto * ctx = bert_load_from_file(modelPath.c_str());
     fflush(stdout);
+    if (!ctx)
+        return false;
+
+    d_ptr->ctx = ctx;
+    d_ptr->n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
+    d_ptr->modelLoaded = true;
     return true;
 }
 
@@ -685,18 +685,21 @@ size_t GPTJ::requiredMem(const std::string &modelPath, int n_ctx, int ngl) {
 bool GPTJ::loadModel(const std::string &modelPath, int n_ctx, int ngl) {
     (void)n_ctx;
     (void)ngl;
+    d_ptr->modelLoaded = false;
 
     std::mt19937 rng(time(NULL));
     d_ptr->rng = rng;
 
     // load the model
-    if (!gptj_model_load(modelPath, *d_ptr->model, d_ptr->vocab)) {
+    bool ok = gptj_model_load(modelPath, *d_ptr->model, d_ptr->vocab);
+    fflush(stdout);
+    if (!ok) {
         std::cerr << "GPT-J ERROR: failed to load model from " << modelPath;
         return false;
     }
 
     d_ptr->n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
     d_ptr->modelLoaded = true;
-    fflush(stdout);
     return true;
 }
 
@@ -1 +1 @@
-Subproject commit cd1b5a104b9d3e211a50b9f6c261aced3bf09834
+Subproject commit 315102f89109f1b67c8f89f12d98ab646685e333
@@ -150,6 +150,8 @@ size_t LLamaModel::requiredMem(const std::string &modelPath, int n_ctx, int ngl)
 
 bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
 {
+    d_ptr->modelLoaded = false;
+
     // clean up after previous loadModel()
     if (d_ptr->model) {
         llama_free_model(d_ptr->model);
@@ -195,6 +197,7 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
 
     d_ptr->model = llama_load_model_from_file_gpt4all(modelPath.c_str(), &d_ptr->model_params);
     if (!d_ptr->model) {
+        fflush(stdout);
         d_ptr->device = -1;
         std::cerr << "LLAMA ERROR: failed to load model from " << modelPath << std::endl;
         return false;
@@ -225,6 +228,7 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
 
     d_ptr->ctx = llama_new_context_with_model(d_ptr->model, d_ptr->ctx_params);
     if (!d_ptr->ctx) {
+        fflush(stdout);
         std::cerr << "LLAMA ERROR: failed to init context for model " << modelPath << std::endl;
         llama_free_model(d_ptr->model);
         d_ptr->model = nullptr;
@@ -240,8 +244,8 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
     }
 #endif
 
+    fflush(stdout);
     d_ptr->modelLoaded = true;
-    fflush(stderr);
     return true;
 }
 
@@ -1,2 +1 @@
 from .gpt4all import Embed4All as Embed4All, GPT4All as GPT4All
-from .pyllmodel import LLModel as LLModel
@@ -142,15 +142,6 @@ def empty_response_callback(token_id: int, response: str) -> bool:
     return True
 
 
-def _create_model(model_path: bytes) -> ctypes.c_void_p:
-    err = ctypes.c_char_p()
-    model = llmodel.llmodel_model_create2(model_path, b"auto", ctypes.byref(err))
-    if model is None:
-        s = err.value
-        raise ValueError("Unable to instantiate model: {'null' if s is None else s.decode()}")
-    return model
-
-
 # Symbol to terminate from generator
 class Sentinel(Enum):
     TERMINATING_SYMBOL = 0
@@ -161,116 +152,77 @@ class LLModel:
     Base class and universal wrapper for GPT4All language models
     built around llmodel C-API.
 
-    Attributes
+    Parameters
     ----------
-    model: llmodel_model
-        Ctype pointer to underlying model
-    model_name: str
-        Model name
+    model_path : str
+        Path to the model.
+    n_ctx : int
+        Maximum size of context window
+    ngl : int
+        Number of GPU layers to use (Vulkan)
     """
 
-    def __init__(self):
-        self.model = None
-        self.model_name = None
-        self.context = None
-        self.llmodel_lib = llmodel
+    def __init__(self, model_path: str, n_ctx: int, ngl: int):
+        self.model_path = model_path.encode()
+        self.n_ctx = n_ctx
+        self.ngl = ngl
+        self.context: LLModelPromptContext | None = None
         self.buffer = bytearray()
         self.buff_expecting_cont_bytes: int = 0
 
+        # Construct a model implementation
+        err = ctypes.c_char_p()
+        model = llmodel.llmodel_model_create2(self.model_path, b"auto", ctypes.byref(err))
+        if model is None:
+            s = err.value
+            raise ValueError("Unable to instantiate model: {'null' if s is None else s.decode()}")
+        self.model = model
+
     def __del__(self):
-        if self.model is not None:
-            self.llmodel_lib.llmodel_model_destroy(self.model)
-
-    def memory_needed(self, model_path: str, n_ctx: int, ngl: int) -> int:
-        self.model = None
-        return self._memory_needed(model_path, n_ctx, ngl)
-
-    def _memory_needed(self, model_path: str, n_ctx: int, ngl: int) -> int:
-        if self.model is None:
-            self.model = _create_model(model_path.encode())
-        return llmodel.llmodel_required_mem(self.model, model_path.encode(), n_ctx, ngl)
-
-    def list_gpu(self, model_path: str, n_ctx: int, ngl: int) -> list[LLModelGPUDevice]:
-        """
-        Lists available GPU devices that satisfy the model's memory requirements.
-
-        Parameters
-        ----------
-        model_path : str
-            Path to the model.
-        n_ctx : int
-            Maximum size of context window
-        ngl : int
-            Number of GPU layers to use (Vulkan)
-
-        Returns
-        -------
-        list
-            A list of LLModelGPUDevice structures representing available GPU devices.
-        """
-        mem_required = self._memory_needed(model_path, n_ctx, ngl)
-        return self._list_gpu(mem_required)
+        if hasattr(self, 'model'):
+            llmodel.llmodel_model_destroy(self.model)
 
     def _list_gpu(self, mem_required: int) -> list[LLModelGPUDevice]:
         num_devices = ctypes.c_int32(0)
-        devices_ptr = self.llmodel_lib.llmodel_available_gpu_devices(self.model, mem_required, ctypes.byref(num_devices))
+        devices_ptr = llmodel.llmodel_available_gpu_devices(self.model, mem_required, ctypes.byref(num_devices))
         if not devices_ptr:
             raise ValueError("Unable to retrieve available GPU devices")
         return devices_ptr[:num_devices.value]
 
-    def init_gpu(self, model_path: str, device: str, n_ctx: int, ngl: int):
-        mem_required = self._memory_needed(model_path, n_ctx, ngl)
+    def init_gpu(self, device: str):
+        mem_required = llmodel.llmodel_required_mem(self.model, self.model_path, self.n_ctx, self.ngl)
 
-        success = self.llmodel_lib.llmodel_gpu_init_gpu_device_by_string(self.model, mem_required, device.encode())
-        if not success:
-            # Retrieve all GPUs without considering memory requirements.
-            num_devices = ctypes.c_int32(0)
-            all_devices_ptr = self.llmodel_lib.llmodel_available_gpu_devices(self.model, 0, ctypes.byref(num_devices))
-            if not all_devices_ptr:
-                raise ValueError("Unable to retrieve list of all GPU devices")
-            all_gpus = [d.name.decode() for d in all_devices_ptr[:num_devices.value]]
+        if llmodel.llmodel_gpu_init_gpu_device_by_string(self.model, mem_required, device.encode()):
+            return
 
-            # Retrieve GPUs that meet the memory requirements using list_gpu
-            available_gpus = [device.name.decode() for device in self._list_gpu(mem_required)]
+        # Retrieve all GPUs without considering memory requirements.
+        num_devices = ctypes.c_int32(0)
+        all_devices_ptr = llmodel.llmodel_available_gpu_devices(self.model, 0, ctypes.byref(num_devices))
+        if not all_devices_ptr:
+            raise ValueError("Unable to retrieve list of all GPU devices")
+        all_gpus = [d.name.decode() for d in all_devices_ptr[:num_devices.value]]
 
-            # Identify GPUs that are unavailable due to insufficient memory or features
-            unavailable_gpus = set(all_gpus).difference(available_gpus)
+        # Retrieve GPUs that meet the memory requirements using list_gpu
+        available_gpus = [device.name.decode() for device in self._list_gpu(mem_required)]
 
-            # Formulate the error message
-            error_msg = "Unable to initialize model on GPU: '{}'.".format(device)
-            error_msg += "\nAvailable GPUs: {}.".format(available_gpus)
-            error_msg += "\nUnavailable GPUs due to insufficient memory or features: {}.".format(unavailable_gpus)
-            raise ValueError(error_msg)
+        # Identify GPUs that are unavailable due to insufficient memory or features
+        unavailable_gpus = set(all_gpus).difference(available_gpus)
 
-    def load_model(self, model_path: str, n_ctx: int, ngl: int) -> bool:
+        # Formulate the error message
+        error_msg = "Unable to initialize model on GPU: '{}'.".format(device)
+        error_msg += "\nAvailable GPUs: {}.".format(available_gpus)
+        error_msg += "\nUnavailable GPUs due to insufficient memory or features: {}.".format(unavailable_gpus)
+        raise ValueError(error_msg)
+
+    def load_model(self) -> bool:
         """
         Load model from a file.
 
-        Parameters
-        ----------
-        model_path : str
-            Model filepath
-        n_ctx : int
-            Maximum size of context window
-        ngl : int
-            Number of GPU layers to use (Vulkan)
-
         Returns
        -------
         True if model loaded successfully, False otherwise
         """
-        self.model = _create_model(model_path.encode())
-
-        llmodel.llmodel_loadModel(self.model, model_path.encode(), n_ctx, ngl)
-
-        filename = os.path.basename(model_path)
-        self.model_name = os.path.splitext(filename)[0]
-
-        if llmodel.llmodel_isModelLoaded(self.model):
-            return True
-        else:
-            return False
+        return llmodel.llmodel_loadModel(self.model, self.model_path, self.n_ctx, self.ngl)
 
     def set_thread_count(self, n_threads):
         if not llmodel.llmodel_isModelLoaded(self.model):
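The hunk above moves creation of the native model handle from load_model() into the LLModel constructor, so a handle exists before memory queries or GPU setup. A minimal usage sketch of the reworked low-level API, based only on the signatures shown in this diff; the module name _pyllmodel matches the import change in the gpt4all.py hunk further below, while the model path, n_ctx, ngl, and device values are illustrative assumptions and not part of the commit:

from gpt4all import _pyllmodel

# Hypothetical local GGUF file; replace with a real path.
model = _pyllmodel.LLModel("/path/to/model.gguf", n_ctx=2048, ngl=100)

# Optional GPU setup; per the diff, init_gpu() raises ValueError listing
# available and unavailable GPUs if the requested device cannot be used.
model.init_gpu("gpu")

# load_model() now takes no arguments and returns True on success.
if not model.load_model():
    raise RuntimeError("failed to load model")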
@@ -295,7 +247,7 @@ class LLModel:
         reset_context: bool = False,
     ):
         if self.context is None:
-            self.context = LLModelPromptContext(
+            context = LLModelPromptContext(
                 logits_size=0,
                 tokens_size=0,
                 n_past=0,
@@ -309,8 +261,11 @@ class LLModel:
                 repeat_last_n=repeat_last_n,
                 context_erase=context_erase,
             )
-        elif reset_context:
-            self.context.n_past = 0
+            self.context = context
+        else:
+            context = self.context
+            if reset_context:
+                self.context.n_past = 0
 
         self.context.n_predict = n_predict
         self.context.top_k = top_k
@@ -15,7 +15,7 @@ from requests.exceptions import ChunkedEncodingError
 from tqdm import tqdm
 from urllib3.exceptions import IncompleteRead, ProtocolError
 
-from . import pyllmodel
+from . import _pyllmodel
 
 # TODO: move to config
 DEFAULT_MODEL_DIRECTORY = os.path.join(str(Path.home()), ".cache", "gpt4all").replace("\\", "\\\\")
@@ -97,12 +97,12 @@ class GPT4All:
             verbose: If True, print debug messages.
         """
         self.model_type = model_type
-        self.model = pyllmodel.LLModel()
         # Retrieve model and download if allowed
         self.config: ConfigType = self.retrieve_model(model_name, model_path=model_path, allow_download=allow_download, verbose=verbose)
+        self.model = _pyllmodel.LLModel(self.config["path"], n_ctx, ngl)
         if device is not None and device != "cpu":
-            self.model.init_gpu(model_path=self.config["path"], device=device, n_ctx=n_ctx, ngl=ngl)
-        self.model.load_model(self.config["path"], n_ctx, ngl)
+            self.model.init_gpu(device)
+        self.model.load_model()
         # Set n_threads
         if n_threads is not None:
             self.model.set_thread_count(n_threads)
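For callers of the high-level GPT4All class, the rewiring above is internal; the constructor still accepts the device, n_threads, n_ctx, and ngl keyword arguments referenced in this hunk. A hedged sketch of ordinary usage, assuming the named model file can be found or downloaded locally; the file name and prompt are illustrative, not taken from this diff:

from gpt4all import GPT4All

# Illustrative model name; any GGUF model supported by gpt4all 2.2.0 works.
gpt = GPT4All("mistral-7b-instruct-v0.1.Q4_0.gguf", device="cpu", n_threads=4)
print(gpt.generate("Name three colors."))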
@@ -292,7 +292,7 @@ class GPT4All:
         n_batch: int = 8,
         n_predict: Optional[int] = None,
         streaming: bool = False,
-        callback: pyllmodel.ResponseCallbackType = pyllmodel.empty_response_callback,
+        callback: _pyllmodel.ResponseCallbackType = _pyllmodel.empty_response_callback,
     ) -> Union[str, Iterable[str]]:
         """
         Generate outputs from any GPT4All model.
@@ -350,9 +350,9 @@ class GPT4All:
             output_collector = self.current_chat_session
 
         def _callback_wrapper(
-            callback: pyllmodel.ResponseCallbackType,
+            callback: _pyllmodel.ResponseCallbackType,
             output_collector: List[MessageType],
-        ) -> pyllmodel.ResponseCallbackType:
+        ) -> _pyllmodel.ResponseCallbackType:
             def _callback(token_id: int, response: str) -> bool:
                 nonlocal callback, output_collector
 
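The two hunks above only rename the module referenced in the callback type annotations; the callback contract itself is unchanged. A small sketch of a user-supplied callback matching the (token_id, response) -> bool signature shown in _callback_wrapper, continuing the hypothetical gpt instance from the sketch above:

def print_tokens(token_id: int, response: str) -> bool:
    # Stream each response fragment as it arrives; return False to stop early.
    print(response, end="", flush=True)
    return True

gpt.generate("Why is the sky blue?", callback=print_tokens)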
@@ -61,7 +61,7 @@ copy_prebuilt_C_lib(SRC_CLIB_DIRECTORY,
 
 setup(
     name=package_name,
-    version="2.1.0",
+    version="2.2.0",
     description="Python bindings for GPT4All",
     author="Nomic and the Open Source Community",
     author_email="support@nomic.ai",