2023-05-15 19:19:55 -04:00
|
|
|
import re
|
2023-06-19 20:31:19 -04:00
|
|
|
from functools import partial
|
2023-08-26 10:15:11 -04:00
|
|
|
from pathlib import Path
|
|
|
|
from typing import Union
|
2023-05-15 19:19:55 -04:00
|
|
|
|
2023-07-19 22:31:19 -04:00
|
|
|
import torch
|
2023-03-19 02:42:10 -04:00
|
|
|
|
2023-08-25 09:53:37 -04:00
|
|
|
from modules import RoPE, shared
|
2023-03-31 13:27:01 -04:00
|
|
|
from modules.callbacks import Iteratorize
|
2023-05-21 21:42:34 -04:00
|
|
|
from modules.logging_colors import logger
|
2023-08-03 19:01:15 -04:00
|
|
|
from modules.text_generation import get_max_prompt_length
|
2023-08-26 10:15:11 -04:00
|
|
|
from modules.utils import is_gguf
|
2023-03-31 13:27:01 -04:00
|
|
|
|
2023-08-03 10:00:36 -04:00
|
|
|
import llama_cpp
|
|
|
|
|
2023-08-26 10:15:11 -04:00
|
|
|
# Optional alternative bindings. `llama_cpp_lib` below selects the `*_ggml`
# modules for model files that are not GGUF, so when they are missing we fall
# back to the regular modules instead of failing at import time.
#
# `except Exception` (rather than a bare `except`) keeps KeyboardInterrupt and
# SystemExit propagating. It is intentionally not narrowed to ImportError:
# presumably a broken native build can fail with other error types while
# loading its shared library -- TODO confirm before narrowing further.
try:
    import llama_cpp_ggml
except Exception:
    llama_cpp_ggml = llama_cpp

# The CUDA builds are only attempted when torch reports a CUDA device and the
# torch build is not a HIP one.
if torch.cuda.is_available() and not torch.version.hip:
    try:
        import llama_cpp_cuda
    except Exception:
        llama_cpp_cuda = None

    try:
        import llama_cpp_ggml_cuda
    except Exception:
        # May itself be None when the CUDA build is unavailable.
        llama_cpp_ggml_cuda = llama_cpp_cuda
else:
    llama_cpp_cuda = None
    llama_cpp_ggml_cuda = None
|
2023-08-03 10:00:36 -04:00
|
|
|
|
|
|
|
|
2023-08-26 10:15:11 -04:00
|
|
|
def llama_cpp_lib(model_file: Union[str, Path] = None):
    """Return the llama.cpp binding module to use for ``model_file``.

    The choice has two axes:

    * file format -- ``is_gguf(model_file)`` decides between the regular and
      the ``*_ggml`` modules; when no file is given, GGUF is assumed;
    * device -- the CPU modules are used when ``shared.args.cpu`` is set or
      when ``llama_cpp_cuda`` is ``None``, otherwise the CUDA modules.

    Args:
        model_file: Path of the model file, or ``None``.

    Returns:
        One of ``llama_cpp``, ``llama_cpp_ggml``, ``llama_cpp_cuda`` or
        ``llama_cpp_ggml_cuda``.
    """
    wants_gguf = True if model_file is None else is_gguf(model_file)
    cpu_only = shared.args.cpu or llama_cpp_cuda is None

    if wants_gguf:
        return llama_cpp if cpu_only else llama_cpp_cuda

    return llama_cpp_ggml if cpu_only else llama_cpp_ggml_cuda
|
2023-07-19 22:31:19 -04:00
|
|
|
|
2023-03-19 02:42:10 -04:00
|
|
|
|
2023-06-19 20:31:19 -04:00
|
|
|
def ban_eos_logits_processor(eos_token, input_ids, logits):
    """Logits processor that sets the score of ``eos_token`` to -inf.

    Args:
        eos_token: Index of the entry in ``logits`` to suppress.
        input_ids: Unused; present so the function matches the processor
            call signature once ``eos_token`` is bound with ``partial``.
        logits: Indexable, mutable score container. It is modified in place.

    Returns:
        The same ``logits`` object that was passed in.
    """
    negative_infinity = float('-inf')
    logits[eos_token] = negative_infinity
    return logits
|
|
|
|
|
|
|
|
|
2023-03-19 02:42:10 -04:00
|
|
|
class LlamaCppModel:
    """Thin wrapper around a llama.cpp ``Llama`` object.

    The same instance is used as both the model and the tokenizer (see
    ``from_pretrained``), so it exposes ``encode``/``decode`` next to the
    generation methods.
    """

    def __init__(self):
        self.initialized = False

    def __del__(self):
        # `model` is only assigned by `from_pretrained`. If loading failed, or
        # the instance was created directly, there is nothing to release and
        # touching `self.model` would raise AttributeError inside __del__.
        model = getattr(self, 'model', None)
        if model is not None:
            model.__del__()

    @staticmethod
    def _parse_cache_capacity(capacity):
        """Convert a cache-capacity setting to a number of bytes.

        Args:
            capacity: ``None``, a plain integer string (bytes), or a string
                containing the unit ``GiB`` or ``MiB`` (e.g. ``"2GiB"``).

        Returns:
            The capacity in bytes; ``0`` when ``capacity`` is ``None``.

        Raises:
            ValueError: If the numeric part is not a valid integer.
        """
        if capacity is None:
            return 0

        capacity = str(capacity)
        # GiB / MiB are binary units: 1024**3 and 1024**2 bytes respectively.
        for unit, multiplier in (('GiB', 1024 ** 3), ('MiB', 1024 ** 2)):
            if unit in capacity:
                return int(re.sub('[a-zA-Z]', '', capacity)) * multiplier

        return int(capacity)

    @staticmethod
    def _parse_tensor_split(tensor_split):
        """Turn a comma-separated string into a list of floats.

        Returns ``None`` when ``tensor_split`` is ``None`` or blank.
        """
        if tensor_split is None or tensor_split.strip() == '':
            return None

        return [float(x) for x in tensor_split.strip().split(",")]

    @classmethod
    def from_pretrained(cls, path):
        """Load the model file at ``path`` using the settings in ``shared.args``.

        Args:
            path: Location of the model file. It is passed to
                ``llama_cpp_lib`` to choose the binding module and, as a
                string, to ``Llama`` as ``model_path``.

        Returns:
            A ``(model, tokenizer)`` tuple in which both items are the same
            ``LlamaCppModel`` instance.
        """
        # Resolve the binding module once; both classes come from it.
        lib = llama_cpp_lib(path)
        Llama = lib.Llama
        LlamaCache = lib.LlamaCache

        result = cls()

        cache_capacity = cls._parse_cache_capacity(shared.args.cache_capacity)
        logger.info(f"Cache capacity is {cache_capacity} bytes")

        params = {
            'model_path': str(path),
            'n_ctx': shared.args.n_ctx,
            'seed': int(shared.args.llama_cpp_seed),
            'n_threads': shared.args.threads or None,
            'n_batch': shared.args.n_batch,
            'use_mmap': not shared.args.no_mmap,
            'use_mlock': shared.args.mlock,
            'mul_mat_q': shared.args.mul_mat_q,
            'low_vram': shared.args.low_vram,
            'n_gpu_layers': shared.args.n_gpu_layers,
            'rope_freq_base': RoPE.get_rope_freq_base(shared.args.alpha_value, shared.args.rope_freq_base),
            'tensor_split': cls._parse_tensor_split(shared.args.tensor_split),
            'rope_freq_scale': 1.0 / shared.args.compress_pos_emb,
        }

        # These two options are only passed for model files that are not GGUF.
        if not is_gguf(path):
            params.update({
                'n_gqa': shared.args.n_gqa or None,
                'rms_norm_eps': shared.args.rms_norm_eps or None,
            })

        result.model = Llama(**params)
        if cache_capacity > 0:
            result.model.set_cache(LlamaCache(capacity_bytes=cache_capacity))

        # This is ugly, but the model and the tokenizer are the same object in this library.
        return result, result

    def encode(self, string):
        """Tokenize ``string``; a ``str`` is encoded to bytes first."""
        if isinstance(string, str):
            string = string.encode()

        return self.model.tokenize(string)

    def decode(self, tokens):
        """Return whatever ``self.model.detokenize`` yields for ``tokens``."""
        return self.model.detokenize(tokens)

    def generate(self, prompt, state, callback=None):
        """Generate a completion for ``prompt`` and return the full text.

        Args:
            prompt: Prompt as ``str``, or an object with ``.decode()``
                (e.g. ``bytes``).
            state: Mapping of generation settings. The keys read here are
                ``max_new_tokens``, ``temperature``, ``top_p``, ``top_k``,
                ``repetition_penalty``, ``tfs``, ``mirostat_mode``,
                ``mirostat_tau``, ``mirostat_eta`` and ``ban_eos_token``; it
                is also passed to ``get_max_prompt_length``.
            callback: Optional callable invoked with each new piece of text.

        Returns:
            The concatenation of all generated text pieces.
        """
        # NOTE(review): called without a model path, so `llama_cpp_lib`
        # assumes GGUF here regardless of which module loaded the model.
        LogitsProcessorList = llama_cpp_lib().LogitsProcessorList

        prompt = prompt if isinstance(prompt, str) else prompt.decode()

        # Handle truncation: keep only the last tokens of the prompt, as
        # limited by get_max_prompt_length(state), then turn them back into text.
        prompt = self.encode(prompt)
        prompt = prompt[-get_max_prompt_length(state):]
        prompt = self.decode(prompt).decode('utf-8')

        if state['ban_eos_token']:
            logits_processor = LogitsProcessorList([
                partial(ban_eos_logits_processor, self.model.token_eos()),
            ])
        else:
            logits_processor = None

        completion_chunks = self.model.create_completion(
            prompt=prompt,
            max_tokens=state['max_new_tokens'],
            temperature=state['temperature'],
            top_p=state['top_p'],
            top_k=state['top_k'],
            repeat_penalty=state['repetition_penalty'],
            tfs_z=state['tfs'],
            mirostat_mode=int(state['mirostat_mode']),
            mirostat_tau=state['mirostat_tau'],
            mirostat_eta=state['mirostat_eta'],
            stream=True,
            logits_processor=logits_processor,
        )

        pieces = []
        for completion_chunk in completion_chunks:
            # Stop consuming the stream as soon as a global stop is requested.
            if shared.stop_everything:
                break

            text = completion_chunk['choices'][0]['text']
            pieces.append(text)
            if callback:
                callback(text)

        return "".join(pieces)

    def generate_with_streaming(self, *args, **kwargs):
        """Yield the cumulative reply each time ``generate`` emits new text.

        Arguments are forwarded to ``generate`` through ``Iteratorize``.
        """
        with Iteratorize(self.generate, args, kwargs, callback=None) as generator:
            reply = ''
            for token in generator:
                reply += token
                yield reply
|