Mirror of https://github.com/oobabooga/text-generation-webui.git
Synced 2024-10-01 01:26:03 -04:00

Don't require llama.cpp models to be placed in subfolders

This commit is contained in:
parent 06b6ff6c2e
commit fcb594b90e
@@ -1,23 +1,12 @@
 ## Using llama.cpp in the web UI
 
-1. Re-install the requirements.txt:
-
-```
-pip install -r requirements.txt -U
-```
-
-2. Follow the instructions in the llama.cpp README to generate the `ggml-model-q4_0.bin` file: https://github.com/ggerganov/llama.cpp#usage
-
-3. Create a folder inside `models/` for your model and put `ggml-model-q4_0.bin` in it. For instance, `models/llamacpp-7b/ggml-model-q4_0.bin`.
-
-4. Start the web UI normally:
-
-```
-python server.py --model llamacpp-7b
-```
-
-* This procedure should work for any `ggml*.bin` file. Just put it in a folder, and use the name of this folder as the argument after `--model` or as the model loaded inside the interface.
-
-* You can change the number of threads with `--threads N`.
+#### Pre-converted models
+
+Simply place the model in the `models` folder, making sure that its name contains `ggml` somewhere and ends in `.bin`.
+
+#### Convert LLaMA yourself
+
+Follow the instructions in the llama.cpp README to generate the `ggml-model-q4_0.bin` file: https://github.com/ggerganov/llama.cpp#usage
+
 ## Performance
 
@@ -25,11 +14,4 @@ This was the performance of llama-7b int4 on my i5-12400F:
 
 > Output generated in 33.07 seconds (6.05 tokens/s, 200 tokens, context 17)
 
-## Limitations
-
-~* The parameter sliders in the interface (temperature, top_p, top_k, etc) are completely ignored. So only the default parameters in llama.cpp can be used.~
-
-~* Only 512 tokens of context can be used.~
-
-~Both of these should be improved soon when llamacpp-python receives an update.~
+You can change the number of threads with `--threads N`.
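The rewritten doc reduces llama.cpp setup to a naming convention: the weights only need to sit under `models/` with `ggml` in the file name and a `.bin` extension. Below is a minimal sketch of that convention, assuming the same `'.*ggml.*\.bin'` pattern that `find_model_type()` applies in the code changes further down; the helper name here is made up for illustration.

```python
import re

def looks_like_llamacpp_weights(name: str) -> bool:
    # Hypothetical helper mirroring the '.*ggml.*\.bin' check from
    # find_model_type() below: 'ggml' somewhere in the name, followed
    # by '.bin', with no subfolder required.
    return re.match(r'.*ggml.*\.bin', name.lower()) is not None

print(looks_like_llamacpp_weights('ggml-model-q4_0.bin'))  # True
print(looks_like_llamacpp_weights('llama-7b-hf'))          # False
```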
@@ -38,13 +38,30 @@ if shared.args.deepspeed:
     dschf = HfDeepSpeedConfig(ds_config)  # Keep this object alive for the Transformers integration
 
 
+def find_model_type(model_name):
+    model_name = model_name.lower()
+    if 'rwkv-' in model_name.lower():
+        return 'rwkv'
+    elif len(list(Path(f'{shared.args.model_dir}/{model_name}').glob('*ggml*.bin'))) > 0:
+        return 'llamacpp'
+    elif re.match('.*ggml.*\.bin', model_name):
+        return 'llamacpp'
+    elif 'chatglm' in model_name:
+        return 'chatglm'
+    elif 'galactica' in model_name:
+        return 'galactica'
+    elif any((k in model_name for k in ['gpt4chan', 'gpt-4chan'])):
+        return 'gpt4chan'
+    else:
+        return 'HF_generic'
+
+
 def load_model(model_name):
     print(f"Loading {model_name}...")
     t0 = time.time()
 
-    shared.is_RWKV = 'rwkv-' in model_name.lower()
-    shared.is_llamacpp = len(list(Path(f'{shared.args.model_dir}/{model_name}').glob('ggml*.bin'))) > 0
-    if 'chatglm' in model_name.lower():
+    shared.model_type = find_model_type(model_name)
+    if shared.model_type == 'chatglm':
         LoaderClass = AutoModel
         trust_remote_code = shared.args.trust_remote_code
     else:
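The new `find_model_type()` above centralizes the model-type detection that was previously scattered across ad-hoc flags and inline name checks. A name-only sketch of its behavior follows, leaving out the on-disk `*ggml*.bin` glob that the real function also performs; the sample names are examples only.

```python
import re

def classify_model_name(model_name: str) -> str:
    # Simplified, filesystem-free rendition of find_model_type() above;
    # the real function also globs the model folder for *ggml*.bin files.
    model_name = model_name.lower()
    if 'rwkv-' in model_name:
        return 'rwkv'
    elif re.match(r'.*ggml.*\.bin', model_name):
        return 'llamacpp'
    elif 'chatglm' in model_name:
        return 'chatglm'
    elif 'galactica' in model_name:
        return 'galactica'
    elif any(k in model_name for k in ['gpt4chan', 'gpt-4chan']):
        return 'gpt4chan'
    else:
        return 'HF_generic'

for name in ['rwkv-4-pile-169m', 'ggml-model-q4_0.bin', 'chatglm-6b', 'opt-1.3b']:
    print(f'{name} -> {classify_model_name(name)}')
# rwkv-4-pile-169m    -> rwkv
# ggml-model-q4_0.bin -> llamacpp
# chatglm-6b          -> chatglm
# opt-1.3b            -> HF_generic
```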
@@ -52,7 +69,7 @@ def load_model(model_name):
         trust_remote_code = False
 
     # Load the model in simple 16-bit mode by default
-    if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.wbits, shared.args.auto_devices, shared.args.disk, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.deepspeed, shared.args.flexgen, shared.is_RWKV, shared.is_llamacpp]):
+    if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.wbits, shared.args.auto_devices, shared.args.disk, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.deepspeed, shared.args.flexgen, shared.model_type in ['rwkv', 'llamacpp']]):
         model = LoaderClass.from_pretrained(Path(f"{shared.args.model_dir}/{model_name}"), low_cpu_mem_usage=True, torch_dtype=torch.bfloat16 if shared.args.bf16 else torch.float16, trust_remote_code=trust_remote_code)
         if torch.has_mps:
             device = torch.device('mps')
@@ -91,7 +108,7 @@ def load_model(model_name):
         print(f"DeepSpeed ZeRO-3 is enabled: {is_deepspeed_zero3_enabled()}")
 
     # RMKV model (not on HuggingFace)
-    elif shared.is_RWKV:
+    elif shared.model_type == 'rwkv':
         from modules.RWKV import RWKVModel, RWKVTokenizer
 
         model = RWKVModel.from_pretrained(Path(f'{shared.args.model_dir}/{model_name}'), dtype="fp32" if shared.args.cpu else "bf16" if shared.args.bf16 else "fp16", device="cpu" if shared.args.cpu else "cuda")
@@ -100,12 +117,16 @@ def load_model(model_name):
         return model, tokenizer
 
     # llamacpp model
-    elif shared.is_llamacpp:
+    elif shared.model_type == 'llamacpp':
         from modules.llamacpp_model_alternative import LlamaCppModel
 
-        model_file = list(Path(f'{shared.args.model_dir}/{model_name}').glob('ggml*.bin'))[0]
-        print(f"llama.cpp weights detected: {model_file}\n")
+        path = Path(f'{shared.args.model_dir}/{model_name}')
+        if path.is_file():
+            model_file = path
+        else:
+            model_file = list(Path(f'{shared.args.model_dir}/{model_name}').glob('*ggml*.bin'))[0]
+
+        print(f"llama.cpp weights detected: {model_file}\n")
         model, tokenizer = LlamaCppModel.from_pretrained(model_file)
         return model, tokenizer
 
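The hunk above is where the commit title lands: if the value passed as the model name resolves to a file, it is used directly, and only when it is a folder does the loader fall back to the first `*ggml*.bin` file inside it. A standalone sketch of that resolution step (the function name and sample paths are placeholders):

```python
from pathlib import Path

def resolve_llamacpp_weights(model_dir: str, model_name: str) -> Path:
    # Mirrors the path.is_file() branch added above: a ggml .bin file
    # placed directly in the models folder is used as-is, while a folder
    # name falls back to the first *ggml*.bin file found inside it.
    path = Path(f'{model_dir}/{model_name}')
    if path.is_file():
        return path
    return list(path.glob('*ggml*.bin'))[0]

# Both layouts are now accepted (paths illustrative):
#   models/ggml-model-q4_0.bin             -> resolve_llamacpp_weights('models', 'ggml-model-q4_0.bin')
#   models/llamacpp-7b/ggml-model-q4_0.bin -> resolve_llamacpp_weights('models', 'llamacpp-7b')
```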
@@ -190,7 +211,7 @@ def load_model(model_name):
         llama_attn_hijack.hijack_llama_attention()
 
     # Loading the tokenizer
-    if any((k in model_name.lower() for k in ['gpt4chan', 'gpt-4chan'])) and Path(f"{shared.args.model_dir}/gpt-j-6B/").exists():
+    if shared.model_type == 'gpt4chan' and Path(f"{shared.args.model_dir}/gpt-j-6B/").exists():
         tokenizer = AutoTokenizer.from_pretrained(Path(f"{shared.args.model_dir}/gpt-j-6B/"))
     elif type(model) is transformers.LlamaForCausalLM:
         tokenizer = None
@@ -6,11 +6,10 @@ import yaml
 model = None
 tokenizer = None
 model_name = "None"
+model_type = None
 lora_names = []
 soft_prompt_tensor = None
 soft_prompt = False
-is_RWKV = False
-is_llamacpp = False
 
 # Chat variables
 history = {'internal': [], 'visible': []}
@@ -24,7 +24,7 @@ def get_max_prompt_length(state):
 
 
 def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_length=None):
-    if any((shared.is_RWKV, shared.is_llamacpp)):
+    if shared.model_type in ['rwkv', 'llamacpp']:
         input_ids = shared.tokenizer.encode(str(prompt))
         input_ids = np.array(input_ids).reshape(1, len(input_ids))
         return input_ids
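In the 'rwkv' and 'llamacpp' branches, `encode()` skips the Hugging Face tokenizer path and returns a plain NumPy array shaped as a batch of one sequence. A tiny sketch of that reshape, with made-up token ids standing in for `shared.tokenizer.encode(str(prompt))`:

```python
import numpy as np

input_ids = [1, 15043, 3186]  # made-up ids; placeholder for the tokenizer output
# Same reshape as the branch above: a flat list becomes a (1, seq_len) batch.
input_ids = np.array(input_ids).reshape(1, len(input_ids))
print(input_ids.shape)  # (1, 3)
```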
@@ -44,7 +44,7 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
     if truncation_length is not None:
         input_ids = input_ids[:, -truncation_length:]
 
-    if any((shared.is_RWKV, shared.is_llamacpp, shared.args.cpu)):
+    if shared.model_type in ['rwkv', 'llamacpp'] or shared.args.cpu:
         return input_ids
     elif shared.args.flexgen:
         return input_ids.numpy()
@@ -97,10 +97,10 @@ def fix_galactica(s):
 
 def formatted_outputs(reply, model_name):
     if not shared.is_chat():
-        if 'galactica' in model_name.lower():
+        if shared.model_type == 'galactica':
             reply = fix_galactica(reply)
             return reply, reply, generate_basic_html(reply)
-        elif any((k in shared.model_name.lower() for k in ['gpt4chan', 'gpt-4chan'])):
+        elif shared.model_type == 'gpt4chan':
             reply = fix_gpt4chan(reply)
             return reply, 'Only applicable for GALACTICA models.', generate_4chan_html(reply)
         else:
@@ -142,7 +142,7 @@ def generate_reply(question, state, eos_token=None, stopping_strings=[]):
 
     # These models are not part of Hugging Face, so we handle them
     # separately and terminate the function call earlier
-    if any((shared.is_RWKV, shared.is_llamacpp)):
+    if shared.model_type in ['rwkv', 'llamacpp']:
         if shared.args.verbose:
             print(f'\n\n{question}\n--------------------\n')