diff --git a/docs/llama.cpp-models.md b/docs/llama.cpp-models.md
index 7c1553a2..57fbf613 100644
--- a/docs/llama.cpp-models.md
+++ b/docs/llama.cpp-models.md
@@ -1,23 +1,12 @@
 ## Using llama.cpp in the web UI
 
-1. Re-install the requirements.txt:
+#### Pre-converted models
 
-```
-pip install -r requirements.txt -U
-```
+Simply place the model in the `models` folder, making sure that its name contains `ggml` somewhere and ends in `.bin`.
 
-2. Follow the instructions in the llama.cpp README to generate the `ggml-model-q4_0.bin` file: https://github.com/ggerganov/llama.cpp#usage
+#### Convert LLaMA yourself
 
-3. Create a folder inside `models/` for your model and put `ggml-model-q4_0.bin` in it. For instance, `models/llamacpp-7b/ggml-model-q4_0.bin`.
-
-4. Start the web UI normally:
-
-```
-python server.py --model llamacpp-7b
-```
-
-* This procedure should work for any `ggml*.bin` file. Just put it in a folder, and use the name of this folder as the argument after `--model` or as the model loaded inside the interface.
-* You can change the number of threads with `--threads N`.
+Follow the instructions in the llama.cpp README to generate the `ggml-model-q4_0.bin` file: https://github.com/ggerganov/llama.cpp#usage
 
 ## Performance
 
@@ -25,11 +14,4 @@ This was the performance of llama-7b int4 on my i5-12400F:
 
 > Output generated in 33.07 seconds (6.05 tokens/s, 200 tokens, context 17)
 
-## Limitations
-
-~* The parameter sliders in the interface (temperature, top_p, top_k, etc) are completely ignored. So only the default parameters in llama.cpp can be used.~
-
-~* Only 512 tokens of context can be used.~
-
-~Both of these should be improved soon when llamacpp-python receives an update.~
-
+You can change the number of threads with `--threads N`.
diff --git a/modules/models.py b/modules/models.py
index 800d0be2..ca014d79 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -38,13 +38,30 @@ if shared.args.deepspeed:
     dschf = HfDeepSpeedConfig(ds_config)  # Keep this object alive for the Transformers integration
 
 
+def find_model_type(model_name):
+    model_name = model_name.lower()
+    if 'rwkv-' in model_name.lower():
+        return 'rwkv'
+    elif len(list(Path(f'{shared.args.model_dir}/{model_name}').glob('*ggml*.bin'))) > 0:
+        return 'llamacpp'
+    elif re.match('.*ggml.*\.bin', model_name):
+        return 'llamacpp'
+    elif 'chatglm' in model_name:
+        return 'chatglm'
+    elif 'galactica' in model_name:
+        return 'galactica'
+    elif any((k in model_name for k in ['gpt4chan', 'gpt-4chan'])):
+        return 'gpt4chan'
+    else:
+        return 'HF_generic'
+
+
 def load_model(model_name):
     print(f"Loading {model_name}...")
     t0 = time.time()
 
-    shared.is_RWKV = 'rwkv-' in model_name.lower()
-    shared.is_llamacpp = len(list(Path(f'{shared.args.model_dir}/{model_name}').glob('ggml*.bin'))) > 0
-    if 'chatglm' in model_name.lower():
+    shared.model_type = find_model_type(model_name)
+    if shared.model_type == 'chatglm':
         LoaderClass = AutoModel
         trust_remote_code = shared.args.trust_remote_code
     else:
@@ -52,7 +69,7 @@
         trust_remote_code = False
 
     # Load the model in simple 16-bit mode by default
-    if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.wbits, shared.args.auto_devices, shared.args.disk, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.deepspeed, shared.args.flexgen, shared.is_RWKV, shared.is_llamacpp]):
+    if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.wbits, shared.args.auto_devices, shared.args.disk, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.deepspeed, shared.args.flexgen, shared.model_type in ['rwkv', 'llamacpp']]):
         model = LoaderClass.from_pretrained(Path(f"{shared.args.model_dir}/{model_name}"), low_cpu_mem_usage=True, torch_dtype=torch.bfloat16 if shared.args.bf16 else torch.float16, trust_remote_code=trust_remote_code)
         if torch.has_mps:
             device = torch.device('mps')
@@ -91,7 +108,7 @@
         print(f"DeepSpeed ZeRO-3 is enabled: {is_deepspeed_zero3_enabled()}")
 
     # RMKV model (not on HuggingFace)
-    elif shared.is_RWKV:
+    elif shared.model_type == 'rwkv':
         from modules.RWKV import RWKVModel, RWKVTokenizer
 
         model = RWKVModel.from_pretrained(Path(f'{shared.args.model_dir}/{model_name}'), dtype="fp32" if shared.args.cpu else "bf16" if shared.args.bf16 else "fp16", device="cpu" if shared.args.cpu else "cuda")
@@ -100,12 +117,16 @@
         return model, tokenizer
 
     # llamacpp model
-    elif shared.is_llamacpp:
+    elif shared.model_type == 'llamacpp':
        from modules.llamacpp_model_alternative import LlamaCppModel
 
-        model_file = list(Path(f'{shared.args.model_dir}/{model_name}').glob('ggml*.bin'))[0]
-        print(f"llama.cpp weights detected: {model_file}\n")
+        path = Path(f'{shared.args.model_dir}/{model_name}')
+        if path.is_file():
+            model_file = path
+        else:
+            model_file = list(Path(f'{shared.args.model_dir}/{model_name}').glob('*ggml*.bin'))[0]
+        print(f"llama.cpp weights detected: {model_file}\n")
 
         model, tokenizer = LlamaCppModel.from_pretrained(model_file)
         return model, tokenizer
 
@@ -190,7 +211,7 @@
            llama_attn_hijack.hijack_llama_attention()
 
     # Loading the tokenizer
-    if any((k in model_name.lower() for k in ['gpt4chan', 'gpt-4chan'])) and Path(f"{shared.args.model_dir}/gpt-j-6B/").exists():
+    if shared.model_type == 'gpt4chan' and Path(f"{shared.args.model_dir}/gpt-j-6B/").exists():
         tokenizer = AutoTokenizer.from_pretrained(Path(f"{shared.args.model_dir}/gpt-j-6B/"))
     elif type(model) is transformers.LlamaForCausalLM:
         tokenizer = None
diff --git a/modules/shared.py b/modules/shared.py
index 41c068db..1517526a 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -6,11 +6,10 @@ import yaml
 model = None
 tokenizer = None
 model_name = "None"
+model_type = None
 lora_names = []
 soft_prompt_tensor = None
 soft_prompt = False
-is_RWKV = False
-is_llamacpp = False
 
 # Chat variables
 history = {'internal': [], 'visible': []}
diff --git a/modules/text_generation.py b/modules/text_generation.py
index 370130ed..e1e169a0 100644
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -24,7 +24,7 @@ def get_max_prompt_length(state):
 
 
 def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_length=None):
-    if any((shared.is_RWKV, shared.is_llamacpp)):
+    if shared.model_type in ['rwkv', 'llamacpp']:
         input_ids = shared.tokenizer.encode(str(prompt))
         input_ids = np.array(input_ids).reshape(1, len(input_ids))
         return input_ids
@@ -44,7 +44,7 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
     if truncation_length is not None:
         input_ids = input_ids[:, -truncation_length:]
 
-    if any((shared.is_RWKV, shared.is_llamacpp, shared.args.cpu)):
+    if shared.model_type in ['rwkv', 'llamacpp'] or shared.args.cpu:
         return input_ids
     elif shared.args.flexgen:
         return input_ids.numpy()
@@ -97,10 +97,10 @@ def fix_galactica(s):
 
 def formatted_outputs(reply, model_name):
     if not shared.is_chat():
-        if 'galactica' in model_name.lower():
+        if shared.model_type == 'galactica':
             reply = fix_galactica(reply)
             return reply, reply, generate_basic_html(reply)
-        elif any((k in shared.model_name.lower() for k in ['gpt4chan', 'gpt-4chan'])):
+        elif shared.model_type == 'gpt4chan':
             reply = fix_gpt4chan(reply)
             return reply, 'Only applicable for GALACTICA models.', generate_4chan_html(reply)
         else:
@@ -142,7 +142,7 @@ def generate_reply(question, state, eos_token=None, stopping_strings=[]):
 
     # These models are not part of Hugging Face, so we handle them
     # separately and terminate the function call earlier
-    if any((shared.is_RWKV, shared.is_llamacpp)):
+    if shared.model_type in ['rwkv', 'llamacpp']:
        if shared.args.verbose:
            print(f'\n\n{question}\n--------------------\n')
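
For reviewers, here is a standalone sketch of the name-based detection that the new `find_model_type()` performs, with its branches reproduced outside of `modules.shared`. The `model_dir` parameter, the `detect_model_type` name, and the example model names below are illustrative assumptions for this sketch only; the patch itself reads `shared.args.model_dir` and stores the result in `shared.model_type`.

```python
import re
from pathlib import Path


def detect_model_type(model_name, model_dir='models'):
    # Mirrors the detection order used by find_model_type() in the patch:
    # rwkv -> llamacpp (folder glob, then filename regex) -> chatglm ->
    # galactica -> gpt4chan -> generic Hugging Face model.
    name = model_name.lower()
    if 'rwkv-' in name:
        return 'rwkv'
    # A model folder containing any *ggml*.bin file is treated as llama.cpp ...
    elif len(list(Path(f'{model_dir}/{name}').glob('*ggml*.bin'))) > 0:
        return 'llamacpp'
    # ... and so is a bare ggml .bin filename passed directly.
    elif re.match(r'.*ggml.*\.bin', name):
        return 'llamacpp'
    elif 'chatglm' in name:
        return 'chatglm'
    elif 'galactica' in name:
        return 'galactica'
    elif any(k in name for k in ['gpt4chan', 'gpt-4chan']):
        return 'gpt4chan'
    else:
        return 'HF_generic'


# Illustrative names only:
print(detect_model_type('ggml-model-q4_0.bin'))  # llamacpp
print(detect_model_type('chatglm-6b'))           # chatglm
print(detect_model_type('llama-7b'))             # HF_generic
```

The branches are checked in order, so an `rwkv-` name wins even if a ggml file happens to be present, and anything that matches no rule falls through to the generic Hugging Face loader.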