diff --git a/docs/GPTQ-models-(4-bit-mode).md b/docs/GPTQ-models-(4-bit-mode).md
index 679cabee..dd9a82ab 100644
--- a/docs/GPTQ-models-(4-bit-mode).md
+++ b/docs/GPTQ-models-(4-bit-mode).md
@@ -127,6 +127,8 @@ cd text-generation-webui/repositories
 git clone https://github.com/johnsmith0031/alpaca_lora_4bit
 ```
 
+⚠️ I have tested it with the following commit specifically: `9fe5ab364280325f77da15f3541960960961d144`
+
 3. Install https://github.com/sterlind/GPTQ-for-LLaMa with this command:
 
 ```
diff --git a/modules/LoRA.py b/modules/LoRA.py
index ef1e88aa..a4ebe208 100644
--- a/modules/LoRA.py
+++ b/modules/LoRA.py
@@ -7,23 +7,24 @@ import modules.shared as shared
 
 
 def add_lora_to_model(lora_names):
+    shared.lora_names = list(lora_names)
     prior_set = set(shared.lora_names)
     added_set = set(lora_names) - prior_set
     removed_set = prior_set - set(lora_names)
-    shared.lora_names = list(lora_names)
 
-    # Nothing to do = skip.
+    # If no LoRA needs to be added or removed, exit
     if len(added_set) == 0 and len(removed_set) == 0:
         return
 
-    # Only adding, and already peft? Do it the easy way.
+    # Add a LoRA when another LoRA is already present
     if len(removed_set) == 0 and len(prior_set) > 0:
         print(f"Adding the LoRA(s) named {added_set} to the model...")
         for lora in added_set:
             shared.model.load_adapter(Path(f"{shared.args.lora_dir}/{lora}"), lora)
+
         return
 
-    # If removing anything, disable all and re-add.
+    # If any LoRA needs to be removed, start over
     if len(removed_set) > 0:
         shared.model.disable_adapter()
 
@@ -43,8 +44,7 @@ def add_lora_to_model(lora_names):
             shared.model.load_adapter(Path(f"{shared.args.lora_dir}/{lora}"), lora)
 
         if not shared.args.load_in_8bit and not shared.args.cpu:
-            if not shared.args.monkey_patch:
-                shared.model.half()
+            shared.model.half()
             if not hasattr(shared.model, "hf_device_map"):
                 if torch.has_mps:
                     device = torch.device('mps')
diff --git a/modules/monkey_patch_gptq_lora.py b/modules/monkey_patch_gptq_lora.py
index 3e591b52..872f7ce3 100644
--- a/modules/monkey_patch_gptq_lora.py
+++ b/modules/monkey_patch_gptq_lora.py
@@ -6,6 +6,7 @@ from pathlib import Path
 sys.path.insert(0, str(Path("repositories/alpaca_lora_4bit")))
 
 import autograd_4bit
+from amp_wrapper import AMPWrapper
 from autograd_4bit import (Autograd4bitQuantLinear,
                            load_llama_model_4bit_low_ram)
 from monkeypatch.peft_tuners_lora_monkey_patch import (
@@ -31,6 +32,10 @@ def load_model_llama(model_name):
     autograd_4bit.use_new = True
     autograd_4bit.auto_switch = True
 
+    model.half()
+    wrapper = AMPWrapper(model)
+    wrapper.apply_generate()
+
     try:
         tokenizer.eos_token_id = 2
         tokenizer.bos_token_id = 1
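
The last hunk converts the 4-bit model with `model.half()` and then applies alpaca_lora_4bit's `AMPWrapper` to `generate`. The sketch below is not the actual `amp_wrapper` implementation; it only illustrates, under that assumption, the general monkey-patching technique involved: swapping `model.generate` for a closure that runs the original call inside `torch.amp.autocast`. The class name `AutocastGenerateWrapper` is hypothetical.

```python
# Illustrative sketch only -- a hypothetical stand-in, not amp_wrapper.AMPWrapper itself.
# Technique: replace model.generate with a wrapper that runs the original generate
# inside an autocast region, so a model converted with model.half() can generate
# in mixed precision.
import torch


class AutocastGenerateWrapper:
    def __init__(self, model, device_type="cuda"):
        self.model = model
        self.device_type = device_type

    def apply_generate(self):
        original_generate = self.model.generate

        def generate_with_autocast(*args, **kwargs):
            # Run the unmodified generate call under automatic mixed precision.
            with torch.amp.autocast(device_type=self.device_type):
                return original_generate(*args, **kwargs)

        # Monkey-patch the bound method on this model instance only.
        self.model.generate = generate_with_autocast
```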