AutoAWQ: initial support (#3999)

2024-10-01 01:26:03 -04:00 · 2023-10-05 16:19:18 +00:00 · 2023-10-05 16:19:18 +00:00 · cc632c3f33
commit cc632c3f33
parent 3f56151f03
8 changed files with 75 additions and 3 deletions
--- a/models/config.yaml
+++ b/models/config.yaml
@@ -174,3 +174,5 @@
  instruction_template: 'Llama-v2'
 .*mistral.*instruct:
  instruction_template: 'Mistral'
+.*AWQ:
+  n_batch: 1
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -129,6 +129,16 @@ loaders_and_params = OrderedDict({
        'model_type',
        'no_mmap',
        'mlock'
+    ],
+    'AutoAWQ': [
+        'cpu_memory',
+        'gpu_memory',
+        'auto_devices',
+        'max_seq_len',
+        'n_batch',
+        'no_inject_fused_attention',
+        'trust_remote_code',
+        'use_fast',
    ]
 })

@@ -365,7 +375,40 @@ loaders_samplers = {
        'top_k',
        'repetition_penalty',
        'repetition_penalty_range',
-    }
+    },
+    'AutoAWQ': {
+        'temperature',
+        'top_p',
+        'top_k',
+        'typical_p',
+        'epsilon_cutoff',
+        'eta_cutoff',
+        'tfs',
+        'top_a',
+        'repetition_penalty',
+        'repetition_penalty_range',
+        'encoder_repetition_penalty',
+        'no_repeat_ngram_size',
+        'min_length',
+        'seed',
+        'do_sample',
+        'penalty_alpha',
+        'num_beams',
+        'length_penalty',
+        'early_stopping',
+        'mirostat_mode',
+        'mirostat_tau',
+        'mirostat_eta',
+        'grammar_file_row',
+        'grammar_string',
+        'guidance_scale',
+        'negative_prompt',
+        'ban_eos_token',
+        'custom_token_bans',
+        'add_bos_token',
+        'skip_special_tokens',
+        'auto_max_new_tokens',
+    },
 }

 loaders_model_types = {
--- a/modules/models.py
+++ b/modules/models.py
@@ -63,6 +63,7 @@ def load_model(model_name, loader=None):
        'ExLlamav2': ExLlamav2_loader,
        'ExLlamav2_HF': ExLlamav2_HF_loader,
        'ctransformers': ctransformers_loader,
+        'AutoAWQ': AutoAWQ_loader,
    }

    if loader is None:
@@ -276,6 +277,24 @@ def ctransformers_loader(model_name):
    model, tokenizer = ctrans.from_pretrained(model_file)
    return model, tokenizer

+def AutoAWQ_loader(model_name):
+   """Load the AWQ-quantized model stored under shared.args.model_dir/model_name.
+
+   Returns only the model object (no tokenizer). Deepspeed is warned about, not blocked.
+   """
+   from awq import AutoAWQForCausalLM
+   model_dir = Path(f'{shared.args.model_dir}/{model_name}')
+   if shared.args.deepspeed:
+       logger.warning("AutoAWQ is incompatible with deepspeed")  # Logger.warn is a deprecated alias
+   model = AutoAWQForCausalLM.from_quantized(
+       quant_path=model_dir,
+       max_new_tokens=shared.args.max_seq_len,
+       trust_remote_code=shared.args.trust_remote_code,
+       fuse_layers=not shared.args.no_inject_fused_attention,
+       max_memory=get_max_memory_dict(),
+       batch_size=shared.args.n_batch,
+       safetensors=not shared.args.trust_remote_code)  # NOTE(review): safetensors is disabled whenever trust_remote_code is set - confirm intended
+   return model

 def GPTQ_loader(model_name):

--- a/modules/models_settings.py
+++ b/modules/models_settings.py
@@ -107,10 +107,14 @@ def infer_loader(model_name, model_settings):
        loader = None
    elif (path_to_model / 'quantize_config.json').exists() or ('wbits' in model_settings and type(model_settings['wbits']) is int and model_settings['wbits'] > 0):
        loader = 'AutoGPTQ'
+    elif (path_to_model / 'quant_config.json').exists():
+        loader = 'AutoAWQ'
    elif len(list(path_to_model.glob('*.gguf'))) > 0:
        loader = 'llama.cpp'
    elif re.match(r'.*\.gguf', model_name.lower()):
        loader = 'llama.cpp'
+    elif re.match(r'.*-awq', model_name.lower()):
+        loader = 'AutoAWQ'
    elif re.match(r'.*rwkv.*\.pth', model_name.lower()):
        loader = 'RWKV'
    elif re.match(r'.*exl2', model_name.lower()):
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -232,6 +232,8 @@ def fix_loader_name(name):
        return 'ExLlamav2_HF'
    elif name in ['ctransformers', 'ctranforemrs', 'ctransformer']:
        return 'ctransformers'
+    elif name in ['autoawq', 'awq', 'auto-awq']:
+        return 'AutoAWQ'


 def add_extension(name):
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -99,7 +99,7 @@ def create_ui():

                        with gr.Column():
                            shared.gradio['triton'] = gr.Checkbox(label="triton", value=shared.args.triton)
-                            shared.gradio['no_inject_fused_attention'] = gr.Checkbox(label="no_inject_fused_attention", value=shared.args.no_inject_fused_attention, info='Disable fused attention. Fused attention improves inference performance but uses more VRAM. Disable if running low on VRAM.')
+                            shared.gradio['no_inject_fused_attention'] = gr.Checkbox(label="no_inject_fused_attention", value=shared.args.no_inject_fused_attention, info='Disable fused attention. Fused attention improves inference performance but uses more VRAM. Fuses layers for AutoAWQ. Disable if running low on VRAM.')
                            shared.gradio['no_inject_fused_mlp'] = gr.Checkbox(label="no_inject_fused_mlp", value=shared.args.no_inject_fused_mlp, info='Affects Triton only. Disable fused MLP. Fused MLP improves performance but uses more VRAM. Disable if running low on VRAM.')
                            shared.gradio['no_use_cuda_fp16'] = gr.Checkbox(label="no_use_cuda_fp16", value=shared.args.no_use_cuda_fp16, info='This can make models faster on some systems.')
                            shared.gradio['desc_act'] = gr.Checkbox(label="desc_act", value=shared.args.desc_act, info='\'desc_act\', \'wbits\', and \'groupsize\' are used for old models without a quantize_config.json.')
--- a/requirements.txt
+++ b/requirements.txt
@@ -48,3 +48,4 @@ https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/text
 https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.0/gptq_for_llama-0.1.0+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows"
 https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.0/gptq_for_llama-0.1.0+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
 https://github.com/jllllll/ctransformers-cuBLAS-wheels/releases/download/AVX2/ctransformers-0.2.27+cu117-py3-none-any.whl
+autoawq==0.1.2
--- a/requirements_noavx2.txt
+++ b/requirements_noavx2.txt
@@ -48,3 +48,4 @@ https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/text
 https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.0/gptq_for_llama-0.1.0+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows"
 https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.0/gptq_for_llama-0.1.0+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
 https://github.com/jllllll/ctransformers-cuBLAS-wheels/releases/download/AVX2/ctransformers-0.2.27+cu117-py3-none-any.whl
+autoawq==0.1.2