text-generation-webui/modules/AutoGPTQ_loader.py

from pathlib import Path

from accelerate.utils import is_xpu_available
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

import modules.shared as shared
from modules.logging_colors import logger
from modules.models import get_max_memory_dict


def _find_checkpoint(path_to_model):
    """Return the checkpoint file to load from *path_to_model*, or None.

    An explicit ``--checkpoint`` argument always wins. Otherwise the model
    folder is searched for ``.safetensors``, ``.pt`` and ``.bin`` files, in
    that order of preference, and the first extension with a match is used.
    """
    if shared.args.checkpoint:
        return Path(shared.args.checkpoint)

    for ext in ('.safetensors', '.pt', '.bin'):
        # glob() yields entries in filesystem order, which is not stable
        # across platforms; sort so that "the last one" is reproducible.
        found = sorted(path_to_model.glob(f"*{ext}"))
        if not found:
            continue

        if len(found) > 1:
            logger.warning(f'More than one {ext} model has been found. The last one will be selected. It could be wrong.')

        return found[-1]

    return None


def _select_device():
    """Return the device string passed to AutoGPTQ.

    An explicit ``--cpu`` request takes precedence over any accelerator that
    happens to be available; otherwise an Intel XPU is preferred over CUDA.
    """
    if shared.args.cpu:
        return "cpu"

    return "xpu:0" if is_xpu_available() else "cuda:0"


def load_quantized(model_name):
    """Load a GPTQ-quantized model through AutoGPTQ.

    Args:
        model_name: Name of the model folder inside ``shared.args.model_dir``.

    Returns:
        The object returned by ``AutoGPTQForCausalLM.from_quantized``, or
        ``None`` (after logging an error) when no checkpoint file could be
        located.
    """
    path_to_model = Path(f'{shared.args.model_dir}/{model_name}')

    # Find the model checkpoint
    pt_path = _find_checkpoint(path_to_model)
    if pt_path is None:
        logger.error("The model could not be loaded because its checkpoint file in .bin/.pt/.safetensors format could not be located.")
        return

    use_safetensors = pt_path.suffix == '.safetensors'

    # Only build a quantize config from the command-line flags when the model
    # folder does not ship its own quantize_config.json; otherwise None is
    # passed (presumably AutoGPTQ then reads the file itself -- confirm
    # against the AutoGPTQ documentation).
    if (path_to_model / "quantize_config.json").exists():
        quantize_config = None
    else:
        wbits = shared.args.wbits
        groupsize = shared.args.groupsize
        quantize_config = BaseQuantizeConfig(
            bits=wbits if wbits > 0 else 4,
            group_size=groupsize if groupsize > 0 else -1,
            desc_act=shared.args.desc_act
        )

    # Define the params for AutoGPTQForCausalLM.from_quantized
    params = {
        'model_basename': pt_path.stem,
        'device': _select_device(),
        'use_triton': shared.args.triton,
        'inject_fused_attention': False,
        'inject_fused_mlp': not shared.args.no_inject_fused_mlp,
        'use_safetensors': use_safetensors,
        'trust_remote_code': shared.args.trust_remote_code,
        'max_memory': get_max_memory_dict(),
        'quantize_config': quantize_config,
        'use_cuda_fp16': not shared.args.no_use_cuda_fp16,
        'disable_exllama': shared.args.disable_exllama,
        'disable_exllamav2': shared.args.disable_exllamav2,
    }

    logger.info(f"The AutoGPTQ params are: {params}")
    model = AutoGPTQForCausalLM.from_quantized(path_to_model, **params)

    # These lines fix the multimodal extension when used with AutoGPTQ:
    # expose `dtype` and `embed_tokens` on the outer wrappers when they are
    # only reachable on a nested model object. Existing attributes are never
    # overwritten.
    if hasattr(model, 'model'):
        inner = model.model
        if not hasattr(model, 'dtype') and hasattr(inner, 'dtype'):
            model.dtype = inner.dtype

        if hasattr(inner, 'model') and hasattr(inner.model, 'embed_tokens'):
            embed_tokens = inner.model.embed_tokens
            if not hasattr(model, 'embed_tokens'):
                model.embed_tokens = embed_tokens

            if not hasattr(inner, 'embed_tokens'):
                inner.embed_tokens = embed_tokens

    return model
Add AutoGPTQ support (basic) (#2132) 2023-05-17 10:12:12 -04:00			`from pathlib import Path`

Fix is_ccl_available & is_xpu_available imports 2023-10-26 23:26:25 -04:00			`from accelerate.utils import is_xpu_available`
Extend AutoGPTQ support for any GPTQ model (#1668) 2023-06-02 00:33:55 -04:00			`from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig`
Add AutoGPTQ support (basic) (#2132) 2023-05-17 10:12:12 -04:00
			`import modules.shared as shared`
Prevent unwanted log messages from modules 2023-05-21 21:42:34 -04:00			`from modules.logging_colors import logger`
Add AutoGPTQ support (basic) (#2132) 2023-05-17 10:12:12 -04:00			`from modules.models import get_max_memory_dict`


			`def load_quantized(model_name):`
			`path_to_model = Path(f'{shared.args.model_dir}/{model_name}')`
			`pt_path = None`

			`# Find the model checkpoint`
Extend AutoGPTQ support for any GPTQ model (#1668) 2023-06-02 00:33:55 -04:00			`if shared.args.checkpoint:`
			`pt_path = Path(shared.args.checkpoint)`
			`else:`
			`for ext in ['.safetensors', '.pt', '.bin']:`
			`found = list(path_to_model.glob(f"*{ext}"))`
			`if len(found) > 0:`
			`if len(found) > 1:`
			`logger.warning(f'More than one {ext} model has been found. The last one will be selected. It could be wrong.')`

			`pt_path = found[-1]`
			`break`
Add various checks to model loading functions 2023-05-17 14:52:23 -04:00
			`if pt_path is None:`
Prevent unwanted log messages from modules 2023-05-21 21:42:34 -04:00			`logger.error("The model could not be loaded because its checkpoint file in .bin/.pt/.safetensors format could not be located.")`
Add various checks to model loading functions 2023-05-17 14:52:23 -04:00			`return`
Add AutoGPTQ support (basic) (#2132) 2023-05-17 10:12:12 -04:00
Extend AutoGPTQ support for any GPTQ model (#1668) 2023-06-02 00:33:55 -04:00			`use_safetensors = pt_path.suffix == '.safetensors'`
			`if not (path_to_model / "quantize_config.json").exists():`
			`quantize_config = BaseQuantizeConfig(`
			`bits=bits if (bits := shared.args.wbits) > 0 else 4,`
			`group_size=gs if (gs := shared.args.groupsize) > 0 else -1,`
			`desc_act=shared.args.desc_act`
			`)`
			`else:`
			`quantize_config = None`

Add AutoGPTQ support (basic) (#2132) 2023-05-17 10:12:12 -04:00			`# Define the params for AutoGPTQForCausalLM.from_quantized`
			`params = {`
			`'model_basename': pt_path.stem,`
Intel Gpu support initialization (#4340) 2023-10-26 22:39:51 -04:00			`'device': "xpu:0" if is_xpu_available() else "cuda:0" if not shared.args.cpu else "cpu",`
Add AutoGPTQ support (basic) (#2132) 2023-05-17 10:12:12 -04:00			`'use_triton': shared.args.triton,`
Backend cleanup (#6025) 2024-05-21 12:32:02 -04:00			`'inject_fused_attention': False,`
AutoGPTQ: Add UI and command line support for disabling fused attention and fused MLP (#2648) 2023-06-15 22:59:54 -04:00			`'inject_fused_mlp': not shared.args.no_inject_fused_mlp,`
Add AutoGPTQ support (basic) (#2132) 2023-05-17 10:12:12 -04:00			`'use_safetensors': use_safetensors,`
Falcon support (trust-remote-code and autogptq checkboxes) (#2367) --------- Co-authored-by: oobabooga <112222186+oobabooga@users.noreply.github.com> 2023-05-29 09:20:18 -04:00			`'trust_remote_code': shared.args.trust_remote_code,`
Extend AutoGPTQ support for any GPTQ model (#1668) 2023-06-02 00:33:55 -04:00			`'max_memory': get_max_memory_dict(),`
Add --no_use_cuda_fp16 param for AutoGPTQ 2023-06-23 11:22:56 -04:00			`'quantize_config': quantize_config,`
			`'use_cuda_fp16': not shared.args.no_use_cuda_fp16,`
Add the --disable_exllama option for AutoGPTQ 2023-08-12 02:26:58 -04:00			`'disable_exllama': shared.args.disable_exllama,`
AutoGPTQ: Add --disable_exllamav2 flag (Mixtral CPU offloading needs this) 2023-12-15 09:46:13 -05:00			`'disable_exllamav2': shared.args.disable_exllamav2,`
Add AutoGPTQ support (basic) (#2132) 2023-05-17 10:12:12 -04:00			`}`

Extend AutoGPTQ support for any GPTQ model (#1668) 2023-06-02 00:33:55 -04:00			`logger.info(f"The AutoGPTQ params are: {params}")`
Add AutoGPTQ support (basic) (#2132) 2023-05-17 10:12:12 -04:00			`model = AutoGPTQForCausalLM.from_quantized(path_to_model, **params)`
Make llava/minigpt-4 work with AutoGPTQ 2023-06-11 16:52:23 -04:00
			`# These lines fix the multimodal extension when used with AutoGPTQ`
Add some checks to AutoGPTQ loader 2023-06-14 17:44:43 -04:00			`if hasattr(model, 'model'):`
			`if not hasattr(model, 'dtype'):`
			`if hasattr(model.model, 'dtype'):`
			`model.dtype = model.model.dtype`
Make llava/minigpt-4 work with AutoGPTQ 2023-06-11 16:52:23 -04:00
Add some checks to AutoGPTQ loader 2023-06-14 17:44:43 -04:00			`if hasattr(model.model, 'model') and hasattr(model.model.model, 'embed_tokens'):`
			`if not hasattr(model, 'embed_tokens'):`
			`model.embed_tokens = model.model.model.embed_tokens`
Make llava/minigpt-4 work with AutoGPTQ 2023-06-11 16:52:23 -04:00
Add some checks to AutoGPTQ loader 2023-06-14 17:44:43 -04:00			`if not hasattr(model.model, 'embed_tokens'):`
			`model.model.embed_tokens = model.model.model.embed_tokens`
Make llava/minigpt-4 work with AutoGPTQ 2023-06-11 16:52:23 -04:00
Add AutoGPTQ support (basic) (#2132) 2023-05-17 10:12:12 -04:00			`return model`