diff --git a/README.md b/README.md
index d089d42c..d35ebe04 100644
--- a/README.md
+++ b/README.md
@@ -285,6 +285,7 @@ List of command-line flags
 | `--no_use_cuda_fp16` | This can make models faster on some systems. |
 | `--desc_act` | For models that don't have a quantize_config.json, this parameter is used to define whether to set desc_act or not in BaseQuantizeConfig. |
 | `--disable_exllama` | Disable ExLlama kernel, which can improve inference speed on some systems. |
+| `--disable_exllamav2` | Disable ExLlamav2 kernel. |
 
 #### GPTQ-for-LLaMa
 
diff --git a/modules/AutoGPTQ_loader.py b/modules/AutoGPTQ_loader.py
index f33803e8..514a6ee5 100644
--- a/modules/AutoGPTQ_loader.py
+++ b/modules/AutoGPTQ_loader.py
@@ -52,6 +52,7 @@ def load_quantized(model_name):
         'quantize_config': quantize_config,
         'use_cuda_fp16': not shared.args.no_use_cuda_fp16,
         'disable_exllama': shared.args.disable_exllama,
+        'disable_exllamav2': shared.args.disable_exllamav2,
     }
 
     logger.info(f"The AutoGPTQ params are: {params}")
diff --git a/modules/loaders.py b/modules/loaders.py
index d1f8343f..c7e7653e 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -25,6 +25,7 @@ loaders_and_params = OrderedDict({
         'rope_freq_base',
         'compress_pos_emb',
         'disable_exllama',
+        'disable_exllamav2',
         'transformers_info'
     ],
     'llama.cpp': [
@@ -94,6 +95,7 @@ loaders_and_params = OrderedDict({
         'groupsize',
         'desc_act',
         'disable_exllama',
+        'disable_exllamav2',
         'gpu_memory',
         'cpu_memory',
         'cpu',
diff --git a/modules/models.py b/modules/models.py
index f77fc941..49e5f818 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -156,7 +156,7 @@ def huggingface_loader(model_name):
             LoaderClass = AutoModelForCausalLM
 
     # Load the model in simple 16-bit mode by default
-    if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.load_in_4bit, shared.args.auto_devices, shared.args.disk, shared.args.deepspeed, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.compress_pos_emb > 1, shared.args.alpha_value > 1, shared.args.disable_exllama]):
+    if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.load_in_4bit, shared.args.auto_devices, shared.args.disk, shared.args.deepspeed, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.compress_pos_emb > 1, shared.args.alpha_value > 1, shared.args.disable_exllama, shared.args.disable_exllamav2]):
         model = LoaderClass.from_pretrained(path_to_model, **params)
         if torch.backends.mps.is_available():
             device = torch.device('mps')
@@ -221,11 +221,16 @@ def huggingface_loader(model_name):
         if shared.args.disk:
             params['offload_folder'] = shared.args.disk_cache_dir
 
-        if shared.args.disable_exllama:
+        if shared.args.disable_exllama or shared.args.disable_exllamav2:
             try:
-                gptq_config = GPTQConfig(bits=config.quantization_config.get('bits', 4), disable_exllama=True)
+                gptq_config = GPTQConfig(
+                    bits=config.quantization_config.get('bits', 4),
+                    disable_exllama=shared.args.disable_exllama,
+                    disable_exllamav2=shared.args.disable_exllamav2,
+                )
+
                 params['quantization_config'] = gptq_config
-                logger.info('Loading with ExLlama kernel disabled.')
+                logger.info(f'Loading with disable_exllama={shared.args.disable_exllama} and disable_exllamav2={shared.args.disable_exllamav2}.')
             except:
                 exc = traceback.format_exc()
                 logger.error('Failed to disable exllama. Does the config.json for this model contain the necessary quantization info?')
diff --git a/modules/shared.py b/modules/shared.py
index b0888935..adebe62d 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -133,6 +133,7 @@ parser.add_argument('--no_inject_fused_mlp', action='store_true', help='Triton m
 parser.add_argument('--no_use_cuda_fp16', action='store_true', help='This can make models faster on some systems.')
 parser.add_argument('--desc_act', action='store_true', help='For models that do not have a quantize_config.json, this parameter is used to define whether to set desc_act or not in BaseQuantizeConfig.')
 parser.add_argument('--disable_exllama', action='store_true', help='Disable ExLlama kernel, which can improve inference speed on some systems.')
+parser.add_argument('--disable_exllamav2', action='store_true', help='Disable ExLlamav2 kernel.')
 
 # GPTQ-for-LLaMa
 parser.add_argument('--wbits', type=int, default=0, help='Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported.')
diff --git a/modules/ui.py b/modules/ui.py
index 45849fe3..8bfc9491 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -70,6 +70,7 @@ def list_model_elements():
         'no_inject_fused_mlp',
         'no_use_cuda_fp16',
         'disable_exllama',
+        'disable_exllamav2',
         'cfg_cache',
         'no_flash_attn',
         'cache_8bit',
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index 7b6767dc..7242d117 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -125,6 +125,7 @@ def create_ui():
                         shared.gradio['logits_all'] = gr.Checkbox(label="logits_all", value=shared.args.logits_all, info='Needs to be set for perplexity evaluation to work. Otherwise, ignore it, as it makes prompt processing slower.')
                         shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.')
                         shared.gradio['disable_exllama'] = gr.Checkbox(label="disable_exllama", value=shared.args.disable_exllama, info='Disable ExLlama kernel.')
+                        shared.gradio['disable_exllamav2'] = gr.Checkbox(label="disable_exllamav2", value=shared.args.disable_exllamav2, info='Disable ExLlamav2 kernel.')
                         shared.gradio['no_flash_attn'] = gr.Checkbox(label="no_flash_attn", value=shared.args.no_flash_attn, info='Force flash-attention to not be used.')
                         shared.gradio['cache_8bit'] = gr.Checkbox(label="cache_8bit", value=shared.args.cache_8bit, info='Use 8-bit cache to save VRAM.')
                         shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.')
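For reference, here is a minimal standalone sketch of how the new flag plumbs through, based only on the hunks above. The argparse definitions mirror modules/shared.py and the condition mirrors the patched huggingface_loader() in modules/models.py; nothing in this sketch is part of the patch itself, and the real loader forwards both booleans to GPTQConfig rather than printing them.

```python
# Standalone sketch (not part of the patch): shows how --disable_exllamav2 is
# defined alongside --disable_exllama and how either flag triggers the
# quantization-config path. In the webui, both booleans are passed on to
# GPTQConfig(disable_exllama=..., disable_exllamav2=...); here we only print them.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--disable_exllama', action='store_true', help='Disable ExLlama kernel.')
parser.add_argument('--disable_exllamav2', action='store_true', help='Disable ExLlamav2 kernel.')

# Simulate a user passing only the new flag.
args = parser.parse_args(['--disable_exllamav2'])

# Same gate as the patched huggingface_loader(): either flag is enough.
if args.disable_exllama or args.disable_exllamav2:
    print(f'disable_exllama={args.disable_exllama}, disable_exllamav2={args.disable_exllamav2}')
```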