From bee73cedbd535d8a5392472c402c843b3ed10e27 Mon Sep 17 00:00:00 2001
From: jllllll <3887729+jllllll@users.noreply.github.com>
Date: Wed, 9 Aug 2023 23:42:34 -0500
Subject: [PATCH 1/6] Streamline GPTQ-for-LLaMa support

---
 README.md                |  3 --
 modules/GPTQ_loader.py   | 64 ++++++++++------------------------------
 modules/shared.py        |  3 --
 modules/ui_model_menu.py |  2 +-
 requirements.txt         |  4 +++
 5 files changed, 21 insertions(+), 55 deletions(-)

diff --git a/README.md b/README.md
index 98de6c09..ad2ad1ed 100644
--- a/README.md
+++ b/README.md
@@ -280,9 +280,6 @@ Optionally, you can use the following command-line flags:
 | `--pre_layer PRE_LAYER [PRE_LAYER ...]` | The number of layers to allocate to the GPU. Setting this parameter enables CPU offloading for 4-bit models. For multi-gpu, write the numbers separated by spaces, eg `--pre_layer 30 60`. |
 | `--checkpoint CHECKPOINT` | The path to the quantized checkpoint file. If not specified, it will be automatically detected. |
 | `--monkey-patch` | Apply the monkey patch for using LoRAs with quantized models.
-| `--quant_attn` | (triton) Enable quant attention. |
-| `--warmup_autotune` | (triton) Enable warmup autotune. |
-| `--fused_mlp` | (triton) Enable fused mlp. |
 
 #### DeepSpeed
 
diff --git a/modules/GPTQ_loader.py b/modules/GPTQ_loader.py
index ddc5f9a5..c0cef476 100644
--- a/modules/GPTQ_loader.py
+++ b/modules/GPTQ_loader.py
@@ -11,26 +11,9 @@ from transformers import AutoConfig, AutoModelForCausalLM
 import modules.shared as shared
 from modules.logging_colors import logger
 
-sys.path.insert(0, str(Path("repositories/GPTQ-for-LLaMa")))
-
-try:
-    import llama_inference_offload
-except ImportError:
-    logger.error('Failed to load GPTQ-for-LLaMa')
-    logger.error('See https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md')
-    sys.exit(-1)
-
-try:
-    from modelutils import find_layers
-except ImportError:
-    from utils import find_layers
-
-try:
-    from quant import make_quant
-    is_triton = False
-except ImportError:
-    import quant
-    is_triton = True
+from gptq_for_llama import llama_inference_offload
+from gptq_for_llama.modelutils import find_layers
+from gptq_for_llama.quant import make_quant
 
 
 # This function is a replacement for the load_quant function in the
@@ -59,24 +42,21 @@ def _load_quant(model, checkpoint, wbits, groupsize=-1, faster_kernel=False, exc
         if name in layers:
             del layers[name]
 
-    if not is_triton:
-        gptq_args = inspect.getfullargspec(make_quant).args
+    gptq_args = inspect.getfullargspec(make_quant).args
 
-        make_quant_kwargs = {
-            'module': model,
-            'names': layers,
-            'bits': wbits,
-        }
-        if 'groupsize' in gptq_args:
-            make_quant_kwargs['groupsize'] = groupsize
-        if 'faster' in gptq_args:
-            make_quant_kwargs['faster'] = faster_kernel
-        if 'kernel_switch_threshold' in gptq_args:
-            make_quant_kwargs['kernel_switch_threshold'] = kernel_switch_threshold
+    make_quant_kwargs = {
+        'module': model,
+        'names': layers,
+        'bits': wbits,
+    }
+    if 'groupsize' in gptq_args:
+        make_quant_kwargs['groupsize'] = groupsize
+    if 'faster' in gptq_args:
+        make_quant_kwargs['faster'] = faster_kernel
+    if 'kernel_switch_threshold' in gptq_args:
+        make_quant_kwargs['kernel_switch_threshold'] = kernel_switch_threshold
 
-        make_quant(**make_quant_kwargs)
-    else:
-        quant.make_quant_linear(model, layers, wbits, groupsize)
+    make_quant(**make_quant_kwargs)
 
     del layers
     if checkpoint.endswith('.safetensors'):
@@ -85,18 +65,6 @@ def _load_quant(model, checkpoint, wbits, groupsize=-1, faster_kernel=False, exc
     else:
         model.load_state_dict(torch.load(checkpoint), strict=False)
 
-    if is_triton:
-        if shared.args.quant_attn:
-            quant.make_quant_attn(model)
-
-        if eval and shared.args.fused_mlp:
-            quant.make_fused_mlp(model)
-
-        if shared.args.warmup_autotune:
-            quant.autotune_warmup_linear(model, transpose=not eval)
-            if eval and shared.args.fused_mlp:
-                quant.autotune_warmup_fused(model)
-
     model.seqlen = 2048
     return model
 
diff --git a/modules/shared.py b/modules/shared.py
index 951120c8..224fa6aa 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -138,9 +138,6 @@ parser.add_argument('--groupsize', type=int, default=-1, help='Group size.')
 parser.add_argument('--pre_layer', type=int, nargs="+", help='The number of layers to allocate to the GPU. Setting this parameter enables CPU offloading for 4-bit models. For multi-gpu, write the numbers separated by spaces, eg --pre_layer 30 60.')
 parser.add_argument('--checkpoint', type=str, help='The path to the quantized checkpoint file. If not specified, it will be automatically detected.')
 parser.add_argument('--monkey-patch', action='store_true', help='Apply the monkey patch for using LoRAs with quantized models.')
-parser.add_argument('--quant_attn', action='store_true', help='(triton) Enable quant attention.')
-parser.add_argument('--warmup_autotune', action='store_true', help='(triton) Enable warmup autotune.')
-parser.add_argument('--fused_mlp', action='store_true', help='(triton) Enable fused mlp.')
 
 # AutoGPTQ
 parser.add_argument('--triton', action='store_true', help='Use triton.')
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index 55416a07..e98e237c 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -110,7 +110,7 @@ def create_ui():
             shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock)
             shared.gradio['llama_cpp_seed'] = gr.Number(label='Seed (0 for random)', value=shared.args.llama_cpp_seed)
             shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Make sure to inspect the .py files inside the model folder before loading it with this option enabled.')
-            shared.gradio['gptq_for_llama_info'] = gr.Markdown('GPTQ-for-LLaMa is currently 2x faster than AutoGPTQ on some systems. It is installed by default with the one-click installers. Otherwise, it has to be installed manually following the instructions here: [instructions](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md#installation-1).')
+            shared.gradio['gptq_for_llama_info'] = gr.Markdown('GPTQ-for-LLaMa support is currently only kept for compatibility with older GPUs. AutoGPTQ or ExLlama is preferred when compatible. GPTQ-for-LLaMa is installed by default with the one-click installers. Otherwise, it has to be installed manually following the instructions here: [instructions](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md#installation-1).')
             shared.gradio['exllama_info'] = gr.Markdown('For more information, consult the [docs](https://github.com/oobabooga/text-generation-webui/blob/main/docs/ExLlama.md).')
             shared.gradio['exllama_HF_info'] = gr.Markdown('ExLlama_HF is a wrapper that lets you use ExLlama like a Transformers model, which means it can use the Transformers samplers. It\'s a bit slower than the regular ExLlama.')
             shared.gradio['llamacpp_HF_info'] = gr.Markdown('llamacpp_HF is a wrapper that lets you use llama.cpp like a Transformers model, which means it can use the Transformers samplers. To use it, make sure to first download oobabooga/llama-tokenizer under "Download custom model or LoRA".')
diff --git a/requirements.txt b/requirements.txt
index e65bed6b..b27e14c5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -36,3 +36,7 @@ https://github.com/abetlen/llama-cpp-python/releases/download/v0.1.77/llama_cpp_
 # llama-cpp-python with CUDA support
 https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.1.77+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows"
 https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.1.77+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+
+# GPTQ-for-LLaMa
+https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.0/gptq_for_llama-0.1.0+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows"
+https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.0/gptq_for_llama-0.1.0+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

From e3d3565b2a538da8769fd0352067647529b2298c Mon Sep 17 00:00:00 2001
From: jllllll <3887729+jllllll@users.noreply.github.com>
Date: Wed, 9 Aug 2023 23:59:04 -0500
Subject: [PATCH 2/6] Remove GPTQ-for-LLaMa monkey patch support

AutoGPTQ will be the preferred GPTQ LoRa loader in the future.
---
 README.md                         |  1 -
 docs/GPTQ-models-(4-bit-mode).md  | 27 -------------------
 docs/LoRA.md                      |  1 -
 docs/Training-LoRAs.md            |  8 ------
 modules/monkey_patch_gptq_lora.py | 43 -------------------------------
 modules/training.py               | 23 -----------------
 6 files changed, 103 deletions(-)
 delete mode 100644 modules/monkey_patch_gptq_lora.py

diff --git a/README.md b/README.md
index ad2ad1ed..5739d0ba 100644
--- a/README.md
+++ b/README.md
@@ -279,7 +279,6 @@ Optionally, you can use the following command-line flags:
 | `--groupsize GROUPSIZE` | Group size. |
 | `--pre_layer PRE_LAYER [PRE_LAYER ...]` | The number of layers to allocate to the GPU. Setting this parameter enables CPU offloading for 4-bit models. For multi-gpu, write the numbers separated by spaces, eg `--pre_layer 30 60`. |
 | `--checkpoint CHECKPOINT` | The path to the quantized checkpoint file. If not specified, it will be automatically detected. |
-| `--monkey-patch` | Apply the monkey patch for using LoRAs with quantized models.
 
 #### DeepSpeed
 
diff --git a/docs/GPTQ-models-(4-bit-mode).md b/docs/GPTQ-models-(4-bit-mode).md
index 838595ef..d3869bb7 100644
--- a/docs/GPTQ-models-(4-bit-mode).md
+++ b/docs/GPTQ-models-(4-bit-mode).md
@@ -198,31 +198,4 @@ Output generated in 123.79 seconds (1.61 tokens/s, 199 tokens)
 
 You can also use multiple GPUs with `pre_layer` if using the oobabooga fork of GPTQ, eg `--pre_layer 30 60` will load a LLaMA-30B model half onto your first GPU and half onto your second, or `--pre_layer 20 40` will load 20 layers onto GPU-0, 20 layers onto GPU-1, and 20 layers offloaded to CPU.
 
-### Using LoRAs with GPTQ-for-LLaMa
-
-This requires using a monkey patch that is supported by this web UI: https://github.com/johnsmith0031/alpaca_lora_4bit
-
-To use it:
-
-1. Clone `johnsmith0031/alpaca_lora_4bit` into the repositories folder:
-
-```
-cd text-generation-webui/repositories
-git clone https://github.com/johnsmith0031/alpaca_lora_4bit
-```
-
-⚠️ I have tested it with the following commit specifically: `2f704b93c961bf202937b10aac9322b092afdce0`
-
-2. Install https://github.com/sterlind/GPTQ-for-LLaMa with this command:
-
-```
-pip install git+https://github.com/sterlind/GPTQ-for-LLaMa.git@lora_4bit
-```
-
-3. Start the UI with the `--monkey-patch` flag:
-
-```
-python server.py --model llama-7b-4bit-128g --listen --lora tloen_alpaca-lora-7b --monkey-patch
-```
-
 
diff --git a/docs/LoRA.md b/docs/LoRA.md
index f1504d10..02ce55be 100644
--- a/docs/LoRA.md
+++ b/docs/LoRA.md
@@ -11,7 +11,6 @@ This is the current state of LoRA integration in the web UI:
 | Transformers | Full support in 16-bit, `--load-in-8bit`, `--load-in-4bit`, and CPU modes. |
 | ExLlama | Single LoRA support. Fast to remove the LoRA afterwards. |
 | AutoGPTQ | Single LoRA support. Removing the LoRA requires reloading the entire model.|
-| GPTQ-for-LLaMa | Full support with the [monkey patch](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md#using-loras-with-gptq-for-llama). |
 
 ## Downloading a LoRA
 
diff --git a/docs/Training-LoRAs.md b/docs/Training-LoRAs.md
index 83e6d5a7..bdc79992 100644
--- a/docs/Training-LoRAs.md
+++ b/docs/Training-LoRAs.md
@@ -131,14 +131,6 @@ So, in effect, Loss is a balancing game: you want to get it low enough that it u
 
 Note: if you see Loss start at or suddenly jump to exactly `0`, it is likely something has gone wrong in your training process (eg model corruption).
 
-## Note: 4-Bit Monkeypatch
-
-The [4-bit LoRA monkeypatch](GPTQ-models-(4-bit-mode).md#using-loras-in-4-bit-mode) works for training, but has side effects:
-- VRAM usage is higher currently. You can reduce the `Micro Batch Size` to `1` to compensate.
-- Models do funky things. LoRAs apply themselves, or refuse to apply, or spontaneously error out, or etc. It can be helpful to reload base model or restart the WebUI between training/usage to minimize chances of anything going haywire.
-- Loading or working with multiple LoRAs at the same time doesn't currently work.
-- Generally, recognize and treat the monkeypatch as the dirty temporary hack it is - it works, but isn't very stable. It will get better in time when everything is merged upstream for full official support.
-
 ## Legacy notes
 
 LoRA training was contributed by [mcmonkey4eva](https://github.com/mcmonkey4eva) in PR [#570](https://github.com/oobabooga/text-generation-webui/pull/570).
diff --git a/modules/monkey_patch_gptq_lora.py b/modules/monkey_patch_gptq_lora.py
deleted file mode 100644
index bf8d478d..00000000
--- a/modules/monkey_patch_gptq_lora.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# Copied from https://github.com/johnsmith0031/alpaca_lora_4bit
-
-import sys
-from pathlib import Path
-
-sys.path.insert(0, str(Path("repositories/alpaca_lora_4bit")))
-
-import autograd_4bit
-from amp_wrapper import AMPWrapper
-from autograd_4bit import (
-    Autograd4bitQuantLinear,
-    load_llama_model_4bit_low_ram
-)
-from monkeypatch.peft_tuners_lora_monkey_patch import (
-    Linear4bitLt,
-    replace_peft_model_with_gptq_lora_model
-)
-
-from modules import shared
-from modules.GPTQ_loader import find_quantized_model_file
-
-replace_peft_model_with_gptq_lora_model()
-
-
-def load_model_llama(model_name):
-    config_path = str(Path(f'{shared.args.model_dir}/{model_name}'))
-    model_path = str(find_quantized_model_file(model_name))
-    model, tokenizer = load_llama_model_4bit_low_ram(config_path, model_path, groupsize=shared.args.groupsize, is_v1_model=False)
-    for n, m in model.named_modules():
-        if isinstance(m, Autograd4bitQuantLinear) or isinstance(m, Linear4bitLt):
-            if m.is_v1_model:
-                m.zeros = m.zeros.half()
-            m.scales = m.scales.half()
-            m.bias = m.bias.half()
-
-    autograd_4bit.use_new = True
-    autograd_4bit.auto_switch = True
-
-    model.half()
-    wrapper = AMPWrapper(model)
-    wrapper.apply_generate()
-
-    return model, tokenizer
diff --git a/modules/training.py b/modules/training.py
index 7558cd5d..fa721ff0 100644
--- a/modules/training.py
+++ b/modules/training.py
@@ -270,12 +270,6 @@ def calc_trainable_parameters(model):
 
 
 def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch_size: int, batch_size: int, epochs: int, learning_rate: str, lr_scheduler_type: str, lora_rank: int, lora_alpha: int, lora_dropout: float, cutoff_len: int, dataset: str, eval_dataset: str, format: str, eval_steps: int, raw_text_file: str, overlap_len: int, newline_favor_len: int, higher_rank_limit: bool, warmup_steps: int, optimizer: str, hard_cut_string: str, train_only_after: str, stop_at_loss: float, add_eos_token: bool, min_chars: int, report_to: str):
-    if shared.args.monkey_patch:
-        from monkeypatch.peft_tuners_lora_monkey_patch import (
-            replace_peft_model_with_gptq_lora_model
-        )
-        replace_peft_model_with_gptq_lora_model()
-
     global WANT_INTERRUPT
     WANT_INTERRUPT = False
 
@@ -307,15 +301,6 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch
 
         time.sleep(5)
 
-    if shared.args.wbits > 0 and not shared.args.monkey_patch:
-        yield "LoRA training with GPTQ models requires loading with `--monkey-patch`"
-        return
-
-    elif not (shared.args.load_in_8bit or shared.args.load_in_4bit) and shared.args.wbits <= 0:
-        yield "It is highly recommended you use `--load-in-8bit` for LoRA training. *(Will continue anyway in 2 seconds, press `Interrupt` to stop.)*"
-        logger.warning("It is highly recommended you use `--load-in-8bit` for LoRA training.")
-        time.sleep(2)  # Give it a moment for the message to show in UI before continuing
-
     if cutoff_len <= 0 or micro_batch_size <= 0 or batch_size <= 0 or actual_lr <= 0 or lora_rank <= 0 or lora_alpha <= 0:
         yield "Cannot input zeroes."
         return
@@ -520,14 +505,6 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch
         yield traceback.format_exc().replace('\n', '\n\n')
         return
 
-    if shared.args.monkey_patch:
-        for n, m in lora_model.named_modules():
-            if '4bit' in str(type(m)):
-                if m.is_v1_model:
-                    m.zeros = m.zeros.half()
-
-                m.scales = m.scales.half()
-
     class Tracked():
         def __init__(self):
             self.current_steps = 0

From d7ee4c23862081f6b8dbbaac8f22e7bc519da172 Mon Sep 17 00:00:00 2001
From: jllllll <3887729+jllllll@users.noreply.github.com>
Date: Thu, 10 Aug 2023 00:10:14 -0500
Subject: [PATCH 3/6] Remove unused import

---
 modules/GPTQ_loader.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/modules/GPTQ_loader.py b/modules/GPTQ_loader.py
index c0cef476..bc528b18 100644
--- a/modules/GPTQ_loader.py
+++ b/modules/GPTQ_loader.py
@@ -1,6 +1,5 @@
 import inspect
 import re
-import sys
 from pathlib import Path
 
 import accelerate

From d6765bebc4920827200ee5779e2441dec65763e1 Mon Sep 17 00:00:00 2001
From: jllllll <3887729+jllllll@users.noreply.github.com>
Date: Thu, 10 Aug 2023 00:53:48 -0500
Subject: [PATCH 4/6] Update installation documentation

---
 docs/GPTQ-models-(4-bit-mode).md | 55 ++++----------------------------
 modules/ui_model_menu.py         |  2 +-
 2 files changed, 8 insertions(+), 49 deletions(-)

diff --git a/docs/GPTQ-models-(4-bit-mode).md b/docs/GPTQ-models-(4-bit-mode).md
index d3869bb7..e8d983eb 100644
--- a/docs/GPTQ-models-(4-bit-mode).md
+++ b/docs/GPTQ-models-(4-bit-mode).md
@@ -70,53 +70,13 @@ Not supported yet.
 
 GPTQ-for-LLaMa is the original adaptation of GPTQ for the LLaMA model. It was made possible by [@qwopqwop200](https://github.com/qwopqwop200/GPTQ-for-LLaMa): https://github.com/qwopqwop200/GPTQ-for-LLaMa
 
-Different branches of GPTQ-for-LLaMa are currently available, including:
-
-| Branch | Comment |
-|----|----|
-| [Old CUDA branch (recommended)](https://github.com/oobabooga/GPTQ-for-LLaMa/) | The fastest branch, works on Windows and Linux. |
-| [Up-to-date triton branch](https://github.com/qwopqwop200/GPTQ-for-LLaMa) | Slightly more precise than the old CUDA branch from 13b upwards, significantly more precise for 7b. 2x slower for small context size and only works on Linux. |
-| [Up-to-date CUDA branch](https://github.com/qwopqwop200/GPTQ-for-LLaMa/tree/cuda) | As precise as the up-to-date triton branch, 10x slower than the old cuda branch for small context size. |
-
-Overall, I recommend using the old CUDA branch. It is included by default in the one-click-installer for this web UI.
-
-### Installation
-
-Start by cloning GPTQ-for-LLaMa into your `text-generation-webui/repositories` folder:
-
-```
-mkdir repositories
-cd repositories
-git clone https://github.com/oobabooga/GPTQ-for-LLaMa.git -b cuda
-```
-
-If you want to you to use the up-to-date CUDA or triton branches instead of the old CUDA branch, use these commands:
-
-```
-git clone https://github.com/qwopqwop200/GPTQ-for-LLaMa.git -b cuda
-```
-
-```
-git clone https://github.com/qwopqwop200/GPTQ-for-LLaMa.git -b triton
-```
-
-Next you need to install the CUDA extensions. You can do that either by installing the precompiled wheels, or by compiling the wheels yourself.
+A Python package containing both major CUDA versions of GPTQ-for-LLaMa is used to simplify installation and compatibility: https://github.com/jllllll/GPTQ-for-LLaMa-CUDA
 
 ### Precompiled wheels
 
-Kindly provided by our friend jllllll: https://github.com/jllllll/GPTQ-for-LLaMa-Wheels
+Kindly provided by our friend jllllll: https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases
 
-Windows:
-
-```
-pip install https://github.com/jllllll/GPTQ-for-LLaMa-Wheels/raw/main/quant_cuda-0.0.0-cp310-cp310-win_amd64.whl
-```
-
-Linux:
-
-```
-pip install https://github.com/jllllll/GPTQ-for-LLaMa-Wheels/raw/Linux-x64/quant_cuda-0.0.0-cp310-cp310-linux_x86_64.whl
-```
+Wheels are included in requirements.txt and are installed with the webui on supported systems.
 
 ### Manual installation
 
@@ -124,20 +84,19 @@ pip install https://github.com/jllllll/GPTQ-for-LLaMa-Wheels/raw/Linux-x64/quant
 
 ```
 conda activate textgen
-conda install -c conda-forge cudatoolkit-dev
+conda install cuda -c nvidia/label/cuda-11.7.1
 ```
 
 The command above takes some 10 minutes to run and shows no progress bar or updates along the way.
 
-You are also going to need to have a C++ compiler installed. On Linux, `sudo apt install build-essential` or equivalent is enough.
+You are also going to need to have a C++ compiler installed. On Linux, `sudo apt install build-essential` or equivalent is enough. On Windows, Visual Studio or Visual Studio Build Tools is required.
 
-If you're using an older version of CUDA toolkit (e.g. 11.7) but the latest version of `gcc` and `g++` (12.0+), you should downgrade with: `conda install -c conda-forge gxx==11.3.0`. Kernel compilation will fail otherwise.
+If you're using an older version of CUDA toolkit (e.g. 11.7) but the latest version of `gcc` and `g++` (12.0+) on Linux, you should downgrade with: `conda install -c conda-forge gxx==11.3.0`. Kernel compilation will fail otherwise.
 
 #### Step 2: compile the CUDA extensions
 
 ```
-cd repositories/GPTQ-for-LLaMa
-python setup_cuda.py install
+python -m pip install git+https://github.com/jllllll/GPTQ-for-LLaMa-CUDA -v
 ```
 
 ### Getting pre-converted LLaMA weights
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index e98e237c..0c1042f6 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -110,7 +110,7 @@ def create_ui():
             shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock)
             shared.gradio['llama_cpp_seed'] = gr.Number(label='Seed (0 for random)', value=shared.args.llama_cpp_seed)
             shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Make sure to inspect the .py files inside the model folder before loading it with this option enabled.')
-            shared.gradio['gptq_for_llama_info'] = gr.Markdown('GPTQ-for-LLaMa support is currently only kept for compatibility with older GPUs. AutoGPTQ or ExLlama is preferred when compatible. GPTQ-for-LLaMa is installed by default with the one-click installers. Otherwise, it has to be installed manually following the instructions here: [instructions](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md#installation-1).')
+            shared.gradio['gptq_for_llama_info'] = gr.Markdown('GPTQ-for-LLaMa support is currently only kept for compatibility with older GPUs. AutoGPTQ or ExLlama is preferred when compatible. GPTQ-for-LLaMa is installed by default with the webui on supported systems. Otherwise, it has to be installed manually following the instructions here: [instructions](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md#installation-1).')
             shared.gradio['exllama_info'] = gr.Markdown('For more information, consult the [docs](https://github.com/oobabooga/text-generation-webui/blob/main/docs/ExLlama.md).')
             shared.gradio['exllama_HF_info'] = gr.Markdown('ExLlama_HF is a wrapper that lets you use ExLlama like a Transformers model, which means it can use the Transformers samplers. It\'s a bit slower than the regular ExLlama.')
             shared.gradio['llamacpp_HF_info'] = gr.Markdown('llamacpp_HF is a wrapper that lets you use llama.cpp like a Transformers model, which means it can use the Transformers samplers. To use it, make sure to first download oobabooga/llama-tokenizer under "Download custom model or LoRA".')

From 16e2b117b415074afd2917a72496b776debfcd58 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 10 Aug 2023 08:38:10 -0700
Subject: [PATCH 5/6] Minor doc change

---
 docs/GPTQ-models-(4-bit-mode).md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/GPTQ-models-(4-bit-mode).md b/docs/GPTQ-models-(4-bit-mode).md
index e8d983eb..b42f4224 100644
--- a/docs/GPTQ-models-(4-bit-mode).md
+++ b/docs/GPTQ-models-(4-bit-mode).md
@@ -64,7 +64,7 @@ python server.py --autogptq --gpu-memory 3000MiB 6000MiB --model model_name
 
 ### Using LoRAs with AutoGPTQ
 
-Not supported yet.
+Works fine for a single LoRA.
 
 ## GPTQ-for-LLaMa
 

From c7f52bbdc106896b8f839c442b0c82937f006fd8 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 10 Aug 2023 08:39:41 -0700
Subject: [PATCH 6/6] Revert "Remove GPTQ-for-LLaMa monkey patch support"

This reverts commit e3d3565b2a538da8769fd0352067647529b2298c.
---
 README.md                         |  1 +
 docs/GPTQ-models-(4-bit-mode).md  | 27 +++++++++++++++++++
 docs/LoRA.md                      |  1 +
 docs/Training-LoRAs.md            |  8 ++++++
 modules/monkey_patch_gptq_lora.py | 43 +++++++++++++++++++++++++++++++
 modules/training.py               | 23 +++++++++++++++++
 6 files changed, 103 insertions(+)
 create mode 100644 modules/monkey_patch_gptq_lora.py

diff --git a/README.md b/README.md
index 5739d0ba..ad2ad1ed 100644
--- a/README.md
+++ b/README.md
@@ -279,6 +279,7 @@ Optionally, you can use the following command-line flags:
 | `--groupsize GROUPSIZE` | Group size. |
 | `--pre_layer PRE_LAYER [PRE_LAYER ...]` | The number of layers to allocate to the GPU. Setting this parameter enables CPU offloading for 4-bit models. For multi-gpu, write the numbers separated by spaces, eg `--pre_layer 30 60`. |
 | `--checkpoint CHECKPOINT` | The path to the quantized checkpoint file. If not specified, it will be automatically detected. |
+| `--monkey-patch` | Apply the monkey patch for using LoRAs with quantized models.
 
 #### DeepSpeed
 
diff --git a/docs/GPTQ-models-(4-bit-mode).md b/docs/GPTQ-models-(4-bit-mode).md
index b42f4224..428d7560 100644
--- a/docs/GPTQ-models-(4-bit-mode).md
+++ b/docs/GPTQ-models-(4-bit-mode).md
@@ -157,4 +157,31 @@ Output generated in 123.79 seconds (1.61 tokens/s, 199 tokens)
 
 You can also use multiple GPUs with `pre_layer` if using the oobabooga fork of GPTQ, eg `--pre_layer 30 60` will load a LLaMA-30B model half onto your first GPU and half onto your second, or `--pre_layer 20 40` will load 20 layers onto GPU-0, 20 layers onto GPU-1, and 20 layers offloaded to CPU.
 
+### Using LoRAs with GPTQ-for-LLaMa
+
+This requires using a monkey patch that is supported by this web UI: https://github.com/johnsmith0031/alpaca_lora_4bit
+
+To use it:
+
+1. Clone `johnsmith0031/alpaca_lora_4bit` into the repositories folder:
+
+```
+cd text-generation-webui/repositories
+git clone https://github.com/johnsmith0031/alpaca_lora_4bit
+```
+
+⚠️ I have tested it with the following commit specifically: `2f704b93c961bf202937b10aac9322b092afdce0`
+
+2. Install https://github.com/sterlind/GPTQ-for-LLaMa with this command:
+
+```
+pip install git+https://github.com/sterlind/GPTQ-for-LLaMa.git@lora_4bit
+```
+
+3. Start the UI with the `--monkey-patch` flag:
+
+```
+python server.py --model llama-7b-4bit-128g --listen --lora tloen_alpaca-lora-7b --monkey-patch
+```
+
 
diff --git a/docs/LoRA.md b/docs/LoRA.md
index 02ce55be..f1504d10 100644
--- a/docs/LoRA.md
+++ b/docs/LoRA.md
@@ -11,6 +11,7 @@ This is the current state of LoRA integration in the web UI:
 | Transformers | Full support in 16-bit, `--load-in-8bit`, `--load-in-4bit`, and CPU modes. |
 | ExLlama | Single LoRA support. Fast to remove the LoRA afterwards. |
 | AutoGPTQ | Single LoRA support. Removing the LoRA requires reloading the entire model.|
+| GPTQ-for-LLaMa | Full support with the [monkey patch](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md#using-loras-with-gptq-for-llama). |
 
 ## Downloading a LoRA
 
diff --git a/docs/Training-LoRAs.md b/docs/Training-LoRAs.md
index bdc79992..83e6d5a7 100644
--- a/docs/Training-LoRAs.md
+++ b/docs/Training-LoRAs.md
@@ -131,6 +131,14 @@ So, in effect, Loss is a balancing game: you want to get it low enough that it u
 
 Note: if you see Loss start at or suddenly jump to exactly `0`, it is likely something has gone wrong in your training process (eg model corruption).
 
+## Note: 4-Bit Monkeypatch
+
+The [4-bit LoRA monkeypatch](GPTQ-models-(4-bit-mode).md#using-loras-in-4-bit-mode) works for training, but has side effects:
+- VRAM usage is higher currently. You can reduce the `Micro Batch Size` to `1` to compensate.
+- Models do funky things. LoRAs apply themselves, or refuse to apply, or spontaneously error out, or etc. It can be helpful to reload base model or restart the WebUI between training/usage to minimize chances of anything going haywire.
+- Loading or working with multiple LoRAs at the same time doesn't currently work.
+- Generally, recognize and treat the monkeypatch as the dirty temporary hack it is - it works, but isn't very stable. It will get better in time when everything is merged upstream for full official support.
+
 ## Legacy notes
 
 LoRA training was contributed by [mcmonkey4eva](https://github.com/mcmonkey4eva) in PR [#570](https://github.com/oobabooga/text-generation-webui/pull/570).
diff --git a/modules/monkey_patch_gptq_lora.py b/modules/monkey_patch_gptq_lora.py
new file mode 100644
index 00000000..bf8d478d
--- /dev/null
+++ b/modules/monkey_patch_gptq_lora.py
@@ -0,0 +1,43 @@
+# Copied from https://github.com/johnsmith0031/alpaca_lora_4bit
+
+import sys
+from pathlib import Path
+
+sys.path.insert(0, str(Path("repositories/alpaca_lora_4bit")))
+
+import autograd_4bit
+from amp_wrapper import AMPWrapper
+from autograd_4bit import (
+    Autograd4bitQuantLinear,
+    load_llama_model_4bit_low_ram
+)
+from monkeypatch.peft_tuners_lora_monkey_patch import (
+    Linear4bitLt,
+    replace_peft_model_with_gptq_lora_model
+)
+
+from modules import shared
+from modules.GPTQ_loader import find_quantized_model_file
+
+replace_peft_model_with_gptq_lora_model()
+
+
+def load_model_llama(model_name):
+    config_path = str(Path(f'{shared.args.model_dir}/{model_name}'))
+    model_path = str(find_quantized_model_file(model_name))
+    model, tokenizer = load_llama_model_4bit_low_ram(config_path, model_path, groupsize=shared.args.groupsize, is_v1_model=False)
+    for n, m in model.named_modules():
+        if isinstance(m, Autograd4bitQuantLinear) or isinstance(m, Linear4bitLt):
+            if m.is_v1_model:
+                m.zeros = m.zeros.half()
+            m.scales = m.scales.half()
+            m.bias = m.bias.half()
+
+    autograd_4bit.use_new = True
+    autograd_4bit.auto_switch = True
+
+    model.half()
+    wrapper = AMPWrapper(model)
+    wrapper.apply_generate()
+
+    return model, tokenizer
diff --git a/modules/training.py b/modules/training.py
index fa721ff0..7558cd5d 100644
--- a/modules/training.py
+++ b/modules/training.py
@@ -270,6 +270,12 @@ def calc_trainable_parameters(model):
 
 
 def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch_size: int, batch_size: int, epochs: int, learning_rate: str, lr_scheduler_type: str, lora_rank: int, lora_alpha: int, lora_dropout: float, cutoff_len: int, dataset: str, eval_dataset: str, format: str, eval_steps: int, raw_text_file: str, overlap_len: int, newline_favor_len: int, higher_rank_limit: bool, warmup_steps: int, optimizer: str, hard_cut_string: str, train_only_after: str, stop_at_loss: float, add_eos_token: bool, min_chars: int, report_to: str):
+    if shared.args.monkey_patch:
+        from monkeypatch.peft_tuners_lora_monkey_patch import (
+            replace_peft_model_with_gptq_lora_model
+        )
+        replace_peft_model_with_gptq_lora_model()
+
     global WANT_INTERRUPT
     WANT_INTERRUPT = False
 
@@ -301,6 +307,15 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch
 
         time.sleep(5)
 
+    if shared.args.wbits > 0 and not shared.args.monkey_patch:
+        yield "LoRA training with GPTQ models requires loading with `--monkey-patch`"
+        return
+
+    elif not (shared.args.load_in_8bit or shared.args.load_in_4bit) and shared.args.wbits <= 0:
+        yield "It is highly recommended you use `--load-in-8bit` for LoRA training. *(Will continue anyway in 2 seconds, press `Interrupt` to stop.)*"
+        logger.warning("It is highly recommended you use `--load-in-8bit` for LoRA training.")
+        time.sleep(2)  # Give it a moment for the message to show in UI before continuing
+
     if cutoff_len <= 0 or micro_batch_size <= 0 or batch_size <= 0 or actual_lr <= 0 or lora_rank <= 0 or lora_alpha <= 0:
         yield "Cannot input zeroes."
         return
@@ -505,6 +520,14 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch
         yield traceback.format_exc().replace('\n', '\n\n')
         return
 
+    if shared.args.monkey_patch:
+        for n, m in lora_model.named_modules():
+            if '4bit' in str(type(m)):
+                if m.is_v1_model:
+                    m.zeros = m.zeros.half()
+
+                m.scales = m.scales.half()
+
     class Tracked():
         def __init__(self):
             self.current_steps = 0