Merge pull request #6271 from oobabooga/dev

Merge dev branch
2024-10-01 01:26:03 -04:00 · 2024-07-25 12:12:04 -03:00 · 2024-07-25 12:12:04 -03:00 · dd97a83534
commit dd97a83534
parent af839d20ac e4624fbc68
28 changed files with 135 additions and 123 deletions
--- a/Colab-TextGen-GPU.ipynb
+++ b/Colab-TextGen-GPU.ipynb
@@ -22,7 +22,7 @@
      "source": [
        "# oobabooga/text-generation-webui\n",
        "\n",
-        "After running both cells, a public gradio URL will appear at the bottom in a few minutes. You can optionally generate an API link.\n",
+        "After running both cells, a public gradio URL will appear at the bottom in around 10 minutes. You can optionally generate an API link.\n",
        "\n",
        "* Project page: https://github.com/oobabooga/text-generation-webui\n",
        "* Gradio server status: https://status.gradio.app/"
@@ -53,44 +53,28 @@
        "\n",
        "#@markdown If unsure about the branch, write \"main\" or leave it blank.\n",
        "\n",
-        "import torch\n",
+        "import os\n",
        "from pathlib import Path\n",
        "\n",
+        "os.environ.pop('PYTHONPATH', None)\n",
+        "\n",
        "if Path.cwd().name != 'text-generation-webui':\n",
-        "  print(\"Installing the webui...\")\n",
+        "  print(\"\\033[1;32;1m\\n --> Installing the web UI. This will take a while, but after the initial setup, you can download and test as many models as you like.\\033[0;37;0m\\n\")\n",
        "\n",
        "  !git clone https://github.com/oobabooga/text-generation-webui\n",
        "  %cd text-generation-webui\n",
        "\n",
-        "  torver = torch.__version__\n",
-        "  print(f\"TORCH: {torver}\")\n",
-        "  is_cuda118 = '+cu118' in torver  # 2.1.0+cu118\n",
-        "\n",
-        "  if is_cuda118:\n",
-        "    !python -m pip install --upgrade torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 --index-url https://download.pytorch.org/whl/cu118\n",
-        "  else:\n",
-        "    !python -m pip install --upgrade torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 --index-url https://download.pytorch.org/whl/cu121\n",
-        "\n",
-        "  textgen_requirements = open('requirements.txt').read().splitlines()\n",
-        "  if is_cuda118:\n",
-        "      textgen_requirements = [req.replace('+cu121', '+cu118').replace('+cu122', '+cu118') for req in textgen_requirements]\n",
-        "  with open('temp_requirements.txt', 'w') as file:\n",
-        "      file.write('\\n'.join(textgen_requirements))\n",
-        "\n",
-        "  !pip install -r temp_requirements.txt --upgrade\n",
-        "\n",
-        "  print(\"\\033[1;32;1m\\n --> If you see a warning about \\\"previously imported packages\\\", just ignore it.\\033[0;37;0m\")\n",
-        "  print(\"\\033[1;32;1m\\n --> There is no need to restart the runtime.\\n\\033[0;37;0m\")\n",
-        "\n",
-        "  try:\n",
-        "    import flash_attn\n",
-        "  except:\n",
-        "    !pip uninstall -y flash_attn\n",
+        "  # Install the project in an isolated environment\n",
+        "  !GPU_CHOICE=A \\\n",
+        "  USE_CUDA118=FALSE \\\n",
+        "  LAUNCH_AFTER_INSTALL=FALSE \\\n",
+        "  INSTALL_EXTENSIONS=FALSE \\\n",
+        "  ./start_linux.sh\n",
        "\n",
        "# Parameters\n",
-        "model_url = \"https://huggingface.co/TheBloke/MythoMax-L2-13B-GPTQ\" #@param {type:\"string\"}\n",
-        "branch = \"gptq-4bit-32g-actorder_True\" #@param {type:\"string\"}\n",
-        "command_line_flags = \"--n-gpu-layers 128 --load-in-4bit --use_double_quant\" #@param {type:\"string\"}\n",
+        "model_url = \"https://huggingface.co/turboderp/gemma-2-9b-it-exl2\" #@param {type:\"string\"}\n",
+        "branch = \"8.0bpw\" #@param {type:\"string\"}\n",
+        "command_line_flags = \"--n-gpu-layers 128 --load-in-4bit --use_double_quant --no_flash_attn\" #@param {type:\"string\"}\n",
        "api = False #@param {type:\"boolean\"}\n",
        "\n",
        "if api:\n",
@@ -116,11 +100,10 @@
        "    output_folder = \"\"\n",
        "\n",
        "# Start the web UI\n",
-        "cmd = f\"python server.py --share\"\n",
+        "cmd = f\"./start_linux.sh {command_line_flags} --share\"\n",
        "if output_folder != \"\":\n",
        "    cmd += f\" --model {output_folder}\"\n",
-        "cmd += f\" {command_line_flags}\"\n",
-        "print(cmd)\n",
+        "\n",
        "!$cmd"
      ],
      "metadata": {
--- a/cmd_linux.sh
+++ b/cmd_linux.sh
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash

 cd "$(dirname "${BASH_SOURCE[0]}")"

--- a/css/html_instruct_style.css
+++ b/css/html_instruct_style.css
@@ -39,14 +39,6 @@
    margin-bottom: 0 !important;
 }

-.dark .message-body p em {
-    color: rgb(198 202 214) !important;
-}
-
-.message-body p em {
-    color: rgb(110 110 110) !important;
-}
-
 .gradio-container .chat .assistant-message {
    padding: 20px;
    background: #f4f4f4;
--- a/css/main.css
+++ b/css/main.css
@@ -406,6 +406,14 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
    color: var(--body-text-color);
 }

+.dark .message q {
+    color: #f5b031; /* colour of quoted (<q>) text in dark mode */
+}
+
+.message q::before, .message q::after {
+    content: ""; /* suppress the browser-generated quote marks around <q> */
+}
+
 .message-body li {
    list-style-position: outside;
 }
--- a/js/main.js
+++ b/js/main.js
@@ -213,12 +213,10 @@ function doSyntaxHighlighting() {
      renderMathInElement(element, {
        delimiters: [
          { left: "$$", right: "$$", display: true },
-          { left: "$", right: "$", display: false },
          { left: "\\(", right: "\\)", display: false },
          { left: "\\[", right: "\\]", display: true },
        ],
      });
-
    });

    observer.observe(targetElement, config);
--- a/modules/LoRA.py
+++ b/modules/LoRA.py
@@ -72,8 +72,6 @@ def add_lora_autogptq(lora_names):
    else:
        if len(lora_names) > 1:
            logger.warning('AutoGPTQ can only work with 1 LoRA at the moment. Only the first one in the list will be loaded.')
-        if not shared.args.no_inject_fused_attention:
-            logger.warning('Fused Attention + AutoGPTQ may break Lora loading. Disable it.')

        peft_config = GPTQLoraConfig(
            inference_mode=True,
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -17,7 +17,11 @@ from PIL import Image
 import modules.shared as shared
 from modules import utils
 from modules.extensions import apply_extensions
-from modules.html_generator import chat_html_wrapper, make_thumbnail
+from modules.html_generator import (
+    chat_html_wrapper,
+    convert_to_markdown,
+    make_thumbnail
+)
 from modules.logging_colors import logger
 from modules.text_generation import (
    generate_reply,
@@ -368,7 +372,6 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess


 def impersonate_wrapper(text, state):
-
    static_output = chat_html_wrapper(state['history'], state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])

    prompt = generate_chat_prompt('', state, impersonate=True)
@@ -488,7 +491,7 @@ def start_new_chat(state):
        greeting = replace_character_names(state['greeting'], state['name1'], state['name2'])
        if greeting != '':
            history['internal'] += [['<|BEGIN-VISIBLE-CHAT|>', greeting]]
-            history['visible'] += [['', apply_extensions('output', greeting, state, is_chat=True)]]
+            history['visible'] += [['', apply_extensions('output', html.escape(greeting), state, is_chat=True)]]

    unique_id = datetime.now().strftime('%Y%m%d-%H-%M-%S')
    save_history(history, unique_id, state['character_menu'], state['mode'])
@@ -1044,6 +1047,8 @@ def handle_unique_id_select(state):
    history = load_history(state['unique_id'], state['character_menu'], state['mode'])
    html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])

+    convert_to_markdown.cache_clear()
+
    return [history, html]


@@ -1052,6 +1057,8 @@ def handle_start_new_chat_click(state):
    histories = find_all_histories_with_first_prompts(state)
    html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])

+    convert_to_markdown.cache_clear()
+
    return [history, html, gr.update(choices=histories, value=histories[0][1])]


@@ -1061,6 +1068,8 @@ def handle_delete_chat_confirm_click(state):
    history, unique_id = load_history_after_deletion(state, index)
    html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])

+    convert_to_markdown.cache_clear()
+
    return [
        history,
        html,
@@ -1099,6 +1108,8 @@ def handle_upload_chat_history(load_chat_history, state):

    html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])

+    convert_to_markdown.cache_clear()
+
    return [
        history,
        html,
@@ -1119,6 +1130,8 @@ def handle_character_menu_change(state):
    histories = find_all_histories_with_first_prompts(state)
    html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])

+    convert_to_markdown.cache_clear()
+
    return [
        history,
        html,
@@ -1136,6 +1149,8 @@ def handle_mode_change(state):
    histories = find_all_histories_with_first_prompts(state)
    html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])

+    convert_to_markdown.cache_clear()
+
    return [
        history,
        html,
--- a/modules/html_generator.py
+++ b/modules/html_generator.py
@@ -42,13 +42,39 @@ def fix_newlines(string):
    return string


+def replace_quotes(text):
+    """Wrap each quoted span in `text` in <q></q> tags, keeping the quote marks themselves."""
+
+    # Define a list of quote pairs (opening and closing), using HTML entities
+    quote_pairs = [
+        ('&quot;', '&quot;'),  # Double quotes
+        ('&ldquo;', '&rdquo;'),  # Unicode left and right double quotation marks
+        ('&lsquo;', '&rsquo;'),  # Unicode left and right single quotation marks
+        ('&laquo;', '&raquo;'),  # French quotes
+        ('&bdquo;', '&ldquo;'),  # German quotes
+        ('&#8220;', '&#8221;'),  # Unicode quotes (numeric entities)
+        ('&#x201C;', '&#x201D;'),  # Unicode quotes (hex entities)
+    ]
+
+    # One alternative per pair, deliberately without capture groups: with per-pair groups
+    # only the first pair fills groups 1-3, so any other pair rendered as "NoneNoneNone"
+    pattern = '|'.join(f'{re.escape(open_q)}.*?{re.escape(close_q)}' for open_q, close_q in quote_pairs)
+
+    # DOTALL lets a quotation span newlines; the whole match (quote marks included)
+    # is wrapped, whichever pair produced it
+    return re.sub(pattern, lambda m: f'<q>{m.group(0)}</q>', text, flags=re.DOTALL)
+
+
 def replace_blockquote(m):
    return m.group().replace('\n', '\n> ').replace('\\begin{blockquote}', '').replace('\\end{blockquote}', '')


-@functools.lru_cache(maxsize=4096)
+@functools.lru_cache(maxsize=None)
 def convert_to_markdown(string):

+    # Quote to <q></q>
+    string = replace_quotes(string)
+
    # Blockquote
    string = re.sub(r'(^|[\n])&gt;', r'\1>', string)
    pattern = re.compile(r'\\begin{blockquote}(.*?)\\end{blockquote}', re.DOTALL)
@@ -124,6 +150,7 @@ def convert_to_markdown_wrapped(string, use_cache=True):


 def generate_basic_html(string):
+    convert_to_markdown.cache_clear()
    string = convert_to_markdown(string)
    string = f'<style>{readable_css}</style><div class="readable-container">{string}</div>'
    return string
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -127,15 +127,6 @@ loaders_and_params = OrderedDict({
        'no_use_fast',
        'autogptq_info',
    ],
-    'AutoAWQ': [
-        'cpu_memory',
-        'gpu_memory',
-        'auto_devices',
-        'max_seq_len',
-        'no_inject_fused_attention',
-        'trust_remote_code',
-        'no_use_fast',
-    ],
    'HQQ': [
        'hqq_backend',
        'trust_remote_code',
@@ -200,7 +191,6 @@ def transformers_samplers():
 loaders_samplers = {
    'Transformers': transformers_samplers(),
    'AutoGPTQ': transformers_samplers(),
-    'AutoAWQ': transformers_samplers(),
    'HQQ': transformers_samplers(),
    'ExLlamav2': {
        'temperature',
--- a/modules/models.py
+++ b/modules/models.py
@@ -75,7 +75,6 @@ def load_model(model_name, loader=None):
        'llamacpp_HF': llamacpp_HF_loader,
        'ExLlamav2': ExLlamav2_loader,
        'ExLlamav2_HF': ExLlamav2_HF_loader,
-        'AutoAWQ': AutoAWQ_loader,
        'HQQ': HQQ_loader,
        'TensorRT-LLM': TensorRT_LLM_loader,
    }
@@ -292,24 +291,6 @@ def llamacpp_HF_loader(model_name):
    return model


-def AutoAWQ_loader(model_name):
-    from awq import AutoAWQForCausalLM
-
-    model_dir = Path(f'{shared.args.model_dir}/{model_name}')
-
-    model = AutoAWQForCausalLM.from_quantized(
-        quant_path=model_dir,
-        max_new_tokens=shared.args.max_seq_len,
-        trust_remote_code=shared.args.trust_remote_code,
-        fuse_layers=not shared.args.no_inject_fused_attention,
-        max_memory=get_max_memory_dict(),
-        batch_size=1,
-        safetensors=any(model_dir.glob('*.safetensors')),
-    )
-
-    return model
-
-
 def AutoGPTQ_loader(model_name):
    import modules.AutoGPTQ_loader

--- a/modules/models_settings.py
+++ b/modules/models_settings.py
@@ -180,8 +180,6 @@ def infer_loader(model_name, model_settings):
        loader = None
    elif (path_to_model / 'quantize_config.json').exists() or ('wbits' in model_settings and isinstance(model_settings['wbits'], int) and model_settings['wbits'] > 0):
        loader = 'ExLlamav2_HF'
-    elif (path_to_model / 'quant_config.json').exists() or re.match(r'.*-awq', model_name.lower()):
-        loader = 'AutoAWQ'
    elif len(list(path_to_model.glob('*.gguf'))) > 0 and path_to_model.is_dir() and (path_to_model / 'tokenizer_config.json').exists():
        loader = 'llamacpp_HF'
    elif len(list(path_to_model.glob('*.gguf'))) > 0:
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -89,7 +89,7 @@ group.add_argument('--idle-timeout', type=int, default=0, help='Unload model aft

 # Model loader
 group = parser.add_argument_group('Model loader')
-group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2, AutoGPTQ, AutoAWQ.')
+group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2, AutoGPTQ.')

 # Transformers/Accelerate
 group = parser.add_argument_group('Transformers/Accelerate')
@@ -160,10 +160,6 @@ group.add_argument('--disable_exllamav2', action='store_true', help='Disable ExL
 group.add_argument('--wbits', type=int, default=0, help='Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported.')
 group.add_argument('--groupsize', type=int, default=-1, help='Group size.')

-# AutoAWQ
-group = parser.add_argument_group('AutoAWQ')
-group.add_argument('--no_inject_fused_attention', action='store_true', help='Disable the use of fused attention, which will use less VRAM at the cost of slower inference.')
-
 # HQQ
 group = parser.add_argument_group('HQQ')
 group.add_argument('--hqq-backend', type=str, default='PYTORCH_COMPILE', help='Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN.')
@@ -217,6 +213,7 @@ group.add_argument('--model_type', type=str, help='DEPRECATED')
 group.add_argument('--pre_layer', type=int, nargs='+', help='DEPRECATED')
 group.add_argument('--checkpoint', type=str, help='DEPRECATED')
 group.add_argument('--monkey-patch', action='store_true', help='DEPRECATED')
+group.add_argument('--no_inject_fused_attention', action='store_true', help='DEPRECATED')

 args = parser.parse_args()
 args_defaults = parser.parse_args([])
@@ -267,8 +264,6 @@ def fix_loader_name(name):
        return 'ExLlamav2'
    elif name in ['exllamav2-hf', 'exllamav2_hf', 'exllama-v2-hf', 'exllama_v2_hf', 'exllama-v2_hf', 'exllama2-hf', 'exllama2_hf', 'exllama-2-hf', 'exllama_2_hf', 'exllama-2_hf']:
        return 'ExLlamav2_HF'
-    elif name in ['autoawq', 'awq', 'auto-awq']:
-        return 'AutoAWQ'
    elif name in ['hqq']:
        return 'HQQ'
    elif name in ['tensorrt', 'tensorrtllm', 'tensorrt_llm', 'tensorrt-llm', 'tensort', 'tensortllm']:
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -78,7 +78,6 @@ def list_model_elements():
        'groupsize',
        'triton',
        'desc_act',
-        'no_inject_fused_attention',
        'no_inject_fused_mlp',
        'no_use_cuda_fp16',
        'disable_exllama',
--- a/modules/ui_chat.py
+++ b/modules/ui_chat.py
@@ -84,13 +84,13 @@ def create_ui():
                    shared.gradio['start_with'] = gr.Textbox(label='Start reply with', placeholder='Sure thing!', value=shared.settings['start_with'], elem_classes=['add_scrollbar'])

                with gr.Row():
-                    shared.gradio['mode'] = gr.Radio(choices=['chat', 'chat-instruct', 'instruct'], label='Mode', info='Defines how the chat prompt is generated. In instruct and chat-instruct modes, the instruction template Parameters > Instruction template is used.', elem_id='chat-mode')
+                    shared.gradio['mode'] = gr.Radio(choices=['chat', 'chat-instruct', 'instruct'], value=shared.settings['mode'] if shared.settings['mode'] in ['chat', 'chat-instruct'] else None, label='Mode', info='Defines how the chat prompt is generated. In instruct and chat-instruct modes, the instruction template Parameters > Instruction template is used.', elem_id='chat-mode')

                with gr.Row():
                    shared.gradio['chat_style'] = gr.Dropdown(choices=utils.get_available_chat_styles(), label='Chat style', value=shared.settings['chat_style'], visible=shared.settings['mode'] != 'instruct')

                with gr.Row():
-                    shared.gradio['chat-instruct_command'] = gr.Textbox(value=shared.settings['chat-instruct_command'], lines=12, label='Command for chat-instruct mode', info='<|character|> and <|prompt|> get replaced with the bot name and the regular chat prompt respectively.', visible=False, elem_classes=['add_scrollbar'])
+                    shared.gradio['chat-instruct_command'] = gr.Textbox(value=shared.settings['chat-instruct_command'], lines=12, label='Command for chat-instruct mode', info='<|character|> and <|prompt|> get replaced with the bot name and the regular chat prompt respectively.', visible=shared.settings['mode'] == 'chat-instruct', elem_classes=['add_scrollbar'])


 def create_chat_settings_ui():
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -127,7 +127,6 @@ def create_ui():
                            shared.gradio['no_offload_kqv'] = gr.Checkbox(label="no_offload_kqv", value=shared.args.no_offload_kqv, info='Do not offload the  K, Q, V to the GPU. This saves VRAM but reduces the performance.')
                            shared.gradio['no_mul_mat_q'] = gr.Checkbox(label="no_mul_mat_q", value=shared.args.no_mul_mat_q, info='Disable the mulmat kernels.')
                            shared.gradio['triton'] = gr.Checkbox(label="triton", value=shared.args.triton)
-                            shared.gradio['no_inject_fused_attention'] = gr.Checkbox(label="no_inject_fused_attention", value=shared.args.no_inject_fused_attention, info='Disable fused attention. Fused attention improves inference performance but uses more VRAM. Fuses layers for AutoAWQ. Disable if running low on VRAM.')
                            shared.gradio['no_inject_fused_mlp'] = gr.Checkbox(label="no_inject_fused_mlp", value=shared.args.no_inject_fused_mlp, info='Affects Triton only. Disable fused MLP. Fused MLP improves performance but uses more VRAM. Disable if running low on VRAM.')
                            shared.gradio['no_use_cuda_fp16'] = gr.Checkbox(label="no_use_cuda_fp16", value=shared.args.no_use_cuda_fp16, info='This can make models faster on some systems.')
                            shared.gradio['desc_act'] = gr.Checkbox(label="desc_act", value=shared.args.desc_act, info='\'desc_act\', \'wbits\', and \'groupsize\' are used for old models without a quantize_config.json.')
--- a/one_click.py
+++ b/one_click.py
@@ -388,7 +388,12 @@ def update_requirements(initial_installation=False, pull=True):
    # Prepare the requirements file
    textgen_requirements = open(requirements_file).read().splitlines()
    if is_cuda118:
-        textgen_requirements = [req.replace('+cu121', '+cu118').replace('+cu122', '+cu118') for req in textgen_requirements if "auto-gptq" not in req]
+        textgen_requirements = [
+            req.replace('+cu121', '+cu118').replace('+cu122', '+cu118')
+            for req in textgen_requirements
+            if "auto-gptq" not in req.lower() and "autoawq" not in req.lower()
+        ]
+
    if is_windows() and is_cuda118:  # No flash-attention on Windows for CUDA 11
        textgen_requirements = [req for req in textgen_requirements if 'oobabooga/flash-attention' not in req]

--- a/requirements.txt
+++ b/requirements.txt
@@ -24,7 +24,7 @@ safetensors==0.4.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.42.*
+transformers==4.43.*
 tqdm
 wandb

@@ -53,12 +53,20 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/te
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.83+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"

 # CUDA wheels
-https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7+cu121.torch2.2.2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7+cu121.torch2.2.2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7+cu121.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7+cu121.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8+cu121.torch2.2.2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8+cu121.torch2.2.2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8+cu121.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8+cu121.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
 https://github.com/oobabooga/flash-attention/releases/download/v2.6.1/flash_attn-2.6.1+cu122torch2.2.2cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
 https://github.com/oobabooga/flash-attention/releases/download/v2.6.1/flash_attn-2.6.1+cu122torch2.2.2cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
 https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.1/flash_attn-2.6.1+cu123torch2.2cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.1/flash_attn-2.6.1+cu123torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
--- a/requirements_amd.txt
+++ b/requirements_amd.txt
@@ -21,7 +21,7 @@ safetensors==0.4.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.42.*
+transformers==4.43.*
 tqdm
 wandb

@@ -40,6 +40,10 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cp
 # AMD wheels
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.83+rocm5.6.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.83+rocm5.6.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7+rocm5.6.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7+rocm5.6.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8+rocm5.6.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8+rocm5.6.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
+https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7+rocm561-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7+rocm561-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
--- a/requirements_amd_noavx2.txt
+++ b/requirements_amd_noavx2.txt
@@ -21,7 +21,7 @@ safetensors==0.4.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.42.*
+transformers==4.43.*
 tqdm
 wandb

@@ -38,6 +38,10 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cp
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.83+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"

 # AMD wheels
-https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7+rocm5.6.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7+rocm5.6.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8+rocm5.6.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8+rocm5.6.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
+https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7+rocm561-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7+rocm561-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
--- a/requirements_apple_intel.txt
+++ b/requirements_apple_intel.txt
@@ -21,7 +21,7 @@ safetensors==0.4.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.42.*
+transformers==4.43.*
 tqdm
 wandb

@ -36,4 +36,4 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/me
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.83-cp310-cp310-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10"
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.83-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.83-cp310-cp310-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10"
-https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7-py3-none-any.whl
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8-py3-none-any.whl
--- a/requirements_apple_silicon.txt
+++ b/requirements_apple_silicon.txt
@ -21,7 +21,7 @@ safetensors==0.4.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.42.*
+transformers==4.43.*
 tqdm
 wandb

@ -38,4 +38,4 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/me
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.83-cp310-cp310-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.10"
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.83-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.83-cp310-cp310-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10"
-https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7-py3-none-any.whl
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8-py3-none-any.whl
--- a/requirements_cpu_only.txt
+++ b/requirements_cpu_only.txt
@ -21,7 +21,7 @@ safetensors==0.4.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.42.*
+transformers==4.43.*
 tqdm
 wandb

--- a/requirements_cpu_only_noavx2.txt
+++ b/requirements_cpu_only_noavx2.txt
@ -21,7 +21,7 @@ safetensors==0.4.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.42.*
+transformers==4.43.*
 tqdm
 wandb

--- a/requirements_noavx2.txt
+++ b/requirements_noavx2.txt
@ -24,7 +24,7 @@ safetensors==0.4.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.42.*
+transformers==4.43.*
 tqdm
 wandb

@ -53,12 +53,20 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/te
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.83+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"

 # CUDA wheels
-https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7+cu121.torch2.2.2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7+cu121.torch2.2.2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7+cu121.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7+cu121.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8+cu121.torch2.2.2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8+cu121.torch2.2.2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8+cu121.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8+cu121.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
 https://github.com/oobabooga/flash-attention/releases/download/v2.6.1/flash_attn-2.6.1+cu122torch2.2.2cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
 https://github.com/oobabooga/flash-attention/releases/download/v2.6.1/flash_attn-2.6.1+cu122torch2.2.2cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
 https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.1/flash_attn-2.6.1+cu123torch2.2cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.1/flash_attn-2.6.1+cu123torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
--- a/requirements_nowheels.txt
+++ b/requirements_nowheels.txt
@ -21,7 +21,7 @@ safetensors==0.4.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.42.*
+transformers==4.43.*
 tqdm
 wandb

--- a/server.py
+++ b/server.py
@ -90,7 +90,7 @@ def create_interface():
    # Force some events to be triggered on page load
    shared.persistent_interface_state.update({
        'loader': shared.args.loader or 'Transformers',
-        'mode': shared.settings['mode'],
+        'mode': shared.settings['mode'] if shared.settings['mode'] == 'instruct' else gr.update(),
        'character_menu': shared.args.character or shared.settings['character'],
        'instruction_template_str': shared.settings['instruction_template_str'],
        'prompt_menu-default': shared.settings['prompt-default'],
--- a/start_linux.sh
+++ b/start_linux.sh
@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash

 cd "$(dirname "${BASH_SOURCE[0]}")"

--- a/update_wizard_linux.sh
+++ b/update_wizard_linux.sh
@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash

 cd "$(dirname "${BASH_SOURCE[0]}")"