From e0ca49ed9cd231da6ed2de52da435428622f7904 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 18 Nov 2023 00:31:27 -0300
Subject: [PATCH 01/18] Bump llama-cpp-python to 0.2.18 (2nd attempt) (#4637)

* Update requirements*.txt

* Add back seed
---
 README.md                        |  1 -
 modules/llamacpp_hf.py           | 11 +++++------
 modules/llamacpp_model.py        | 10 +++++-----
 modules/loaders.py               |  2 +-
 modules/shared.py                |  2 +-
 modules/ui.py                    |  1 -
 modules/ui_model_menu.py         |  1 -
 requirements.txt                 | 32 ++++++++++++++++----------------
 requirements_amd.txt             | 24 ++++++++++++------------
 requirements_amd_noavx2.txt      | 16 ++++++++--------
 requirements_apple_intel.txt     | 28 ++++++++++++----------------
 requirements_apple_silicon.txt   | 32 ++++++++++++++++----------------
 requirements_cpu_only.txt        | 16 ++++++++--------
 requirements_cpu_only_noavx2.txt | 16 ++++++++--------
 requirements_noavx2.txt          | 32 ++++++++++++++++----------------
 15 files changed, 108 insertions(+), 116 deletions(-)

diff --git a/README.md b/README.md
index 00f5ce81..3726b26c 100644
--- a/README.md
+++ b/README.md
@@ -325,7 +325,6 @@ Optionally, you can use the following command-line flags:
 | `--mlock` | Force the system to keep the model in RAM. |
 | `--n-gpu-layers N_GPU_LAYERS` | Number of layers to offload to the GPU. |
 | `--tensor_split TENSOR_SPLIT` | Split the model across multiple GPUs. Comma-separated list of proportions. Example: 18,17. |
-| `--llama_cpp_seed SEED` | Seed for llama-cpp models. Default is 0 (random). |
 | `--numa` | Activate NUMA task allocation for llama.cpp. |
 | `--logits_all`| Needs to be set for perplexity evaluation to work. Otherwise, ignore it, as it makes prompt processing slower. |
 | `--cache-capacity CACHE_CAPACITY` | Maximum cache capacity (llama-cpp-python). Examples: 2000MiB, 2GiB. When provided without units, bytes will be assumed. |
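For context on the table above: the llama.cpp loader flags that remain map closely onto llama-cpp-python's `Llama` constructor. A minimal sketch of that mapping follows; the model path and values are placeholders for illustration, not anything taken from the web UI's own loading code.

```python
from llama_cpp import Llama

# Rough mapping from the loader flags in the README table to llama-cpp-python
# constructor arguments. The path and values below are placeholders.
model = Llama(
    model_path="models/example-model.Q4_K_M.gguf",  # hypothetical GGUF file
    n_ctx=4096,             # --n_ctx: prompt context length
    n_gpu_layers=35,        # --n-gpu-layers: layers offloaded to the GPU
    tensor_split=[18, 17],  # --tensor_split: per-GPU split proportions
    use_mlock=True,         # --mlock: keep the model in RAM
    logits_all=False,       # --logits_all: only needed for perplexity evaluation
)
```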
diff --git a/modules/llamacpp_hf.py b/modules/llamacpp_hf.py
index 5d42e94c..06a66302 100644
--- a/modules/llamacpp_hf.py
+++ b/modules/llamacpp_hf.py
@@ -39,7 +39,7 @@ class LlamacppHF(PreTrainedModel):
             'n_tokens': self.model.n_tokens,
             'input_ids': self.model.input_ids,
             'scores': self.model.scores,
-            'ctx': self.model.ctx
+            'ctx': self.model._ctx
         }

         if shared.args.cfg_cache:
@@ -65,7 +65,7 @@ class LlamacppHF(PreTrainedModel):
             'n_tokens': self.model.n_tokens,
             'input_ids': self.model.input_ids,
             'scores': self.model.scores,
-            'ctx': self.model.ctx
+            'ctx': self.model._ctx
         })

     def save_negative_cache(self):
@@ -73,20 +73,20 @@ class LlamacppHF(PreTrainedModel):
             'n_tokens': self.model.n_tokens,
             'input_ids': self.model.input_ids,
             'scores': self.model.scores,
-            'ctx': self.model.ctx
+            'ctx': self.model._ctx
         })

     def load_cache(self):
         self.model.n_tokens = self.llamacpp_cache['n_tokens']
         self.model.input_ids = self.llamacpp_cache['input_ids']
         self.model.scores = self.llamacpp_cache['scores']
-        self.model.ctx = self.llamacpp_cache['ctx']
+        self.model._ctx = self.llamacpp_cache['ctx']

     def load_negative_cache(self):
         self.model.n_tokens = self.llamacpp_cache_negative['n_tokens']
         self.model.input_ids = self.llamacpp_cache_negative['input_ids']
         self.model.scores = self.llamacpp_cache_negative['scores']
-        self.model.ctx = self.llamacpp_cache_negative['ctx']
+        self.model._ctx = self.llamacpp_cache_negative['ctx']

     @property
     def device(self) -> torch.device:
@@ -192,7 +192,6 @@ class LlamacppHF(PreTrainedModel):
         params = {
             'model_path': str(model_file),
             'n_ctx': shared.args.n_ctx,
-            'seed': int(shared.args.llama_cpp_seed),
             'n_threads': shared.args.threads or None,
             'n_threads_batch': shared.args.threads_batch or None,
             'n_batch': shared.args.n_batch,
diff --git a/modules/llamacpp_model.py b/modules/llamacpp_model.py
index 93f22e95..4f72c4eb 100644
--- a/modules/llamacpp_model.py
+++ b/modules/llamacpp_model.py
@@ -74,7 +74,6 @@ class LlamaCppModel:
         params = {
             'model_path': str(path),
             'n_ctx': shared.args.n_ctx,
-            'seed': int(shared.args.llama_cpp_seed),
             'n_threads': shared.args.threads or None,
             'n_threads_batch': shared.args.threads_batch or None,
             'n_batch': shared.args.n_batch,
@@ -144,15 +143,16 @@ class LlamaCppModel:
             max_tokens=state['max_new_tokens'],
             temperature=state['temperature'],
             top_p=state['top_p'],
-            top_k=state['top_k'],
-            repeat_penalty=state['repetition_penalty'],
-            presence_penalty=state['presence_penalty'],
             frequency_penalty=state['frequency_penalty'],
+            presence_penalty=state['presence_penalty'],
+            repeat_penalty=state['repetition_penalty'],
+            top_k=state['top_k'],
+            stream=True,
+            seed=int(state['seed']) if state['seed'] != -1 else None,
             tfs_z=state['tfs'],
             mirostat_mode=int(state['mirostat_mode']),
             mirostat_tau=state['mirostat_tau'],
             mirostat_eta=state['mirostat_eta'],
-            stream=True,
             logits_processor=logit_processors,
             grammar=self.grammar
         )
diff --git a/modules/loaders.py b/modules/loaders.py
index b3763f06..bf95a6f2 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -99,7 +99,6 @@ loaders_and_params = OrderedDict({
         'no_mmap',
         'mlock',
         'no_mul_mat_q',
-        'llama_cpp_seed',
         'alpha_value',
         'rope_freq_base',
         'compress_pos_emb',
@@ -366,6 +365,7 @@ loaders_samplers = {
        'repetition_penalty',
        'presence_penalty',
        'frequency_penalty',
+       'seed',
        'mirostat_mode',
        'mirostat_tau',
        'mirostat_eta',
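The practical effect of the llamacpp_model.py change above is that the seed is no longer fixed at load time: the pinned llama-cpp-python 0.2.18 accepts a `seed` argument on the generation call itself, and `None` (used when the UI seed is -1) means a fresh random seed per call. A standalone sketch of the same idea, assuming a local GGUF file and the `create_completion` API, could look like this:

```python
from llama_cpp import Llama

llm = Llama(model_path="models/example-model.gguf")  # placeholder path

def generate(prompt: str, seed: int = -1) -> str:
    # Same convention as the web UI: -1 means "pick a random seed for this call".
    result = llm.create_completion(
        prompt,
        max_tokens=64,
        temperature=0.7,
        seed=int(seed) if seed != -1 else None,
    )
    return result["choices"][0]["text"]

print(generate("Once upon a time", seed=42))  # same seed, repeatable output
print(generate("Once upon a time"))           # different random seed each run
```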
diff --git a/modules/shared.py b/modules/shared.py
index d40a1e77..54e72a6c 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -112,7 +112,6 @@ parser.add_argument('--no-mmap', action='store_true', help='Prevent mmap from be
 parser.add_argument('--mlock', action='store_true', help='Force the system to keep the model in RAM.')
 parser.add_argument('--n-gpu-layers', type=int, default=0, help='Number of layers to offload to the GPU.')
 parser.add_argument('--tensor_split', type=str, default=None, help='Split the model across multiple GPUs. Comma-separated list of proportions. Example: 18,17.')
-parser.add_argument('--llama_cpp_seed', type=int, default=0, help='Seed for llama-cpp models. Default is 0 (random).')
 parser.add_argument('--numa', action='store_true', help='Activate NUMA task allocation for llama.cpp.')
 parser.add_argument('--logits_all', action='store_true', help='Needs to be set for perplexity evaluation to work. Otherwise, ignore it, as it makes prompt processing slower.')
 parser.add_argument('--cache-capacity', type=str, help='Maximum cache capacity (llama-cpp-python). Examples: 2000MiB, 2GiB. When provided without units, bytes will be assumed.')
@@ -182,6 +181,7 @@ parser.add_argument('--no-stream', action='store_true', help='DEPRECATED')
 parser.add_argument('--mul_mat_q', action='store_true', help='DEPRECATED')
 parser.add_argument('--api-blocking-port', type=int, default=5000, help='DEPRECATED')
 parser.add_argument('--api-streaming-port', type=int, default=5005, help='DEPRECATED')
+parser.add_argument('--llama_cpp_seed', type=int, default=0, help='DEPRECATED')
 parser.add_argument('--use_fast', action='store_true', help='DEPRECATED')

 args = parser.parse_args()
diff --git a/modules/ui.py b/modules/ui.py
index de649668..383bc66f 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -80,7 +80,6 @@ def list_model_elements():
         'n_gpu_layers',
         'tensor_split',
         'n_ctx',
-        'llama_cpp_seed',
         'gpu_split',
         'max_seq_len',
         'compress_pos_emb',
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index 67396b78..12edeed9 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -120,7 +120,6 @@ def create_ui():
             shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit)
             shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant)
             shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='Split the model across multiple GPUs, comma-separated list of proportions, e.g. 18,17')
-            shared.gradio['llama_cpp_seed'] = gr.Number(label='Seed (0 for random)', value=shared.args.llama_cpp_seed)
             shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='To enable this option, start the web UI with the --trust-remote-code flag. It is necessary for some models.', interactive=shared.args.trust_remote_code)
             shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Create an additional cache for CFG negative prompts.')
             shared.gradio['logits_all'] = gr.Checkbox(label="logits_all", value=shared.args.logits_all, info='Needs to be set for perplexity evaluation to work. Otherwise, ignore it, as it makes prompt processing slower.')
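The remainder of the patch is mechanical: every requirements variant below swaps the pinned llama-cpp-python 0.2.11 wheels for 0.2.18 builds, with the environment markers after each URL selecting the right wheel for the OS, CPU architecture, and Python version. After reinstalling the requirements, a quick sanity check along these lines can confirm that the bump took effect; it assumes the package exposes its version string as `llama_cpp.__version__`.

```python
# Quick post-install check that the pinned wheel is the one actually in use.
import llama_cpp

print(llama_cpp.__version__)  # expected to print 0.2.18 if the new pin applied
```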
diff --git a/requirements.txt b/requirements.txt
index d8e4d9f2..6377723e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -27,14 +27,14 @@ bitsandbytes==0.41.1; platform_system != "Windows"
 https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.1-py3-none-win_amd64.whl; platform_system == "Windows"

 # llama-cpp-python (CPU only, AVX2)
-https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9"
-https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8"
-https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9"
-https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp38-cp38-win_amd64.whl; platform_system == "Windows" and python_version == "3.8"
+https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp311-cp311-manylinux_2_17_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp310-cp310-manylinux_2_17_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp39-cp39-manylinux_2_17_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9"
+https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp38-cp38-manylinux_2_17_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8"
+https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9"
+https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp38-cp38-win_amd64.whl; platform_system == "Windows" and python_version == "3.8" # CUDA wheels https://github.com/jllllll/AutoGPTQ/releases/download/v0.5.1/auto_gptq-0.5.1+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" @@ -67,14 +67,14 @@ https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.2/flash_attn https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.2/flash_attn-2.3.2+cu122torch2.1cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.2/flash_attn-2.3.2+cu122torch2.1cxx11abiFALSE-cp39-cp39-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9" https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.2/flash_attn-2.3.2+cu122torch2.1cxx11abiFALSE-cp38-cp38-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.11+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.11+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.11+cu121-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.11+cu121-cp38-cp38-win_amd64.whl; platform_system == "Windows" and python_version == "3.8" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.11+cu121-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.11+cu121-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.11+cu121-cp39-cp39-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.11+cu121-cp38-cp38-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.18+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.18+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" 
+https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.18+cu121-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.18+cu121-cp38-cp38-win_amd64.whl; platform_system == "Windows" and python_version == "3.8" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.18+cu121-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.18+cu121-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.18+cu121-cp39-cp39-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.18+cu121-cp38-cp38-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8" https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9" diff --git a/requirements_amd.txt b/requirements_amd.txt index 913758bf..8f8f44ff 100644 --- a/requirements_amd.txt +++ b/requirements_amd.txt @@ -27,14 +27,14 @@ bitsandbytes==0.38.1; platform_system != "Windows" https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.38.1-py3-none-win_amd64.whl; platform_system == "Windows" # llama-cpp-python (CPU only, AVX2) -https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9" -https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8" -https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" 
-https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9" -https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp38-cp38-win_amd64.whl; platform_system == "Windows" and python_version == "3.8" +https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp311-cp311-manylinux_2_17_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp310-cp310-manylinux_2_17_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp39-cp39-manylinux_2_17_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9" +https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp38-cp38-manylinux_2_17_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8" +https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9" +https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp38-cp38-win_amd64.whl; platform_system == "Windows" and python_version == "3.8" # AMD wheels https://github.com/jllllll/AutoGPTQ/releases/download/v0.5.1/auto_gptq-0.5.1+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" @@ -45,10 +45,10 @@ https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+rocm5 https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+rocm5.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+rocm5.6-cp39-cp39-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9" https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+rocm5.6-cp38-cp38-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.11+rocm5.6.1-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.11+rocm5.6.1-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" 
-https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.11+rocm5.6.1-cp39-cp39-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.11+rocm5.6.1-cp38-cp38-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.18+rocm5.6.1-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.18+rocm5.6.1-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.18+rocm5.6.1-cp39-cp39-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.18+rocm5.6.1-cp38-cp38-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8" https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+rocm5.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+rocm5.6-cp39-cp39-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9" diff --git a/requirements_amd_noavx2.txt b/requirements_amd_noavx2.txt index 0c18296c..455eba44 100644 --- a/requirements_amd_noavx2.txt +++ b/requirements_amd_noavx2.txt @@ -27,14 +27,14 @@ bitsandbytes==0.38.1; platform_system != "Windows" https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.38.1-py3-none-win_amd64.whl; platform_system == "Windows" # llama-cpp-python (CPU only, no AVX2) -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.11+cpuavx-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.11+cpuavx-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.11+cpuavx-cp39-cp39-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.11+cpuavx-cp38-cp38-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8" 
-https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.11+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.11+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.11+cpuavx-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.11+cpuavx-cp38-cp38-win_amd64.whl; platform_system == "Windows" and python_version == "3.8" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.18+cpuavx-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.18+cpuavx-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.18+cpuavx-cp39-cp39-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.18+cpuavx-cp38-cp38-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.18+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.18+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.18+cpuavx-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.18+cpuavx-cp38-cp38-win_amd64.whl; platform_system == "Windows" and python_version == "3.8" # AMD wheels https://github.com/jllllll/AutoGPTQ/releases/download/v0.5.1/auto_gptq-0.5.1+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" diff --git a/requirements_apple_intel.txt b/requirements_apple_intel.txt index 83e1db0e..d5dfb525 100644 --- a/requirements_apple_intel.txt +++ b/requirements_apple_intel.txt @@ -27,19 +27,15 @@ bitsandbytes==0.41.1; platform_system != "Windows" https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.1-py3-none-win_amd64.whl; platform_system == "Windows" # Mac wheels -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp311-cp311-macosx_11_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.11" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp310-cp310-macosx_11_0_x86_64.whl; 
platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.10" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp39-cp39-macosx_11_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.9" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp38-cp38-macosx_11_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.8" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp311-cp311-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.11" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp310-cp310-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp39-cp39-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.9" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp38-cp38-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.8" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp311-cp311-macosx_13_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp310-cp310-macosx_13_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.10" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp39-cp39-macosx_13_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.9" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp38-cp38-macosx_13_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.8" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp310-cp310-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp39-cp39-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.9" 
-https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp38-cp38-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.8" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp311-cp311-macosx_11_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.11" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp310-cp310-macosx_11_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.10" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp39-cp39-macosx_11_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.9" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp38-cp38-macosx_11_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.8" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp311-cp311-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.11" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp310-cp310-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp39-cp39-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.9" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp38-cp38-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.8" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp310-cp310-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp39-cp39-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.9" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp38-cp38-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.8" diff --git a/requirements_apple_silicon.txt b/requirements_apple_silicon.txt index 35760f63..6fe6a76a 100644 --- 
a/requirements_apple_silicon.txt +++ b/requirements_apple_silicon.txt @@ -27,19 +27,19 @@ bitsandbytes==0.41.1; platform_system != "Windows" https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.1-py3-none-win_amd64.whl; platform_system == "Windows" # Mac wheels -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp311-cp311-macosx_11_0_arm64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.11" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp310-cp310-macosx_11_0_arm64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.10" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp39-cp39-macosx_11_0_arm64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.9" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp38-cp38-macosx_11_0_arm64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.8" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp311-cp311-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.11" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp310-cp310-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp39-cp39-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.9" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp38-cp38-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.8" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp310-cp310-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.10" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp39-cp39-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.9" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp38-cp38-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.8" 
-https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp310-cp310-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp39-cp39-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.9" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp38-cp38-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.8" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp311-cp311-macosx_11_0_arm64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.11" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp310-cp310-macosx_11_0_arm64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.10" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp39-cp39-macosx_11_0_arm64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.9" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp38-cp38-macosx_11_0_arm64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.8" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp311-cp311-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.11" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp310-cp310-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp39-cp39-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.9" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp38-cp38-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.8" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" 
+https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp310-cp310-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.10" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp39-cp39-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.9" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp38-cp38-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.8" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp310-cp310-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp39-cp39-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.9" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp38-cp38-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.8" diff --git a/requirements_cpu_only.txt b/requirements_cpu_only.txt index a3722d3a..56747a70 100644 --- a/requirements_cpu_only.txt +++ b/requirements_cpu_only.txt @@ -27,11 +27,11 @@ bitsandbytes==0.41.1; platform_system != "Windows" https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.1-py3-none-win_amd64.whl; platform_system == "Windows" # llama-cpp-python (CPU only, AVX2) -https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9" -https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8" -https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp310-cp310-win_amd64.whl; platform_system == 
"Windows" and python_version == "3.10" -https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9" -https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp38-cp38-win_amd64.whl; platform_system == "Windows" and python_version == "3.8" +https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp311-cp311-manylinux_2_17_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp310-cp310-manylinux_2_17_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp39-cp39-manylinux_2_17_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9" +https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp38-cp38-manylinux_2_17_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8" +https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9" +https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp38-cp38-win_amd64.whl; platform_system == "Windows" and python_version == "3.8" diff --git a/requirements_cpu_only_noavx2.txt b/requirements_cpu_only_noavx2.txt index 401ecd6a..412f974f 100644 --- a/requirements_cpu_only_noavx2.txt +++ b/requirements_cpu_only_noavx2.txt @@ -27,11 +27,11 @@ bitsandbytes==0.41.1; platform_system != "Windows" https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.1-py3-none-win_amd64.whl; platform_system == "Windows" # llama-cpp-python (CPU only, no AVX2) -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.11+cpuavx-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.11+cpuavx-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.11+cpuavx-cp39-cp39-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.11+cpuavx-cp38-cp38-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.11+cpuavx-cp311-cp311-win_amd64.whl; platform_system 
== "Windows" and python_version == "3.11" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.11+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.11+cpuavx-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.11+cpuavx-cp38-cp38-win_amd64.whl; platform_system == "Windows" and python_version == "3.8" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.18+cpuavx-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.18+cpuavx-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.18+cpuavx-cp39-cp39-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.18+cpuavx-cp38-cp38-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.18+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.18+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.18+cpuavx-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.18+cpuavx-cp38-cp38-win_amd64.whl; platform_system == "Windows" and python_version == "3.8" diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt index 11f52509..32abd7d9 100644 --- a/requirements_noavx2.txt +++ b/requirements_noavx2.txt @@ -27,14 +27,14 @@ bitsandbytes==0.41.1; platform_system != "Windows" https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.1-py3-none-win_amd64.whl; platform_system == "Windows" # llama-cpp-python (CPU only, no AVX2) -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.11+cpuavx-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.11+cpuavx-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.11+cpuavx-cp39-cp39-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9" 
-https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.11+cpuavx-cp38-cp38-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.11+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.11+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.11+cpuavx-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.11+cpuavx-cp38-cp38-win_amd64.whl; platform_system == "Windows" and python_version == "3.8" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.18+cpuavx-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.18+cpuavx-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.18+cpuavx-cp39-cp39-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.18+cpuavx-cp38-cp38-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.18+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.18+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.18+cpuavx-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.18+cpuavx-cp38-cp38-win_amd64.whl; platform_system == "Windows" and python_version == "3.8" # CUDA wheels https://github.com/jllllll/AutoGPTQ/releases/download/v0.5.1/auto_gptq-0.5.1+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" @@ -67,14 +67,14 @@ https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.2/flash_attn https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.2/flash_attn-2.3.2+cu122torch2.1cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.2/flash_attn-2.3.2+cu122torch2.1cxx11abiFALSE-cp39-cp39-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9" 
https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.2/flash_attn-2.3.2+cu122torch2.1cxx11abiFALSE-cp38-cp38-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.11+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.11+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.11+cu121avx-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.11+cu121avx-cp38-cp38-win_amd64.whl; platform_system == "Windows" and python_version == "3.8" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.11+cu121avx-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.11+cu121avx-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.11+cu121avx-cp39-cp39-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.11+cu121avx-cp38-cp38-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.18+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.18+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.18+cu121avx-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.18+cu121avx-cp38-cp38-win_amd64.whl; platform_system == "Windows" and python_version == "3.8" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.18+cu121avx-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.18+cu121avx-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" 
+https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.18+cu121avx-cp39-cp39-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.18+cu121avx-cp38-cp38-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8" https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9" From d1a58da52f0a49bacba5e0fcbe9b2a374eb40833 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 17 Nov 2023 19:52:30 -0800 Subject: [PATCH 02/18] Update ancient Docker instructions --- README.md | 2 +- docker/.dockerignore | 1 - docker/.env.example | 14 ++------------ docker/Dockerfile | 2 +- docker/docker-compose.yml | 1 - docs/09 - Docker.md | 18 ++++++++++++------ 6 files changed, 16 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index 3726b26c..3ffaaf10 100644 --- a/README.md +++ b/README.md @@ -169,7 +169,7 @@ cp docker/.env.example .env docker compose up --build ``` -* You need to have docker compose v2.17 or higher installed. See [this guide](https://github.com/oobabooga/text-generation-webui/wiki/09-%E2%80%90-Docker) for instructions. +* You need to have Docker Compose v2.17 or higher installed. See [this guide](https://github.com/oobabooga/text-generation-webui/wiki/09-%E2%80%90-Docker) for instructions. * For additional docker files, check out [this repository](https://github.com/Atinoda/text-generation-webui-docker). 
### Updating the requirements diff --git a/docker/.dockerignore b/docker/.dockerignore index 6073533e..99d0adff 100644 --- a/docker/.dockerignore +++ b/docker/.dockerignore @@ -5,5 +5,4 @@ Dockerfile /models /presets /prompts -/softprompts /training diff --git a/docker/.env.example b/docker/.env.example index 3119a9f0..b254f53b 100644 --- a/docker/.env.example +++ b/docker/.env.example @@ -3,13 +3,8 @@ # https://developer.nvidia.com/cuda-gpus you can find the version for your card here TORCH_CUDA_ARCH_LIST=7.5 -# these commands worked for me with roughly 4.5GB of vram -CLI_ARGS=--model llama-7b-4bit --wbits 4 --listen --auto-devices - -# the following examples have been tested with the files linked in docs/README_docker.md: -# example running 13b with 4bit/128 groupsize : CLI_ARGS=--model llama-13b-4bit-128g --wbits 4 --listen --groupsize 128 --pre_layer 25 -# example with loading api extension and public share: CLI_ARGS=--model llama-7b-4bit --wbits 4 --listen --auto-devices --no-stream --extensions api --share -# example running 7b with 8bit groupsize : CLI_ARGS=--model llama-7b --load-in-8bit --listen --auto-devices +# your command-line flags go here: +CLI_ARGS= # the port the webui binds to on the host HOST_PORT=7860 @@ -21,10 +16,5 @@ HOST_API_PORT=5000 # the port the api binds to inside the container CONTAINER_API_PORT=5000 -# the port the api stream endpoint binds to on the host -HOST_API_STREAM_PORT=5005 -# the port the api stream endpoint binds to inside the container -CONTAINER_API_STREAM_PORT=5005 - # the version used to install text-generation-webui from WEBUI_VERSION=HEAD diff --git a/docker/Dockerfile b/docker/Dockerfile index 722bc8fc..5752b2a7 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -73,5 +73,5 @@ RUN --mount=type=cache,target=/root/.cache/pip,rw \ ENV CLI_ARGS="" -EXPOSE ${CONTAINER_PORT:-7860} ${CONTAINER_API_PORT:-5000} ${CONTAINER_API_STREAM_PORT:-5005} +EXPOSE ${CONTAINER_PORT:-7860} ${CONTAINER_API_PORT:-5000} CMD . /app/venv/bin/activate && python3 server.py ${CLI_ARGS} diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index ce29f33b..29767d22 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -11,7 +11,6 @@ services: ports: - "${HOST_PORT:-7860}:${CONTAINER_PORT:-7860}" - "${HOST_API_PORT:-5000}:${CONTAINER_API_PORT:-5000}" - - "${HOST_API_STREAM_PORT:-5005}:${CONTAINER_API_STREAM_PORT:-5005}" stdin_open: true tty: true volumes: diff --git a/docs/09 - Docker.md b/docs/09 - Docker.md index 921864bf..bddc5272 100644 --- a/docs/09 - Docker.md +++ b/docs/09 - Docker.md @@ -1,13 +1,21 @@ Docker Compose is a way of installing and launching the web UI in an isolated Ubuntu image using only a few commands. 
-In order to create the image as described in the main README, you must have docker compose 2.17 or higher: +## Installing Docker Compose + +In order to create the image as described in the main README, you must have Docker Compose installed (2.17 or higher is recommended): ``` ~$ docker compose version -Docker Compose version v2.17.2 +Docker Compose version v2.21.0 ``` -Make sure to also create the necessary symbolic links: +The installation instructions for various Linux distributions can be found here: + +https://docs.docker.com/engine/install/ubuntu/#install-using-the-repository + +## Launching the image + +Use these commands to launch the image: ``` cd text-generation-webui @@ -17,13 +25,11 @@ cp docker/.env.example .env docker compose up --build ``` -## Table of contents +## More detailed installation instructions * [Docker Compose installation instructions](#docker-compose-installation-instructions) * [Repository with additional Docker files](#dedicated-docker-repository) -## Docker Compose installation instructions - By [@loeken](https://github.com/loeken). - [Ubuntu 22.04](#ubuntu-2204) From 83b64e7fc186fbb5649252ea1aba0ae46e642cf3 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 18 Nov 2023 18:31:41 -0300 Subject: [PATCH 03/18] New feature: "random preset" button (#4647) --- docs/03 - Parameters Tab.md | 8 +++++-- modules/presets.py | 43 +++++++++++++++++++++++++++++++++++++ modules/ui_parameters.py | 2 ++ 3 files changed, 51 insertions(+), 2 deletions(-) diff --git a/docs/03 - Parameters Tab.md b/docs/03 - Parameters Tab.md index a66fbbb8..601cca86 100644 --- a/docs/03 - Parameters Tab.md +++ b/docs/03 - Parameters Tab.md @@ -11,9 +11,13 @@ LLMs work by generating one token at a time. Given your prompt, the model calcul ### Preset menu -Can be used to save combinations of parameters for reuse. +Can be used to save and load combinations of parameters for reuse. -The built-in presets were not manually chosen. They were obtained after a blind contest called "Preset Arena" where hundreds of people voted. The full results can be found [here](https://github.com/oobabooga/oobabooga.github.io/blob/main/arena/results.md). +* **🎲 button**: creates a random yet interpretable preset. Only 1 parameter of each category is included for the categories: removing tail tokens, avoiding repetition, and flattening the distribution. That is, top_p and top_k are not mixed, and neither are repetition_penalty and frequency_penalty. You can use this button to break out of a loop of bad generations after multiple "Regenerate" attempts. + +#### Built-in presets + +These were obtained after a blind contest called "Preset Arena" where hundreds of people voted. The full results can be found [here](https://github.com/oobabooga/oobabooga.github.io/blob/main/arena/results.md). 
A key takeaway is that the best presets are: diff --git a/modules/presets.py b/modules/presets.py index 5082678b..842992f9 100644 --- a/modules/presets.py +++ b/modules/presets.py @@ -1,8 +1,12 @@ import functools +import random from pathlib import Path import yaml +from modules import shared +from modules.loaders import loaders_samplers + def default_preset(): return { @@ -63,6 +67,45 @@ def load_preset_for_ui(name, state): return state, *[generate_params[k] for k in presets_params()] +def random_preset(state): + params_and_values = { + 'remove_tail_tokens': { + 'top_p': [0.5, 0.8, 0.9, 0.95, 0.99], + 'min_p': [0.5, 0.2, 0.1, 0.05, 0.01], + 'top_k': [3, 5, 10, 20, 30, 40], + 'typical_p': [0.2, 0.575, 0.95], + 'tfs': [0.5, 0.8, 0.9, 0.95, 0.99], + 'top_a': [0.5, 0.2, 0.1, 0.05, 0.01], + 'epsilon_cutoff': [1, 3, 5, 7, 9], + 'eta_cutoff': [3, 6, 9, 12, 15, 18], + }, + 'flatten_distribution': { + 'temperature': [0.5, 0.7, 0.8, 1, 1.2, 1.5, 2.0], + }, + 'repetition': { + 'repetition_penalty': [1, 1.05, 1.1, 1.15, 1.20, 1.25], + 'presence_penalty': [0, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0, 2.0], + 'frequency_penalty': [0, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0, 2.0], + }, + 'other': { + 'temperature_last': [True, False], + } + } + + generate_params = default_preset() + for cat in params_and_values: + choices = list(params_and_values[cat].keys()) + if shared.args.loader is not None: + choices = [x for x in choices if x in loaders_samplers[shared.args.loader]] + + if len(choices) > 0: + choice = random.choice(choices) + generate_params[choice] = random.choice(params_and_values[cat][choice]) + + state.update(generate_params) + return state, *[generate_params[k] for k in presets_params()] + + def generate_preset_yaml(state): defaults = default_preset() data = {k: state[k] for k in presets_params()} diff --git a/modules/ui_parameters.py b/modules/ui_parameters.py index fa245c4d..0c53963e 100644 --- a/modules/ui_parameters.py +++ b/modules/ui_parameters.py @@ -18,6 +18,7 @@ def create_ui(default_preset): ui.create_refresh_button(shared.gradio['preset_menu'], lambda: None, lambda: {'choices': utils.get_available_presets()}, 'refresh-button', interactive=not mu) shared.gradio['save_preset'] = gr.Button('💾', elem_classes='refresh-button', interactive=not mu) shared.gradio['delete_preset'] = gr.Button('🗑️', elem_classes='refresh-button', interactive=not mu) + shared.gradio['random_preset'] = gr.Button('🎲', elem_classes='refresh-button') with gr.Column(): shared.gradio['filter_by_loader'] = gr.Dropdown(label="Filter by loader", choices=["All"] + list(loaders.loaders_and_params.keys()), value="All", elem_classes='slim-dropdown') @@ -90,6 +91,7 @@ def create_ui(default_preset): def create_event_handlers(): shared.gradio['filter_by_loader'].change(loaders.blacklist_samplers, gradio('filter_by_loader'), gradio(loaders.list_all_samplers()), show_progress=False) shared.gradio['preset_menu'].change(presets.load_preset_for_ui, gradio('preset_menu', 'interface_state'), gradio('interface_state') + gradio(presets.presets_params())) + shared.gradio['random_preset'].click(presets.random_preset, gradio('interface_state'), gradio('interface_state') + gradio(presets.presets_params())) shared.gradio['grammar_file'].change(load_grammar, gradio('grammar_file'), gradio('grammar_string')) From 47d9e2618bd2c4edfbdd13376a6c1b0d5575da4d Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 18 Nov 2023 14:03:42 -0800 Subject: [PATCH 04/18] Refresh the Preset menu after saving a preset --- 
modules/ui_file_saving.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/modules/ui_file_saving.py b/modules/ui_file_saving.py index 1625c830..39ab41d4 100644 --- a/modules/ui_file_saving.py +++ b/modules/ui_file_saving.py @@ -37,6 +37,14 @@ def create_ui(): shared.gradio['delete_character_confirm'] = gr.Button('Delete', elem_classes="small-button", variant='stop', interactive=not mu) shared.gradio['delete_character_cancel'] = gr.Button('Cancel', elem_classes="small-button") + # Preset saver + with gr.Group(visible=False, elem_classes='file-saver') as shared.gradio['preset_saver']: + shared.gradio['save_preset_filename'] = gr.Textbox(lines=1, label='File name', info='The preset will be saved to your presets/ folder with this base filename.') + shared.gradio['save_preset_contents'] = gr.Textbox(lines=10, label='File contents') + with gr.Row(): + shared.gradio['save_preset_confirm'] = gr.Button('Save', elem_classes="small-button", variant='primary', interactive=not mu) + shared.gradio['save_preset_cancel'] = gr.Button('Cancel', elem_classes="small-button") + def create_event_handlers(): shared.gradio['save_confirm'].click( @@ -65,10 +73,16 @@ def create_event_handlers(): shared.gradio['save_preset'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - presets.generate_preset_yaml, gradio('interface_state'), gradio('save_contents')).then( - lambda: 'presets/', None, gradio('save_root')).then( - lambda: 'My Preset.yaml', None, gradio('save_filename')).then( - lambda: gr.update(visible=True), None, gradio('file_saver')) + presets.generate_preset_yaml, gradio('interface_state'), gradio('save_preset_contents')).then( + lambda: 'My Preset', None, gradio('save_preset_filename')).then( + lambda: gr.update(visible=True), None, gradio('preset_saver')) + + shared.gradio['save_preset_confirm'].click( + lambda x, y: utils.save_file(f'presets/{x}.yaml', y), gradio('save_preset_filename', 'save_preset_contents'), None).then( + lambda: gr.update(visible=False), None, gradio('preset_saver')).then( + lambda x: gr.update(choices=utils.get_available_presets(), value=x), gradio('save_preset_filename'), gradio('preset_menu')) + + shared.gradio['save_preset_cancel'].click(lambda: gr.update(visible=False), None, gradio('preset_saver')) shared.gradio['delete_preset'].click( lambda x: f'{x}.yaml', gradio('preset_menu'), gradio('delete_filename')).then( From baab89475989a4f7166b233339e90cb0b9a3cfee Mon Sep 17 00:00:00 2001 From: Jordan Tucker Date: Sat, 18 Nov 2023 17:20:13 -0600 Subject: [PATCH 05/18] fix: use system message in chat-instruct mode (#4648) --- modules/chat.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/modules/chat.py b/modules/chat.py index 4c518d33..dda16749 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -91,7 +91,12 @@ def generate_chat_prompt(user_input, state, **kwargs): if state['mode'] == 'chat-instruct': wrapper = '' command = state['chat-instruct_command'].replace('<|character|>', state['name2'] if not impersonate else state['name1']) - wrapper += state['context_instruct'] + context_instruct = state['context_instruct'] + if state['custom_system_message'].strip() != '': + context_instruct = context_instruct.replace('<|system-message|>', state['custom_system_message']) + else: + context_instruct = context_instruct.replace('<|system-message|>', state['system_message']) + wrapper += context_instruct wrapper += 
all_substrings['instruct']['user_turn'].replace('<|user-message|>', command) wrapper += all_substrings['instruct']['bot_turn_stripped'] if impersonate: From af76fbedb880f380953cf8ebd95d4a6b11eed1ef Mon Sep 17 00:00:00 2001 From: wizd Date: Sun, 19 Nov 2023 07:24:29 +0800 Subject: [PATCH 06/18] Openai embedding fix to support jina-embeddings-v2 (#4642) --- extensions/openai/embeddings.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/extensions/openai/embeddings.py b/extensions/openai/embeddings.py index fcdaab63..1420879c 100644 --- a/extensions/openai/embeddings.py +++ b/extensions/openai/embeddings.py @@ -1,6 +1,7 @@ import os import numpy as np +from transformers import AutoModel from extensions.openai.errors import ServiceUnavailableError from extensions.openai.utils import debug_msg, float_list_to_base64 @@ -41,7 +42,12 @@ def load_embedding_model(model: str): global embeddings_device, embeddings_model try: print(f"Try embedding model: {model} on {embeddings_device}") - embeddings_model = SentenceTransformer(model, device=embeddings_device) + if 'jina-embeddings' in model: + embeddings_model = AutoModel.from_pretrained(model, trust_remote_code=True) # trust_remote_code is needed to use the encode method + embeddings_model = embeddings_model.to(embeddings_device) + else: + embeddings_model = SentenceTransformer(model, device=embeddings_device) + print(f"Loaded embedding model: {model}") except Exception as e: embeddings_model = None From 8f4f4daf8bb7f17bff8e2813053f1aca45e85d8a Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 18 Nov 2023 22:33:27 -0300 Subject: [PATCH 07/18] Add --admin-key flag for API (#4649) --- README.md | 1 + extensions/openai/script.py | 50 ++++++++++++++++++++++++------------- modules/shared.py | 1 + 3 files changed, 34 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 3ffaaf10..8c2679cf 100644 --- a/README.md +++ b/README.md @@ -413,6 +413,7 @@ Optionally, you can use the following command-line flags: | `--public-api-id PUBLIC_API_ID` | Tunnel ID for named Cloudflare Tunnel. Use together with public-api option. | | `--api-port API_PORT` | The listening port for the API. | | `--api-key API_KEY` | API authentication key. | +| `--admin-key ADMIN_KEY` | API authentication key for admin tasks like loading and unloading models. If not set, will be the same as --api-key. 
| #### Multimodal diff --git a/extensions/openai/script.py b/extensions/openai/script.py index 2128444e..43d4b261 100644 --- a/extensions/openai/script.py +++ b/extensions/openai/script.py @@ -60,7 +60,15 @@ def verify_api_key(authorization: str = Header(None)) -> None: raise HTTPException(status_code=401, detail="Unauthorized") -app = FastAPI(dependencies=[Depends(verify_api_key)]) +def verify_admin_key(authorization: str = Header(None)) -> None: + expected_api_key = shared.args.admin_key + if expected_api_key and (authorization is None or authorization != f"Bearer {expected_api_key}"): + raise HTTPException(status_code=401, detail="Unauthorized") + + +app = FastAPI() +check_key = [Depends(verify_api_key)] +check_admin_key = [Depends(verify_admin_key)] # Configure CORS settings to allow all origins, methods, and headers app.add_middleware( @@ -72,12 +80,12 @@ app.add_middleware( ) -@app.options("/") +@app.options("/", dependencies=check_key) async def options_route(): return JSONResponse(content="OK") -@app.post('/v1/completions', response_model=CompletionResponse) +@app.post('/v1/completions', response_model=CompletionResponse, dependencies=check_key) async def openai_completions(request: Request, request_data: CompletionRequest): path = request.url.path is_legacy = "/generate" in path @@ -100,7 +108,7 @@ async def openai_completions(request: Request, request_data: CompletionRequest): return JSONResponse(response) -@app.post('/v1/chat/completions', response_model=ChatCompletionResponse) +@app.post('/v1/chat/completions', response_model=ChatCompletionResponse, dependencies=check_key) async def openai_chat_completions(request: Request, request_data: ChatCompletionRequest): path = request.url.path is_legacy = "/generate" in path @@ -123,8 +131,8 @@ async def openai_chat_completions(request: Request, request_data: ChatCompletion return JSONResponse(response) -@app.get("/v1/models") -@app.get("/v1/models/{model}") +@app.get("/v1/models", dependencies=check_key) +@app.get("/v1/models/{model}", dependencies=check_key) async def handle_models(request: Request): path = request.url.path is_list = request.url.path.split('?')[0].split('#')[0] == '/v1/models' @@ -138,7 +146,7 @@ async def handle_models(request: Request): return JSONResponse(response) -@app.get('/v1/billing/usage') +@app.get('/v1/billing/usage', dependencies=check_key) def handle_billing_usage(): ''' Ex. 
/v1/dashboard/billing/usage?start_date=2023-05-01&end_date=2023-05-31 @@ -146,7 +154,7 @@ def handle_billing_usage(): return JSONResponse(content={"total_usage": 0}) -@app.post('/v1/audio/transcriptions') +@app.post('/v1/audio/transcriptions', dependencies=check_key) async def handle_audio_transcription(request: Request): r = sr.Recognizer() @@ -176,7 +184,7 @@ async def handle_audio_transcription(request: Request): return JSONResponse(content=transcription) -@app.post('/v1/images/generations') +@app.post('/v1/images/generations', dependencies=check_key) async def handle_image_generation(request: Request): if not os.environ.get('SD_WEBUI_URL', params.get('sd_webui_url', '')): @@ -192,7 +200,7 @@ async def handle_image_generation(request: Request): return JSONResponse(response) -@app.post("/v1/embeddings", response_model=EmbeddingsResponse) +@app.post("/v1/embeddings", response_model=EmbeddingsResponse, dependencies=check_key) async def handle_embeddings(request: Request, request_data: EmbeddingsRequest): input = request_data.input if not input: @@ -205,7 +213,7 @@ async def handle_embeddings(request: Request, request_data: EmbeddingsRequest): return JSONResponse(response) -@app.post("/v1/moderations") +@app.post("/v1/moderations", dependencies=check_key) async def handle_moderations(request: Request): body = await request.json() input = body["input"] @@ -216,37 +224,37 @@ async def handle_moderations(request: Request): return JSONResponse(response) -@app.post("/v1/internal/encode", response_model=EncodeResponse) +@app.post("/v1/internal/encode", response_model=EncodeResponse, dependencies=check_key) async def handle_token_encode(request_data: EncodeRequest): response = token_encode(request_data.text) return JSONResponse(response) -@app.post("/v1/internal/decode", response_model=DecodeResponse) +@app.post("/v1/internal/decode", response_model=DecodeResponse, dependencies=check_key) async def handle_token_decode(request_data: DecodeRequest): response = token_decode(request_data.tokens) return JSONResponse(response) -@app.post("/v1/internal/token-count", response_model=TokenCountResponse) +@app.post("/v1/internal/token-count", response_model=TokenCountResponse, dependencies=check_key) async def handle_token_count(request_data: EncodeRequest): response = token_count(request_data.text) return JSONResponse(response) -@app.post("/v1/internal/stop-generation") +@app.post("/v1/internal/stop-generation", dependencies=check_key) async def handle_stop_generation(request: Request): stop_everything_event() return JSONResponse(content="OK") -@app.get("/v1/internal/model/info", response_model=ModelInfoResponse) +@app.get("/v1/internal/model/info", response_model=ModelInfoResponse, dependencies=check_key) async def handle_model_info(): payload = OAImodels.get_current_model_info() return JSONResponse(content=payload) -@app.post("/v1/internal/model/load") +@app.post("/v1/internal/model/load", dependencies=check_admin_key) async def handle_load_model(request_data: LoadModelRequest): ''' This endpoint is experimental and may change in the future. 
@@ -283,7 +291,7 @@ async def handle_load_model(request_data: LoadModelRequest): return HTTPException(status_code=400, detail="Failed to load the model.") -@app.post("/v1/internal/model/unload") +@app.post("/v1/internal/model/unload", dependencies=check_admin_key) async def handle_unload_model(): unload_model() return JSONResponse(content="OK") @@ -308,8 +316,14 @@ def run_server(): logger.info(f'OpenAI-compatible API URL:\n\nhttp://{server_addr}:{port}\n') if shared.args.api_key: + if not shared.args.admin_key: + shared.args.admin_key = shared.args.api_key + logger.info(f'OpenAI API key:\n\n{shared.args.api_key}\n') + if shared.args.admin_key: + logger.info(f'OpenAI API admin key (for loading/unloading models):\n\n{shared.args.admin_key}\n') + uvicorn.run(app, host=server_addr, port=port, ssl_certfile=ssl_certfile, ssl_keyfile=ssl_keyfile) diff --git a/modules/shared.py b/modules/shared.py index 54e72a6c..b139a2cf 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -170,6 +170,7 @@ parser.add_argument('--public-api', action='store_true', help='Create a public U parser.add_argument('--public-api-id', type=str, help='Tunnel ID for named Cloudflare Tunnel. Use together with public-api option.', default=None) parser.add_argument('--api-port', type=int, default=5000, help='The listening port for the API.') parser.add_argument('--api-key', type=str, default='', help='API authentication key.') +parser.add_argument('--admin-key', type=str, default='', help='API authentication key for admin tasks like loading and unloading models. If not set, will be the same as --api-key.') # Multimodal parser.add_argument('--multimodal-pipeline', type=str, default=None, help='The multimodal pipeline to use. Examples: llava-7b, llava-13b.') From 0fa1af296c18854722c348ae068e52551f6efe49 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 18 Nov 2023 23:19:31 -0300 Subject: [PATCH 08/18] Add /v1/internal/logits endpoint (#4650) --- docs/12 - OpenAI API.md | 23 +++++++++++++++++++++++ extensions/openai/script.py | 13 +++++++++++++ extensions/openai/typing.py | 26 ++++++++++++++++++++++---- modules/logits.py | 18 +++++++++++++----- 4 files changed, 71 insertions(+), 9 deletions(-) diff --git a/docs/12 - OpenAI API.md b/docs/12 - OpenAI API.md index 05b4db02..abbd432d 100644 --- a/docs/12 - OpenAI API.md +++ b/docs/12 - OpenAI API.md @@ -97,6 +97,29 @@ curl http://127.0.0.1:5000/v1/chat/completions \ }' ``` +#### Logits + +``` +curl -k http://127.0.0.1:5000/v1/internal/logits \ + -H "Content-Type: application/json" \ + -d '{ + "prompt": "Who is best, Asuka or Rei? Answer:", + "use_samplers": false + }' +``` + +#### Logits after sampling parameters + +``` +curl -k http://127.0.0.1:5000/v1/internal/logits \ + -H "Content-Type: application/json" \ + -d '{ + "prompt": "Who is best, Asuka or Rei? 
Answer:", + "use_samplers": true, + "top_k": 3 + }' +``` + #### Python chat example ```python diff --git a/extensions/openai/script.py b/extensions/openai/script.py index 43d4b261..da56287c 100644 --- a/extensions/openai/script.py +++ b/extensions/openai/script.py @@ -16,6 +16,7 @@ from sse_starlette import EventSourceResponse import extensions.openai.completions as OAIcompletions import extensions.openai.embeddings as OAIembeddings import extensions.openai.images as OAIimages +import extensions.openai.logits as OAIlogits import extensions.openai.models as OAImodels import extensions.openai.moderations as OAImoderations from extensions.openai.errors import ServiceUnavailableError @@ -38,6 +39,8 @@ from .typing import ( EncodeRequest, EncodeResponse, LoadModelRequest, + LogitsRequest, + LogitsResponse, ModelInfoResponse, TokenCountResponse, to_dict @@ -242,6 +245,16 @@ async def handle_token_count(request_data: EncodeRequest): return JSONResponse(response) +@app.post("/v1/internal/logits", response_model=LogitsResponse, dependencies=check_key) +async def handle_logits(request_data: LogitsRequest): + ''' + Given a prompt, returns the top 50 most likely logits as a dict. + The keys are the tokens, and the values are the probabilities. + ''' + response = OAIlogits._get_next_logits(to_dict(request_data)) + return JSONResponse(response) + + @app.post("/v1/internal/stop-generation", dependencies=check_key) async def handle_stop_generation(request: Request): stop_everything_event() diff --git a/extensions/openai/typing.py b/extensions/openai/typing.py index ee8f2ac6..05d3f753 100644 --- a/extensions/openai/typing.py +++ b/extensions/openai/typing.py @@ -126,15 +126,15 @@ class EncodeRequest(BaseModel): text: str -class DecodeRequest(BaseModel): - tokens: List[int] - - class EncodeResponse(BaseModel): tokens: List[int] length: int +class DecodeRequest(BaseModel): + tokens: List[int] + + class DecodeResponse(BaseModel): text: str @@ -143,6 +143,24 @@ class TokenCountResponse(BaseModel): length: int +class LogitsRequestParams(BaseModel): + prompt: str + use_samplers: bool = False + frequency_penalty: float | None = 0 + max_tokens: int | None = 16 + presence_penalty: float | None = 0 + temperature: float | None = 1 + top_p: float | None = 1 + + +class LogitsRequest(GenerationOptions, LogitsRequestParams): + pass + + +class LogitsResponse(BaseModel): + logits: dict + + class ModelInfoResponse(BaseModel): model_name: str lora_names: List[str] diff --git a/modules/logits.py b/modules/logits.py index e356a986..383659e0 100644 --- a/modules/logits.py +++ b/modules/logits.py @@ -8,7 +8,7 @@ from modules.text_generation import generate_reply global_scores = None -def get_next_logits(prompt, state, use_samplers, previous): +def get_next_logits(prompt, state, use_samplers, previous, return_dict=False): if shared.model is None: logger.error("No model is loaded! 
Select one in the Model tab.") return 'Error: No model is loaded1 Select one in the Model tab.', previous @@ -56,8 +56,16 @@ def get_next_logits(prompt, state, use_samplers, previous): topk_indices = [i.expand((1, 1)) for i in topk_indices] tokens = [shared.tokenizer.decode(i) for i in topk_indices] - output = '' - for row in list(zip(topk_values, tokens)): - output += f"{row[0]} - {repr(row[1])}\n" - return output, previous + if return_dict: + output = {} + for row in list(zip(topk_values, tokens)): + output[row[1]] = row[0] + + return output + else: + output = '' + for row in list(zip(topk_values, tokens)): + output += f"{row[0]} - {repr(row[1])}\n" + + return output, previous From ef6feedeb22c6c3d045f491200bd7237735b9f78 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 18 Nov 2023 23:38:39 -0300 Subject: [PATCH 09/18] Add --nowebui flag for pure API mode (#4651) --- README.md | 1 + extensions/openai/script.py | 5 ++++- modules/shared.py | 3 ++- server.py | 24 +++++++++++++++--------- 4 files changed, 22 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 8c2679cf..56e810b2 100644 --- a/README.md +++ b/README.md @@ -414,6 +414,7 @@ Optionally, you can use the following command-line flags: | `--api-port API_PORT` | The listening port for the API. | | `--api-key API_KEY` | API authentication key. | | `--admin-key ADMIN_KEY` | API authentication key for admin tasks like loading and unloading models. If not set, will be the same as --api-key. | +| `--nowebui` | Do not launch the Gradio UI. Useful for launching the API in standalone mode. | #### Multimodal diff --git a/extensions/openai/script.py b/extensions/openai/script.py index da56287c..a516b0f7 100644 --- a/extensions/openai/script.py +++ b/extensions/openai/script.py @@ -341,4 +341,7 @@ def run_server(): def setup(): - Thread(target=run_server, daemon=True).start() + if shared.args.nowebui: + run_server() + else: + Thread(target=run_server, daemon=True).start() diff --git a/modules/shared.py b/modules/shared.py index b139a2cf..344daf1d 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -171,6 +171,7 @@ parser.add_argument('--public-api-id', type=str, help='Tunnel ID for named Cloud parser.add_argument('--api-port', type=int, default=5000, help='The listening port for the API.') parser.add_argument('--api-key', type=str, default='', help='API authentication key.') parser.add_argument('--admin-key', type=str, default='', help='API authentication key for admin tasks like loading and unloading models. If not set, will be the same as --api-key.') +parser.add_argument('--nowebui', action='store_true', help='Do not launch the Gradio UI. Useful for launching the API in standalone mode.') # Multimodal parser.add_argument('--multimodal-pipeline', type=str, default=None, help='The multimodal pipeline to use. Examples: llava-7b, llava-13b.') @@ -201,7 +202,7 @@ for k in ['notebook', 'chat', 'no_stream', 'mul_mat_q', 'use_fast']: # Security warnings if args.trust_remote_code: logger.warning('trust_remote_code is enabled. This is dangerous.') -if 'COLAB_GPU' not in os.environ: +if 'COLAB_GPU' not in os.environ and not args.nowebui: if args.share: logger.warning("The gradio \"share link\" feature uses a proprietary executable to create a reverse tunnel. 
Use it with care.") if any((args.listen, args.share)) and not any((args.gradio_auth, args.gradio_auth_path)): diff --git a/server.py b/server.py index e9605e3b..cdd82e1d 100644 --- a/server.py +++ b/server.py @@ -226,13 +226,19 @@ if __name__ == "__main__": shared.generation_lock = Lock() - # Launch the web UI - create_interface() - while True: - time.sleep(0.5) - if shared.need_restart: - shared.need_restart = False + if shared.args.nowebui: + # Start the API in standalone mode + shared.args.extensions = [x for x in shared.args.extensions if x != 'gallery'] + if shared.args.extensions is not None and len(shared.args.extensions) > 0: + extensions_module.load_extensions() + else: + # Launch the web UI + create_interface() + while True: time.sleep(0.5) - shared.gradio['interface'].close() - time.sleep(0.5) - create_interface() + if shared.need_restart: + shared.need_restart = False + time.sleep(0.5) + shared.gradio['interface'].close() + time.sleep(0.5) + create_interface() From 771e62e4764260dd526a6e3386048a208bf12f87 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 19 Nov 2023 00:35:22 -0300 Subject: [PATCH 10/18] Add /v1/internal/lora endpoints (#4652) --- extensions/openai/models.py | 29 ++++++++++++++++++++--------- extensions/openai/script.py | 32 +++++++++++++++++++++++++++++++- extensions/openai/typing.py | 30 +++++++++++++++++++++--------- 3 files changed, 72 insertions(+), 19 deletions(-) diff --git a/extensions/openai/models.py b/extensions/openai/models.py index 1ff950a2..8a093ebe 100644 --- a/extensions/openai/models.py +++ b/extensions/openai/models.py @@ -1,8 +1,9 @@ from modules import shared from modules.logging_colors import logger +from modules.LoRA import add_lora_to_model from modules.models import load_model, unload_model from modules.models_settings import get_model_metadata, update_model_parameters -from modules.utils import get_available_models +from modules.utils import get_available_loras, get_available_models def get_current_model_info(): @@ -13,12 +14,17 @@ def get_current_model_info(): def list_models(): + return {'model_names': get_available_models()[1:]} + + +def list_dummy_models(): result = { "object": "list", "data": [] } - for model in get_dummy_models() + get_available_models()[1:]: + # these are expected by so much, so include some here as a dummy + for model in ['gpt-3.5-turbo', 'text-embedding-ada-002']: result["data"].append(model_info_dict(model)) return result @@ -33,13 +39,6 @@ def model_info_dict(model_name: str) -> dict: } -def get_dummy_models() -> list: - return [ # these are expected by so much, so include some here as a dummy - 'gpt-3.5-turbo', - 'text-embedding-ada-002', - ] - - def _load_model(data): model_name = data["model_name"] args = data["args"] @@ -67,3 +66,15 @@ def _load_model(data): logger.info(f"TRUNCATION LENGTH (UPDATED): {shared.settings['truncation_length']}") elif k == 'instruction_template': logger.info(f"INSTRUCTION TEMPLATE (UPDATED): {shared.settings['instruction_template']}") + + +def list_loras(): + return {'lora_names': get_available_loras()[1:]} + + +def load_loras(lora_names): + add_lora_to_model(lora_names) + + +def unload_all_loras(): + add_lora_to_model([]) diff --git a/extensions/openai/script.py b/extensions/openai/script.py index a516b0f7..047c339a 100644 --- a/extensions/openai/script.py +++ b/extensions/openai/script.py @@ -38,10 +38,13 @@ from .typing import ( EmbeddingsResponse, EncodeRequest, EncodeResponse, + LoadLorasRequest, LoadModelRequest, 
LogitsRequest, LogitsResponse, + LoraListResponse, ModelInfoResponse, + ModelListResponse, TokenCountResponse, to_dict ) @@ -141,7 +144,7 @@ async def handle_models(request: Request): is_list = request.url.path.split('?')[0].split('#')[0] == '/v1/models' if is_list: - response = OAImodels.list_models() + response = OAImodels.list_dummy_models() else: model_name = path[len('/v1/models/'):] response = OAImodels.model_info_dict(model_name) @@ -267,6 +270,12 @@ async def handle_model_info(): return JSONResponse(content=payload) +@app.get("/v1/internal/model/list", response_model=ModelListResponse, dependencies=check_admin_key) +async def handle_list_models(): + payload = OAImodels.list_models() + return JSONResponse(content=payload) + + @app.post("/v1/internal/model/load", dependencies=check_admin_key) async def handle_load_model(request_data: LoadModelRequest): ''' @@ -307,6 +316,27 @@ async def handle_load_model(request_data: LoadModelRequest): @app.post("/v1/internal/model/unload", dependencies=check_admin_key) async def handle_unload_model(): unload_model() + + +@app.get("/v1/internal/lora/list", response_model=LoraListResponse, dependencies=check_admin_key) +async def handle_list_loras(): + response = OAImodels.list_loras() + return JSONResponse(content=response) + + +@app.post("/v1/internal/lora/load", dependencies=check_admin_key) +async def handle_load_loras(request_data: LoadLorasRequest): + try: + OAImodels.load_loras(request_data.lora_names) + return JSONResponse(content="OK") + except: + traceback.print_exc() + return HTTPException(status_code=400, detail="Failed to apply the LoRA(s).") + + +@app.post("/v1/internal/lora/unload", dependencies=check_admin_key) +async def handle_unload_loras(): + OAImodels.unload_all_loras() return JSONResponse(content="OK") diff --git a/extensions/openai/typing.py b/extensions/openai/typing.py index 05d3f753..5a2d40d5 100644 --- a/extensions/openai/typing.py +++ b/extensions/openai/typing.py @@ -122,6 +122,19 @@ class ChatCompletionResponse(BaseModel): usage: dict +class EmbeddingsRequest(BaseModel): + input: str | List[str] + model: str | None = Field(default=None, description="Unused parameter. To change the model, set the OPENEDAI_EMBEDDING_MODEL and OPENEDAI_EMBEDDING_DEVICE environment variables before starting the server.") + encoding_format: str = Field(default="float", description="Can be float or base64.") + user: str | None = Field(default=None, description="Unused parameter.") + + +class EmbeddingsResponse(BaseModel): + index: int + embedding: List[float] + object: str = "embedding" + + class EncodeRequest(BaseModel): text: str @@ -166,23 +179,22 @@ class ModelInfoResponse(BaseModel): lora_names: List[str] +class ModelListResponse(BaseModel): + model_names: List[str] + + class LoadModelRequest(BaseModel): model_name: str args: dict | None = None settings: dict | None = None -class EmbeddingsRequest(BaseModel): - input: str | List[str] - model: str | None = Field(default=None, description="Unused parameter. 
To change the model, set the OPENEDAI_EMBEDDING_MODEL and OPENEDAI_EMBEDDING_DEVICE environment variables before starting the server.") - encoding_format: str = Field(default="float", description="Can be float or base64.") - user: str | None = Field(default=None, description="Unused parameter.") +class LoraListResponse(BaseModel): + lora_names: List[str] -class EmbeddingsResponse(BaseModel): - index: int - embedding: List[float] - object: str = "embedding" +class LoadLorasRequest(BaseModel): + lora_names: List[str] def to_json(obj): From cb836dd49c0e24b304455a496b63994329781ef8 Mon Sep 17 00:00:00 2001 From: Jordan Tucker Date: Sat, 18 Nov 2023 22:19:10 -0600 Subject: [PATCH 11/18] fix: use shared chat-instruct_command with api (#4653) --- extensions/openai/completions.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py index 99525b66..389466ff 100644 --- a/extensions/openai/completions.py +++ b/extensions/openai/completions.py @@ -203,6 +203,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False) - turn_template = body['turn_template'] or turn_template context_instruct = body['context_instruct'] or context_instruct system_message = body['system_message'] or system_message + chat_instruct_command = body['chat_instruct_command'] or shared.settings['chat-instruct_command'] # Chat character character = body['character'] or shared.settings['character'] @@ -228,7 +229,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False) - 'system_message': system_message, 'custom_system_message': custom_system_message, 'turn_template': turn_template, - 'chat-instruct_command': body['chat_instruct_command'], + 'chat-instruct_command': chat_instruct_command, 'history': history, 'stream': stream }) From 5fcee696ea3b0cc553a39213cdb5fbc2da0314c2 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 19 Nov 2023 02:05:17 -0300 Subject: [PATCH 12/18] New feature: enlarge character pictures on click (#4654) --- css/main.css | 16 ++++++++++++++++ js/main.js | 39 +++++++++++++++++++++++++++++++++++++++ js/update_big_picture.js | 7 +++++++ modules/chat.py | 15 ++++++++++----- modules/html_generator.py | 2 +- modules/ui.py | 2 ++ modules/ui_chat.py | 3 ++- 7 files changed, 77 insertions(+), 7 deletions(-) create mode 100644 js/update_big_picture.js diff --git a/css/main.css b/css/main.css index 47506c5e..b06d809d 100644 --- a/css/main.css +++ b/css/main.css @@ -648,3 +648,19 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { .options { z-index: 100 !important; } + +/* ---------------------------------------------- + Big profile picture for characters +---------------------------------------------- */ +.bigProfilePicture { + position: fixed; + bottom: 0; + left: 0; + width: calc((100vw - 880px - 120px) /2); +} + +@media screen and (width <= 1300px) { + .bigProfilePicture { + display: none; + } +} diff --git a/js/main.js b/js/main.js index 17d1d354..2e0c765b 100644 --- a/js/main.js +++ b/js/main.js @@ -312,6 +312,10 @@ document.addEventListener("click", function (event) { if (!isMouseOverButtonOrMenu() && menu.style.display === "flex") { hideMenu(); } + + if (event.target.classList.contains("pfp_character")) { + toggleBigPicture(); + } }); //------------------------------------------------ @@ -335,3 +339,38 @@ document.getElementById("show-controls").parentNode.style.bottom = "0px"; // Focus on the chat input 
//------------------------------------------------ document.querySelector("#chat-input textarea").focus(); + +//------------------------------------------------ +// Show enlarged character picture when the profile +// picture is clicked on +//------------------------------------------------ +let bigPictureVisible = false; + +function addBigPicture() { + var imgElement = document.createElement("img"); + var timestamp = new Date().getTime(); + imgElement.src = "/file/cache/pfp_character.png?time=" + timestamp; + imgElement.classList.add("bigProfilePicture"); + + var imgElementParent = document.getElementById("chat").parentNode.parentNode.parentNode.parentNode.parentNode.parentNode.parentNode; + imgElementParent.appendChild(imgElement); +} + +function deleteBigPicture() { + var bigProfilePictures = document.querySelectorAll('.bigProfilePicture'); + bigProfilePictures.forEach(function (element) { + element.parentNode.removeChild(element); + }); +} + +function toggleBigPicture() { + if(bigPictureVisible) { + deleteBigPicture(); + bigPictureVisible = false; + } else { + addBigPicture(); + bigPictureVisible = true; + } +} + +showBigPicture(); diff --git a/js/update_big_picture.js b/js/update_big_picture.js new file mode 100644 index 00000000..1984215a --- /dev/null +++ b/js/update_big_picture.js @@ -0,0 +1,7 @@ +function updateBigPicture() { + var existingElement = document.querySelector('.bigProfilePicture'); + if (existingElement) { + var timestamp = new Date().getTime(); + existingElement.src = "/file/cache/pfp_character.png?time=" + timestamp; + } +} diff --git a/modules/chat.py b/modules/chat.py index dda16749..436e3e43 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -544,9 +544,13 @@ def generate_pfp_cache(character): for path in [Path(f"characters/{character}.{extension}") for extension in ['png', 'jpg', 'jpeg']]: if path.exists(): - img = make_thumbnail(Image.open(path)) - img.save(Path('cache/pfp_character.png'), format='PNG') - return img + original_img = Image.open(path) + original_img.save(Path('cache/pfp_character.png'), format='PNG') + + thumb = make_thumbnail(original_img) + thumb.save(Path('cache/pfp_character_thumb.png'), format='PNG') + + return thumb return None @@ -575,8 +579,9 @@ def load_character(character, name1, name2, instruct=False): file_contents = open(filepath, 'r', encoding='utf-8').read() data = json.loads(file_contents) if extension == "json" else yaml.safe_load(file_contents) - if Path("cache/pfp_character.png").exists() and not instruct: - Path("cache/pfp_character.png").unlink() + for path in [Path("cache/pfp_character.png"), Path("cache/pfp_character_thumb.png")]: + if path.exists() and not instruct: + path.unlink() picture = generate_pfp_cache(character) diff --git a/modules/html_generator.py b/modules/html_generator.py index 26e47848..2a6509b3 100644 --- a/modules/html_generator.py +++ b/modules/html_generator.py @@ -225,7 +225,7 @@ def generate_cai_chat_html(history, name1, name2, style, reset_cache=False): output = f'
' # We use ?name2 and ?time.time() to force the browser to reset caches - img_bot = f'' if Path("cache/pfp_character.png").exists() else '' + img_bot = f'' if Path("cache/pfp_character_thumb.png").exists() else '' img_me = f'' if Path("cache/pfp_me.png").exists() else '' for i, _row in enumerate(history): diff --git a/modules/ui.py b/modules/ui.py index 383bc66f..9e2d6b6a 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -20,6 +20,8 @@ with open(Path(__file__).resolve().parent / '../js/switch_tabs.js', 'r') as f: switch_tabs_js = f.read() with open(Path(__file__).resolve().parent / '../js/show_controls.js', 'r') as f: show_controls_js = f.read() +with open(Path(__file__).resolve().parent / '../js/update_big_picture.js', 'r') as f: + update_big_picture_js = f.read() refresh_symbol = '🔄' delete_symbol = '🗑️' diff --git a/modules/ui_chat.py b/modules/ui_chat.py index b3cff3d6..40c8d71c 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -275,7 +275,8 @@ def create_event_handlers(): ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( chat.load_latest_history, gradio('interface_state'), gradio('history')).then( chat.redraw_html, gradio(reload_arr), gradio('display')).then( - lambda x: gr.update(choices=(histories := chat.find_all_histories(x)), value=histories[0]), gradio('interface_state'), gradio('unique_id')) + lambda x: gr.update(choices=(histories := chat.find_all_histories(x)), value=histories[0]), gradio('interface_state'), gradio('unique_id')).then( + lambda: None, None, None, _js=f'() => {{{ui.update_big_picture_js}; updateBigPicture()}}') shared.gradio['mode'].change( lambda x: gr.update(visible=x != 'instruct'), gradio('mode'), gradio('chat_style'), show_progress=False).then( From ab94f0d9bf0dddb8a7465bd40637f1e17e5dd832 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 18 Nov 2023 21:11:04 -0800 Subject: [PATCH 13/18] Minor style change --- modules/chat.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/chat.py b/modules/chat.py index 436e3e43..22b5bf9a 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -96,6 +96,7 @@ def generate_chat_prompt(user_input, state, **kwargs): context_instruct = context_instruct.replace('<|system-message|>', state['custom_system_message']) else: context_instruct = context_instruct.replace('<|system-message|>', state['system_message']) + wrapper += context_instruct wrapper += all_substrings['instruct']['user_turn'].replace('<|user-message|>', command) wrapper += all_substrings['instruct']['bot_turn_stripped'] From a290d1738601cd21ad8ade5b770f220efdc15dac Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 19 Nov 2023 06:53:41 -0800 Subject: [PATCH 14/18] Add hover cursor to bot pfp --- css/main.css | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/css/main.css b/css/main.css index b06d809d..a3480fe0 100644 --- a/css/main.css +++ b/css/main.css @@ -659,6 +659,10 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { width: calc((100vw - 880px - 120px) /2); } +.pfp_character:hover { + cursor: pointer; +} + @media screen and (width <= 1300px) { .bigProfilePicture { display: none; From a6f1e1bcc51ce9b0db62f095f23d329166a6ce9a Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 19 Nov 2023 07:55:25 -0800 Subject: [PATCH 15/18] Fix PEFT LoRA unloading --- modules/LoRA.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/modules/LoRA.py 
b/modules/LoRA.py index 4b119994..9c6edbf3 100644 --- a/modules/LoRA.py +++ b/modules/LoRA.py @@ -149,10 +149,7 @@ def add_lora_transformers(lora_names): # If any LoRA needs to be removed, start over if len(removed_set) > 0: - # shared.model may no longer be PeftModel - if hasattr(shared.model, 'disable_adapter'): - shared.model.disable_adapter() - shared.model = shared.model.base_model.model + shared.model = shared.model.unload() if len(lora_names) > 0: params = {} @@ -172,8 +169,6 @@ def add_lora_transformers(lora_names): if len(lora_names) > 1: merge_loras() - shared.lora_names = lora_names - if not shared.args.load_in_8bit and not shared.args.cpu: shared.model.half() if not hasattr(shared.model, "hf_device_map"): @@ -186,6 +181,8 @@ def add_lora_transformers(lora_names): else: shared.model = shared.model.cuda() + shared.lora_names = lora_names + def merge_loras(): if len(list({shared.model.peft_config[adapter].r for adapter in shared.model.peft_config.keys()})) > 1: From 78af3b0a008d47d549729d0ceb07223b14d002f4 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 19 Nov 2023 07:57:16 -0800 Subject: [PATCH 16/18] Update docs/What Works.md --- docs/What Works.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/What Works.md b/docs/What Works.md index 86936039..dba34a80 100644 --- a/docs/What Works.md +++ b/docs/What Works.md @@ -2,13 +2,13 @@ | Loader | Loading 1 LoRA | Loading 2 or more LoRAs | Training LoRAs | Multimodal extension | Perplexity evaluation | |----------------|----------------|-------------------------|----------------|----------------------|-----------------------| -| Transformers | ✅ | ✅ | ✅* | ✅ | ✅ | +| Transformers | ✅ | ✅*** | ✅* | ✅ | ✅ | | ExLlama_HF | ✅ | ❌ | ❌ | ❌ | ✅ | | ExLlamav2_HF | ✅ | ✅ | ❌ | ❌ | ✅ | | ExLlama | ✅ | ❌ | ❌ | ❌ | use ExLlama_HF | | ExLlamav2 | ✅ | ✅ | ❌ | ❌ | use ExLlamav2_HF | | AutoGPTQ | ✅ | ❌ | ❌ | ✅ | ✅ | -| GPTQ-for-LLaMa | ✅** | ✅ | ✅ | ✅ | ✅ | +| GPTQ-for-LLaMa | ✅** | ✅*** | ✅ | ✅ | ✅ | | llama.cpp | ❌ | ❌ | ❌ | ❌ | use llamacpp_HF | | llamacpp_HF | ❌ | ❌ | ❌ | ❌ | ✅ | | ctransformers | ❌ | ❌ | ❌ | ❌ | ❌ | @@ -21,3 +21,5 @@ \* Training LoRAs with GPTQ models also works with the Transformers loader. Make sure to check "auto-devices" and "disable_exllama" before loading the model. \*\* Requires the monkey-patch. The instructions can be found [here](https://github.com/oobabooga/text-generation-webui/wiki/08-%E2%80%90-Additional-Tips#using-loras-with-gptq-for-llama). + +\*\*\* Multi-LoRA in PEFT is tricky and the current implementation does not work reliably in all cases. 
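The PEFT change in PATCH 15 above removes a LoRA by calling `unload()` on the wrapped model instead of reaching into `base_model.model` directly. As a rough, minimal sketch of that attach/detach pattern outside the web UI (the model and adapter paths below are placeholders, not files from this repository), the flow looks like this:

```python
# Minimal sketch of the PEFT LoRA attach/detach pattern that the fix relies on.
# "models/my-base-model" and "loras/my-lora" are placeholder paths.
from peft import PeftModel
from transformers import AutoModelForCausalLM

# Load the plain transformers model first.
base_model = AutoModelForCausalLM.from_pretrained("models/my-base-model")

# Wrapping it with a LoRA adapter returns a PeftModel.
lora_model = PeftModel.from_pretrained(base_model, "loras/my-lora", adapter_name="my-lora")

# ... run generation with the adapter applied ...

# unload() strips the LoRA layers and hands back the underlying base model,
# which is the object the patch assigns back to shared.model.
base_model = lora_model.unload()
```

Under this reading, reassigning the result of `unload()` leaves the variable pointing at a plain transformers model again, so a later LoRA load starts from a clean state rather than stacking on top of a stale PEFT wrapper.
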
From 9da7bb203d4011cd743f79d144c9c298d4d1b089 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 19 Nov 2023 07:59:29 -0800 Subject: [PATCH 17/18] Minor LoRA bug fix --- modules/LoRA.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/LoRA.py b/modules/LoRA.py index 9c6edbf3..dea476ad 100644 --- a/modules/LoRA.py +++ b/modules/LoRA.py @@ -145,6 +145,7 @@ def add_lora_transformers(lora_names): if len(lora_names) > 1: merge_loras() + shared.lora_names = lora_names return # If any LoRA needs to be removed, start over From 8cf05c1b31cca57653549c2cf67e52ca713c2d62 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 19 Nov 2023 08:31:01 -0800 Subject: [PATCH 18/18] Fix disappearing character gallery --- extensions/gallery/script.py | 14 ++++++++------ js/main.js | 1 - js/show_controls.js | 6 ++++++ 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/extensions/gallery/script.py b/extensions/gallery/script.py index 611a11f4..efe96ba9 100644 --- a/extensions/gallery/script.py +++ b/extensions/gallery/script.py @@ -91,11 +91,13 @@ def ui(): with gr.Accordion("Character gallery", open=False, elem_id='gallery-extension'): update = gr.Button("Refresh") gr.HTML(value="") - gallery = gr.Dataset(components=[gr.HTML(visible=False)], - label="", - samples=generate_html(), - elem_classes=["character-gallery"], - samples_per_page=50 - ) + gallery = gr.Dataset( + components=[gr.HTML(visible=False)], + label="", + samples=generate_html(), + elem_classes=["character-gallery"], + samples_per_page=50 + ) + update.click(generate_html, [], gallery) gallery.select(select_character, None, gradio['character_menu']) diff --git a/js/main.js b/js/main.js index 2e0c765b..1e50e147 100644 --- a/js/main.js +++ b/js/main.js @@ -373,4 +373,3 @@ function toggleBigPicture() { } } -showBigPicture(); diff --git a/js/show_controls.js b/js/show_controls.js index 0173963b..0ba1ecde 100644 --- a/js/show_controls.js +++ b/js/show_controls.js @@ -10,6 +10,12 @@ function toggle_controls(value) { chatParent.classList.remove("bigchat"); document.getElementById("chat-input-row").classList.remove("bigchat"); document.getElementById("chat-col").classList.remove("bigchat"); + + let gallery_element = document.getElementById('gallery-extension'); + if (gallery_element) { + gallery_element.style.display = 'block'; + } + } else { belowChatInput.forEach(element => { element.style.display = "none";