From e0ca49ed9cd231da6ed2de52da435428622f7904 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 18 Nov 2023 00:31:27 -0300
Subject: [PATCH 01/18] Bump llama-cpp-python to 0.2.18 (2nd attempt) (#4637)

* Update requirements*.txt

* Add back seed
---
 README.md                        |  1 -
 modules/llamacpp_hf.py           | 11 +++++------
 modules/llamacpp_model.py        | 10 +++++-----
 modules/loaders.py               |  2 +-
 modules/shared.py                |  2 +-
 modules/ui.py                    |  1 -
 modules/ui_model_menu.py         |  1 -
 requirements.txt                 | 32 ++++++++++++++++----------------
 requirements_amd.txt             | 24 ++++++++++++------------
 requirements_amd_noavx2.txt      | 16 ++++++++--------
 requirements_apple_intel.txt     | 28 ++++++++++++----------------
 requirements_apple_silicon.txt   | 32 ++++++++++++++++----------------
 requirements_cpu_only.txt        | 16 ++++++++--------
 requirements_cpu_only_noavx2.txt | 16 ++++++++--------
 requirements_noavx2.txt          | 32 ++++++++++++++++----------------
 15 files changed, 108 insertions(+), 116 deletions(-)

diff --git a/README.md b/README.md
index 00f5ce81..3726b26c 100644
--- a/README.md
+++ b/README.md
@@ -325,7 +325,6 @@ Optionally, you can use the following command-line flags:
 | `--mlock` | Force the system to keep the model in RAM. |
 | `--n-gpu-layers N_GPU_LAYERS` | Number of layers to offload to the GPU. |
 | `--tensor_split TENSOR_SPLIT` | Split the model across multiple GPUs. Comma-separated list of proportions. Example: 18,17. |
-| `--llama_cpp_seed SEED` | Seed for llama-cpp models. Default is 0 (random). |
 | `--numa` | Activate NUMA task allocation for llama.cpp. |
 | `--logits_all`| Needs to be set for perplexity evaluation to work. Otherwise, ignore it, as it makes prompt processing slower. |
 | `--cache-capacity CACHE_CAPACITY` | Maximum cache capacity (llama-cpp-python). Examples: 2000MiB, 2GiB. When provided without units, bytes will be assumed. |
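For context on the table above: the llama.cpp loader flags that remain map closely onto llama-cpp-python's `Llama` constructor. A minimal sketch of that mapping follows; the model path and values are placeholders for illustration, not anything taken from the web UI's own loading code.

```python
from llama_cpp import Llama

# Rough mapping from the loader flags in the README table to llama-cpp-python
# constructor arguments. The path and values below are placeholders.
model = Llama(
    model_path="models/example-model.Q4_K_M.gguf",  # hypothetical GGUF file
    n_ctx=4096,             # --n_ctx: prompt context length
    n_gpu_layers=35,        # --n-gpu-layers: layers offloaded to the GPU
    tensor_split=[18, 17],  # --tensor_split: per-GPU split proportions
    use_mlock=True,         # --mlock: keep the model in RAM
    logits_all=False,       # --logits_all: only needed for perplexity evaluation
)
```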
diff --git a/modules/llamacpp_hf.py b/modules/llamacpp_hf.py
index 5d42e94c..06a66302 100644
--- a/modules/llamacpp_hf.py
+++ b/modules/llamacpp_hf.py
@@ -39,7 +39,7 @@ class LlamacppHF(PreTrainedModel):
             'n_tokens': self.model.n_tokens,
             'input_ids': self.model.input_ids,
             'scores': self.model.scores,
-            'ctx': self.model.ctx
+            'ctx': self.model._ctx
         }

         if shared.args.cfg_cache:
@@ -65,7 +65,7 @@ class LlamacppHF(PreTrainedModel):
             'n_tokens': self.model.n_tokens,
             'input_ids': self.model.input_ids,
             'scores': self.model.scores,
-            'ctx': self.model.ctx
+            'ctx': self.model._ctx
         })

     def save_negative_cache(self):
@@ -73,20 +73,20 @@ class LlamacppHF(PreTrainedModel):
             'n_tokens': self.model.n_tokens,
             'input_ids': self.model.input_ids,
             'scores': self.model.scores,
-            'ctx': self.model.ctx
+            'ctx': self.model._ctx
         })

     def load_cache(self):
         self.model.n_tokens = self.llamacpp_cache['n_tokens']
         self.model.input_ids = self.llamacpp_cache['input_ids']
         self.model.scores = self.llamacpp_cache['scores']
-        self.model.ctx = self.llamacpp_cache['ctx']
+        self.model._ctx = self.llamacpp_cache['ctx']

     def load_negative_cache(self):
         self.model.n_tokens = self.llamacpp_cache_negative['n_tokens']
         self.model.input_ids = self.llamacpp_cache_negative['input_ids']
         self.model.scores = self.llamacpp_cache_negative['scores']
-        self.model.ctx = self.llamacpp_cache_negative['ctx']
+        self.model._ctx = self.llamacpp_cache_negative['ctx']

     @property
     def device(self) -> torch.device:
@@ -192,7 +192,6 @@ class LlamacppHF(PreTrainedModel):
         params = {
             'model_path': str(model_file),
             'n_ctx': shared.args.n_ctx,
-            'seed': int(shared.args.llama_cpp_seed),
             'n_threads': shared.args.threads or None,
             'n_threads_batch': shared.args.threads_batch or None,
             'n_batch': shared.args.n_batch,
diff --git a/modules/llamacpp_model.py b/modules/llamacpp_model.py
index 93f22e95..4f72c4eb 100644
--- a/modules/llamacpp_model.py
+++ b/modules/llamacpp_model.py
@@ -74,7 +74,6 @@ class LlamaCppModel:
         params = {
             'model_path': str(path),
             'n_ctx': shared.args.n_ctx,
-            'seed': int(shared.args.llama_cpp_seed),
             'n_threads': shared.args.threads or None,
             'n_threads_batch': shared.args.threads_batch or None,
             'n_batch': shared.args.n_batch,
@@ -144,15 +143,16 @@ class LlamaCppModel:
             max_tokens=state['max_new_tokens'],
             temperature=state['temperature'],
             top_p=state['top_p'],
-            top_k=state['top_k'],
-            repeat_penalty=state['repetition_penalty'],
-            presence_penalty=state['presence_penalty'],
             frequency_penalty=state['frequency_penalty'],
+            presence_penalty=state['presence_penalty'],
+            repeat_penalty=state['repetition_penalty'],
+            top_k=state['top_k'],
+            stream=True,
+            seed=int(state['seed']) if state['seed'] != -1 else None,
             tfs_z=state['tfs'],
             mirostat_mode=int(state['mirostat_mode']),
             mirostat_tau=state['mirostat_tau'],
             mirostat_eta=state['mirostat_eta'],
-            stream=True,
             logits_processor=logit_processors,
             grammar=self.grammar
         )
diff --git a/modules/loaders.py b/modules/loaders.py
index b3763f06..bf95a6f2 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -99,7 +99,6 @@ loaders_and_params = OrderedDict({
         'no_mmap',
         'mlock',
         'no_mul_mat_q',
-        'llama_cpp_seed',
         'alpha_value',
         'rope_freq_base',
         'compress_pos_emb',
@@ -366,6 +365,7 @@ loaders_samplers = {
        'repetition_penalty',
        'presence_penalty',
        'frequency_penalty',
+       'seed',
        'mirostat_mode',
        'mirostat_tau',
        'mirostat_eta',
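The practical effect of the llamacpp_model.py change above is that the seed is no longer fixed at load time: the pinned llama-cpp-python 0.2.18 accepts a `seed` argument on the generation call itself, and `None` (used when the UI seed is -1) means a fresh random seed per call. A standalone sketch of the same idea, assuming a local GGUF file and the `create_completion` API, could look like this:

```python
from llama_cpp import Llama

llm = Llama(model_path="models/example-model.gguf")  # placeholder path

def generate(prompt: str, seed: int = -1) -> str:
    # Same convention as the web UI: -1 means "pick a random seed for this call".
    result = llm.create_completion(
        prompt,
        max_tokens=64,
        temperature=0.7,
        seed=int(seed) if seed != -1 else None,
    )
    return result["choices"][0]["text"]

print(generate("Once upon a time", seed=42))  # same seed, repeatable output
print(generate("Once upon a time"))           # different random seed each run
```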
diff --git a/modules/shared.py b/modules/shared.py
index d40a1e77..54e72a6c 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -112,7 +112,6 @@ parser.add_argument('--no-mmap', action='store_true', help='Prevent mmap from be
 parser.add_argument('--mlock', action='store_true', help='Force the system to keep the model in RAM.')
 parser.add_argument('--n-gpu-layers', type=int, default=0, help='Number of layers to offload to the GPU.')
 parser.add_argument('--tensor_split', type=str, default=None, help='Split the model across multiple GPUs. Comma-separated list of proportions. Example: 18,17.')
-parser.add_argument('--llama_cpp_seed', type=int, default=0, help='Seed for llama-cpp models. Default is 0 (random).')
 parser.add_argument('--numa', action='store_true', help='Activate NUMA task allocation for llama.cpp.')
 parser.add_argument('--logits_all', action='store_true', help='Needs to be set for perplexity evaluation to work. Otherwise, ignore it, as it makes prompt processing slower.')
 parser.add_argument('--cache-capacity', type=str, help='Maximum cache capacity (llama-cpp-python). Examples: 2000MiB, 2GiB. When provided without units, bytes will be assumed.')
@@ -182,6 +181,7 @@ parser.add_argument('--no-stream', action='store_true', help='DEPRECATED')
 parser.add_argument('--mul_mat_q', action='store_true', help='DEPRECATED')
 parser.add_argument('--api-blocking-port', type=int, default=5000, help='DEPRECATED')
 parser.add_argument('--api-streaming-port', type=int, default=5005, help='DEPRECATED')
+parser.add_argument('--llama_cpp_seed', type=int, default=0, help='DEPRECATED')
 parser.add_argument('--use_fast', action='store_true', help='DEPRECATED')

 args = parser.parse_args()
diff --git a/modules/ui.py b/modules/ui.py
index de649668..383bc66f 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -80,7 +80,6 @@ def list_model_elements():
         'n_gpu_layers',
         'tensor_split',
         'n_ctx',
-        'llama_cpp_seed',
         'gpu_split',
         'max_seq_len',
         'compress_pos_emb',
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index 67396b78..12edeed9 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -120,7 +120,6 @@ def create_ui():
             shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit)
             shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant)
             shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='Split the model across multiple GPUs, comma-separated list of proportions, e.g. 18,17')
-            shared.gradio['llama_cpp_seed'] = gr.Number(label='Seed (0 for random)', value=shared.args.llama_cpp_seed)
             shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='To enable this option, start the web UI with the --trust-remote-code flag. It is necessary for some models.', interactive=shared.args.trust_remote_code)
             shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Create an additional cache for CFG negative prompts.')
             shared.gradio['logits_all'] = gr.Checkbox(label="logits_all", value=shared.args.logits_all, info='Needs to be set for perplexity evaluation to work. Otherwise, ignore it, as it makes prompt processing slower.')
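The remainder of the patch is mechanical: every requirements variant below swaps the pinned llama-cpp-python 0.2.11 wheels for 0.2.18 builds, with the environment markers after each URL selecting the right wheel for the OS, CPU architecture, and Python version. After reinstalling the requirements, a quick sanity check along these lines can confirm that the bump took effect; it assumes the package exposes its version string as `llama_cpp.__version__`.

```python
# Quick post-install check that the pinned wheel is the one actually in use.
import llama_cpp

print(llama_cpp.__version__)  # expected to print 0.2.18 if the new pin applied
```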
diff --git a/requirements.txt b/requirements.txt
index d8e4d9f2..6377723e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -27,14 +27,14 @@ bitsandbytes==0.41.1; platform_system != "Windows"
 https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.1-py3-none-win_amd64.whl; platform_system == "Windows"

 # llama-cpp-python (CPU only, AVX2)
-https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9"
-https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8"
-https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9"
-https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp38-cp38-win_amd64.whl; platform_system == "Windows" and python_version == "3.8"
+https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp311-cp311-manylinux_2_17_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp310-cp310-manylinux_2_17_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp39-cp39-manylinux_2_17_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9"
+https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp38-cp38-manylinux_2_17_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8"
+https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9"
+https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp38-cp38-win_amd64.whl; platform_system == "Windows" and python_version == "3.8" # CUDA wheels https://github.com/jllllll/AutoGPTQ/releases/download/v0.5.1/auto_gptq-0.5.1+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" @@ -67,14 +67,14 @@ https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.2/flash_attn https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.2/flash_attn-2.3.2+cu122torch2.1cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.2/flash_attn-2.3.2+cu122torch2.1cxx11abiFALSE-cp39-cp39-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9" https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.2/flash_attn-2.3.2+cu122torch2.1cxx11abiFALSE-cp38-cp38-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.11+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.11+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.11+cu121-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.11+cu121-cp38-cp38-win_amd64.whl; platform_system == "Windows" and python_version == "3.8" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.11+cu121-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.11+cu121-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.11+cu121-cp39-cp39-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.11+cu121-cp38-cp38-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.18+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.18+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" 
+https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.18+cu121-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.18+cu121-cp38-cp38-win_amd64.whl; platform_system == "Windows" and python_version == "3.8" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.18+cu121-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.18+cu121-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.18+cu121-cp39-cp39-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.18+cu121-cp38-cp38-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8" https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9" diff --git a/requirements_amd.txt b/requirements_amd.txt index 913758bf..8f8f44ff 100644 --- a/requirements_amd.txt +++ b/requirements_amd.txt @@ -27,14 +27,14 @@ bitsandbytes==0.38.1; platform_system != "Windows" https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.38.1-py3-none-win_amd64.whl; platform_system == "Windows" # llama-cpp-python (CPU only, AVX2) -https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9" -https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8" -https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" 
-https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9" -https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp38-cp38-win_amd64.whl; platform_system == "Windows" and python_version == "3.8" +https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp311-cp311-manylinux_2_17_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp310-cp310-manylinux_2_17_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp39-cp39-manylinux_2_17_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9" +https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp38-cp38-manylinux_2_17_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8" +https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9" +https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp38-cp38-win_amd64.whl; platform_system == "Windows" and python_version == "3.8" # AMD wheels https://github.com/jllllll/AutoGPTQ/releases/download/v0.5.1/auto_gptq-0.5.1+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" @@ -45,10 +45,10 @@ https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+rocm5 https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+rocm5.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+rocm5.6-cp39-cp39-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9" https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+rocm5.6-cp38-cp38-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.11+rocm5.6.1-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.11+rocm5.6.1-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" 
-https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.11+rocm5.6.1-cp39-cp39-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.11+rocm5.6.1-cp38-cp38-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.18+rocm5.6.1-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.18+rocm5.6.1-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.18+rocm5.6.1-cp39-cp39-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.18+rocm5.6.1-cp38-cp38-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8" https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+rocm5.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+rocm5.6-cp39-cp39-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9" diff --git a/requirements_amd_noavx2.txt b/requirements_amd_noavx2.txt index 0c18296c..455eba44 100644 --- a/requirements_amd_noavx2.txt +++ b/requirements_amd_noavx2.txt @@ -27,14 +27,14 @@ bitsandbytes==0.38.1; platform_system != "Windows" https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.38.1-py3-none-win_amd64.whl; platform_system == "Windows" # llama-cpp-python (CPU only, no AVX2) -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.11+cpuavx-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.11+cpuavx-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.11+cpuavx-cp39-cp39-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.11+cpuavx-cp38-cp38-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8" 
-https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.11+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.11+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.11+cpuavx-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.11+cpuavx-cp38-cp38-win_amd64.whl; platform_system == "Windows" and python_version == "3.8" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.18+cpuavx-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.18+cpuavx-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.18+cpuavx-cp39-cp39-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.18+cpuavx-cp38-cp38-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.18+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.18+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.18+cpuavx-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.18+cpuavx-cp38-cp38-win_amd64.whl; platform_system == "Windows" and python_version == "3.8" # AMD wheels https://github.com/jllllll/AutoGPTQ/releases/download/v0.5.1/auto_gptq-0.5.1+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" diff --git a/requirements_apple_intel.txt b/requirements_apple_intel.txt index 83e1db0e..d5dfb525 100644 --- a/requirements_apple_intel.txt +++ b/requirements_apple_intel.txt @@ -27,19 +27,15 @@ bitsandbytes==0.41.1; platform_system != "Windows" https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.1-py3-none-win_amd64.whl; platform_system == "Windows" # Mac wheels -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp311-cp311-macosx_11_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.11" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp310-cp310-macosx_11_0_x86_64.whl; 
platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.10" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp39-cp39-macosx_11_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.9" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp38-cp38-macosx_11_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.8" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp311-cp311-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.11" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp310-cp310-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp39-cp39-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.9" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp38-cp38-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.8" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp311-cp311-macosx_13_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp310-cp310-macosx_13_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.10" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp39-cp39-macosx_13_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.9" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp38-cp38-macosx_13_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.8" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp310-cp310-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp39-cp39-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.9" 
-https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp38-cp38-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.8" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp311-cp311-macosx_11_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.11" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp310-cp310-macosx_11_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.10" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp39-cp39-macosx_11_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.9" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp38-cp38-macosx_11_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.8" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp311-cp311-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.11" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp310-cp310-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp39-cp39-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.9" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp38-cp38-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.8" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp310-cp310-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp39-cp39-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.9" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp38-cp38-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.8" diff --git a/requirements_apple_silicon.txt b/requirements_apple_silicon.txt index 35760f63..6fe6a76a 100644 --- 
a/requirements_apple_silicon.txt +++ b/requirements_apple_silicon.txt @@ -27,19 +27,19 @@ bitsandbytes==0.41.1; platform_system != "Windows" https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.1-py3-none-win_amd64.whl; platform_system == "Windows" # Mac wheels -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp311-cp311-macosx_11_0_arm64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.11" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp310-cp310-macosx_11_0_arm64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.10" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp39-cp39-macosx_11_0_arm64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.9" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp38-cp38-macosx_11_0_arm64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.8" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp311-cp311-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.11" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp310-cp310-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp39-cp39-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.9" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp38-cp38-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.8" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp310-cp310-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.10" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp39-cp39-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.9" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp38-cp38-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.8" 
-https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp310-cp310-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp39-cp39-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.9" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.11-cp38-cp38-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.8" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp311-cp311-macosx_11_0_arm64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.11" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp310-cp310-macosx_11_0_arm64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.10" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp39-cp39-macosx_11_0_arm64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.9" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp38-cp38-macosx_11_0_arm64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.8" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp311-cp311-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.11" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp310-cp310-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp39-cp39-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.9" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp38-cp38-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.8" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" 
+https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp310-cp310-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.10" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp39-cp39-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.9" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp38-cp38-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.8" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp310-cp310-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp39-cp39-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.9" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.18-cp38-cp38-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.8" diff --git a/requirements_cpu_only.txt b/requirements_cpu_only.txt index a3722d3a..56747a70 100644 --- a/requirements_cpu_only.txt +++ b/requirements_cpu_only.txt @@ -27,11 +27,11 @@ bitsandbytes==0.41.1; platform_system != "Windows" https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.1-py3-none-win_amd64.whl; platform_system == "Windows" # llama-cpp-python (CPU only, AVX2) -https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9" -https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8" -https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp310-cp310-win_amd64.whl; platform_system == 
"Windows" and python_version == "3.10" -https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9" -https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp38-cp38-win_amd64.whl; platform_system == "Windows" and python_version == "3.8" +https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp311-cp311-manylinux_2_17_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp310-cp310-manylinux_2_17_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp39-cp39-manylinux_2_17_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9" +https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp38-cp38-manylinux_2_17_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8" +https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9" +https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp38-cp38-win_amd64.whl; platform_system == "Windows" and python_version == "3.8" diff --git a/requirements_cpu_only_noavx2.txt b/requirements_cpu_only_noavx2.txt index 401ecd6a..412f974f 100644 --- a/requirements_cpu_only_noavx2.txt +++ b/requirements_cpu_only_noavx2.txt @@ -27,11 +27,11 @@ bitsandbytes==0.41.1; platform_system != "Windows" https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.1-py3-none-win_amd64.whl; platform_system == "Windows" # llama-cpp-python (CPU only, no AVX2) -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.11+cpuavx-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.11+cpuavx-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.11+cpuavx-cp39-cp39-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.11+cpuavx-cp38-cp38-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.11+cpuavx-cp311-cp311-win_amd64.whl; platform_system 
== "Windows" and python_version == "3.11" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.11+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.11+cpuavx-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.11+cpuavx-cp38-cp38-win_amd64.whl; platform_system == "Windows" and python_version == "3.8" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.18+cpuavx-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.18+cpuavx-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.18+cpuavx-cp39-cp39-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.18+cpuavx-cp38-cp38-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.18+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.18+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.18+cpuavx-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.18+cpuavx-cp38-cp38-win_amd64.whl; platform_system == "Windows" and python_version == "3.8" diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt index 11f52509..32abd7d9 100644 --- a/requirements_noavx2.txt +++ b/requirements_noavx2.txt @@ -27,14 +27,14 @@ bitsandbytes==0.41.1; platform_system != "Windows" https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.1-py3-none-win_amd64.whl; platform_system == "Windows" # llama-cpp-python (CPU only, no AVX2) -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.11+cpuavx-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.11+cpuavx-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.11+cpuavx-cp39-cp39-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9" 
-https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.11+cpuavx-cp38-cp38-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.11+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.11+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.11+cpuavx-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.11+cpuavx-cp38-cp38-win_amd64.whl; platform_system == "Windows" and python_version == "3.8" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.18+cpuavx-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.18+cpuavx-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.18+cpuavx-cp39-cp39-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.18+cpuavx-cp38-cp38-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.18+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.18+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.18+cpuavx-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.18+cpuavx-cp38-cp38-win_amd64.whl; platform_system == "Windows" and python_version == "3.8" # CUDA wheels https://github.com/jllllll/AutoGPTQ/releases/download/v0.5.1/auto_gptq-0.5.1+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" @@ -67,14 +67,14 @@ https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.2/flash_attn https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.2/flash_attn-2.3.2+cu122torch2.1cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.2/flash_attn-2.3.2+cu122torch2.1cxx11abiFALSE-cp39-cp39-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9" 
https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.2/flash_attn-2.3.2+cu122torch2.1cxx11abiFALSE-cp38-cp38-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.11+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.11+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.11+cu121avx-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.11+cu121avx-cp38-cp38-win_amd64.whl; platform_system == "Windows" and python_version == "3.8" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.11+cu121avx-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.11+cu121avx-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.11+cu121avx-cp39-cp39-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9" -https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.11+cu121avx-cp38-cp38-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.18+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.18+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.18+cu121avx-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.18+cu121avx-cp38-cp38-win_amd64.whl; platform_system == "Windows" and python_version == "3.8" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.18+cu121avx-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.18+cu121avx-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" 
+https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.18+cu121avx-cp39-cp39-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9" +https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.18+cu121avx-cp38-cp38-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8" https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9" From d1a58da52f0a49bacba5e0fcbe9b2a374eb40833 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 17 Nov 2023 19:52:30 -0800 Subject: [PATCH 02/18] Update ancient Docker instructions --- README.md | 2 +- docker/.dockerignore | 1 - docker/.env.example | 14 ++------------ docker/Dockerfile | 2 +- docker/docker-compose.yml | 1 - docs/09 - Docker.md | 18 ++++++++++++------ 6 files changed, 16 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index 3726b26c..3ffaaf10 100644 --- a/README.md +++ b/README.md @@ -169,7 +169,7 @@ cp docker/.env.example .env docker compose up --build ``` -* You need to have docker compose v2.17 or higher installed. See [this guide](https://github.com/oobabooga/text-generation-webui/wiki/09-%E2%80%90-Docker) for instructions. +* You need to have Docker Compose v2.17 or higher installed. See [this guide](https://github.com/oobabooga/text-generation-webui/wiki/09-%E2%80%90-Docker) for instructions. * For additional docker files, check out [this repository](https://github.com/Atinoda/text-generation-webui-docker). 
### Updating the requirements diff --git a/docker/.dockerignore b/docker/.dockerignore index 6073533e..99d0adff 100644 --- a/docker/.dockerignore +++ b/docker/.dockerignore @@ -5,5 +5,4 @@ Dockerfile /models /presets /prompts -/softprompts /training diff --git a/docker/.env.example b/docker/.env.example index 3119a9f0..b254f53b 100644 --- a/docker/.env.example +++ b/docker/.env.example @@ -3,13 +3,8 @@ # https://developer.nvidia.com/cuda-gpus you can find the version for your card here TORCH_CUDA_ARCH_LIST=7.5 -# these commands worked for me with roughly 4.5GB of vram -CLI_ARGS=--model llama-7b-4bit --wbits 4 --listen --auto-devices - -# the following examples have been tested with the files linked in docs/README_docker.md: -# example running 13b with 4bit/128 groupsize : CLI_ARGS=--model llama-13b-4bit-128g --wbits 4 --listen --groupsize 128 --pre_layer 25 -# example with loading api extension and public share: CLI_ARGS=--model llama-7b-4bit --wbits 4 --listen --auto-devices --no-stream --extensions api --share -# example running 7b with 8bit groupsize : CLI_ARGS=--model llama-7b --load-in-8bit --listen --auto-devices +# your command-line flags go here: +CLI_ARGS= # the port the webui binds to on the host HOST_PORT=7860 @@ -21,10 +16,5 @@ HOST_API_PORT=5000 # the port the api binds to inside the container CONTAINER_API_PORT=5000 -# the port the api stream endpoint binds to on the host -HOST_API_STREAM_PORT=5005 -# the port the api stream endpoint binds to inside the container -CONTAINER_API_STREAM_PORT=5005 - # the version used to install text-generation-webui from WEBUI_VERSION=HEAD diff --git a/docker/Dockerfile b/docker/Dockerfile index 722bc8fc..5752b2a7 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -73,5 +73,5 @@ RUN --mount=type=cache,target=/root/.cache/pip,rw \ ENV CLI_ARGS="" -EXPOSE ${CONTAINER_PORT:-7860} ${CONTAINER_API_PORT:-5000} ${CONTAINER_API_STREAM_PORT:-5005} +EXPOSE ${CONTAINER_PORT:-7860} ${CONTAINER_API_PORT:-5000} CMD . /app/venv/bin/activate && python3 server.py ${CLI_ARGS} diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index ce29f33b..29767d22 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -11,7 +11,6 @@ services: ports: - "${HOST_PORT:-7860}:${CONTAINER_PORT:-7860}" - "${HOST_API_PORT:-5000}:${CONTAINER_API_PORT:-5000}" - - "${HOST_API_STREAM_PORT:-5005}:${CONTAINER_API_STREAM_PORT:-5005}" stdin_open: true tty: true volumes: diff --git a/docs/09 - Docker.md b/docs/09 - Docker.md index 921864bf..bddc5272 100644 --- a/docs/09 - Docker.md +++ b/docs/09 - Docker.md @@ -1,13 +1,21 @@ Docker Compose is a way of installing and launching the web UI in an isolated Ubuntu image using only a few commands. 
-In order to create the image as described in the main README, you must have docker compose 2.17 or higher: +## Installing Docker Compose + +In order to create the image as described in the main README, you must have Docker Compose installed (2.17 or higher is recommended): ``` ~$ docker compose version -Docker Compose version v2.17.2 +Docker Compose version v2.21.0 ``` -Make sure to also create the necessary symbolic links: +The installation instructions for various Linux distributions can be found here: + +https://docs.docker.com/engine/install/ubuntu/#install-using-the-repository + +## Launching the image + +Use these commands to launch the image: ``` cd text-generation-webui @@ -17,13 +25,11 @@ cp docker/.env.example .env docker compose up --build ``` -## Table of contents +## More detailed installation instructions * [Docker Compose installation instructions](#docker-compose-installation-instructions) * [Repository with additional Docker files](#dedicated-docker-repository) -## Docker Compose installation instructions - By [@loeken](https://github.com/loeken). - [Ubuntu 22.04](#ubuntu-2204) From 83b64e7fc186fbb5649252ea1aba0ae46e642cf3 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 18 Nov 2023 18:31:41 -0300 Subject: [PATCH 03/18] New feature: "random preset" button (#4647) --- docs/03 - Parameters Tab.md | 8 +++++-- modules/presets.py | 43 +++++++++++++++++++++++++++++++++++++ modules/ui_parameters.py | 2 ++ 3 files changed, 51 insertions(+), 2 deletions(-) diff --git a/docs/03 - Parameters Tab.md b/docs/03 - Parameters Tab.md index a66fbbb8..601cca86 100644 --- a/docs/03 - Parameters Tab.md +++ b/docs/03 - Parameters Tab.md @@ -11,9 +11,13 @@ LLMs work by generating one token at a time. Given your prompt, the model calcul ### Preset menu -Can be used to save combinations of parameters for reuse. +Can be used to save and load combinations of parameters for reuse. -The built-in presets were not manually chosen. They were obtained after a blind contest called "Preset Arena" where hundreds of people voted. The full results can be found [here](https://github.com/oobabooga/oobabooga.github.io/blob/main/arena/results.md). +* **🎲 button**: creates a random yet interpretable preset. Only 1 parameter of each category is included for the categories: removing tail tokens, avoiding repetition, and flattening the distribution. That is, top_p and top_k are not mixed, and neither are repetition_penalty and frequency_penalty. You can use this button to break out of a loop of bad generations after multiple "Regenerate" attempts. + +#### Built-in presets + +These were obtained after a blind contest called "Preset Arena" where hundreds of people voted. The full results can be found [here](https://github.com/oobabooga/oobabooga.github.io/blob/main/arena/results.md). 
A key takeaway is that the best presets are: diff --git a/modules/presets.py b/modules/presets.py index 5082678b..842992f9 100644 --- a/modules/presets.py +++ b/modules/presets.py @@ -1,8 +1,12 @@ import functools +import random from pathlib import Path import yaml +from modules import shared +from modules.loaders import loaders_samplers + def default_preset(): return { @@ -63,6 +67,45 @@ def load_preset_for_ui(name, state): return state, *[generate_params[k] for k in presets_params()] +def random_preset(state): + params_and_values = { + 'remove_tail_tokens': { + 'top_p': [0.5, 0.8, 0.9, 0.95, 0.99], + 'min_p': [0.5, 0.2, 0.1, 0.05, 0.01], + 'top_k': [3, 5, 10, 20, 30, 40], + 'typical_p': [0.2, 0.575, 0.95], + 'tfs': [0.5, 0.8, 0.9, 0.95, 0.99], + 'top_a': [0.5, 0.2, 0.1, 0.05, 0.01], + 'epsilon_cutoff': [1, 3, 5, 7, 9], + 'eta_cutoff': [3, 6, 9, 12, 15, 18], + }, + 'flatten_distribution': { + 'temperature': [0.5, 0.7, 0.8, 1, 1.2, 1.5, 2.0], + }, + 'repetition': { + 'repetition_penalty': [1, 1.05, 1.1, 1.15, 1.20, 1.25], + 'presence_penalty': [0, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0, 2.0], + 'frequency_penalty': [0, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0, 2.0], + }, + 'other': { + 'temperature_last': [True, False], + } + } + + generate_params = default_preset() + for cat in params_and_values: + choices = list(params_and_values[cat].keys()) + if shared.args.loader is not None: + choices = [x for x in choices if x in loaders_samplers[shared.args.loader]] + + if len(choices) > 0: + choice = random.choice(choices) + generate_params[choice] = random.choice(params_and_values[cat][choice]) + + state.update(generate_params) + return state, *[generate_params[k] for k in presets_params()] + + def generate_preset_yaml(state): defaults = default_preset() data = {k: state[k] for k in presets_params()} diff --git a/modules/ui_parameters.py b/modules/ui_parameters.py index fa245c4d..0c53963e 100644 --- a/modules/ui_parameters.py +++ b/modules/ui_parameters.py @@ -18,6 +18,7 @@ def create_ui(default_preset): ui.create_refresh_button(shared.gradio['preset_menu'], lambda: None, lambda: {'choices': utils.get_available_presets()}, 'refresh-button', interactive=not mu) shared.gradio['save_preset'] = gr.Button('💾', elem_classes='refresh-button', interactive=not mu) shared.gradio['delete_preset'] = gr.Button('🗑️', elem_classes='refresh-button', interactive=not mu) + shared.gradio['random_preset'] = gr.Button('🎲', elem_classes='refresh-button') with gr.Column(): shared.gradio['filter_by_loader'] = gr.Dropdown(label="Filter by loader", choices=["All"] + list(loaders.loaders_and_params.keys()), value="All", elem_classes='slim-dropdown') @@ -90,6 +91,7 @@ def create_ui(default_preset): def create_event_handlers(): shared.gradio['filter_by_loader'].change(loaders.blacklist_samplers, gradio('filter_by_loader'), gradio(loaders.list_all_samplers()), show_progress=False) shared.gradio['preset_menu'].change(presets.load_preset_for_ui, gradio('preset_menu', 'interface_state'), gradio('interface_state') + gradio(presets.presets_params())) + shared.gradio['random_preset'].click(presets.random_preset, gradio('interface_state'), gradio('interface_state') + gradio(presets.presets_params())) shared.gradio['grammar_file'].change(load_grammar, gradio('grammar_file'), gradio('grammar_string')) From 47d9e2618bd2c4edfbdd13376a6c1b0d5575da4d Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 18 Nov 2023 14:03:42 -0800 Subject: [PATCH 04/18] Refresh the Preset menu after saving a preset --- 
modules/ui_file_saving.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/modules/ui_file_saving.py b/modules/ui_file_saving.py index 1625c830..39ab41d4 100644 --- a/modules/ui_file_saving.py +++ b/modules/ui_file_saving.py @@ -37,6 +37,14 @@ def create_ui(): shared.gradio['delete_character_confirm'] = gr.Button('Delete', elem_classes="small-button", variant='stop', interactive=not mu) shared.gradio['delete_character_cancel'] = gr.Button('Cancel', elem_classes="small-button") + # Preset saver + with gr.Group(visible=False, elem_classes='file-saver') as shared.gradio['preset_saver']: + shared.gradio['save_preset_filename'] = gr.Textbox(lines=1, label='File name', info='The preset will be saved to your presets/ folder with this base filename.') + shared.gradio['save_preset_contents'] = gr.Textbox(lines=10, label='File contents') + with gr.Row(): + shared.gradio['save_preset_confirm'] = gr.Button('Save', elem_classes="small-button", variant='primary', interactive=not mu) + shared.gradio['save_preset_cancel'] = gr.Button('Cancel', elem_classes="small-button") + def create_event_handlers(): shared.gradio['save_confirm'].click( @@ -65,10 +73,16 @@ def create_event_handlers(): shared.gradio['save_preset'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - presets.generate_preset_yaml, gradio('interface_state'), gradio('save_contents')).then( - lambda: 'presets/', None, gradio('save_root')).then( - lambda: 'My Preset.yaml', None, gradio('save_filename')).then( - lambda: gr.update(visible=True), None, gradio('file_saver')) + presets.generate_preset_yaml, gradio('interface_state'), gradio('save_preset_contents')).then( + lambda: 'My Preset', None, gradio('save_preset_filename')).then( + lambda: gr.update(visible=True), None, gradio('preset_saver')) + + shared.gradio['save_preset_confirm'].click( + lambda x, y: utils.save_file(f'presets/{x}.yaml', y), gradio('save_preset_filename', 'save_preset_contents'), None).then( + lambda: gr.update(visible=False), None, gradio('preset_saver')).then( + lambda x: gr.update(choices=utils.get_available_presets(), value=x), gradio('save_preset_filename'), gradio('preset_menu')) + + shared.gradio['save_preset_cancel'].click(lambda: gr.update(visible=False), None, gradio('preset_saver')) shared.gradio['delete_preset'].click( lambda x: f'{x}.yaml', gradio('preset_menu'), gradio('delete_filename')).then( From baab89475989a4f7166b233339e90cb0b9a3cfee Mon Sep 17 00:00:00 2001 From: Jordan Tucker Date: Sat, 18 Nov 2023 17:20:13 -0600 Subject: [PATCH 05/18] fix: use system message in chat-instruct mode (#4648) --- modules/chat.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/modules/chat.py b/modules/chat.py index 4c518d33..dda16749 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -91,7 +91,12 @@ def generate_chat_prompt(user_input, state, **kwargs): if state['mode'] == 'chat-instruct': wrapper = '' command = state['chat-instruct_command'].replace('<|character|>', state['name2'] if not impersonate else state['name1']) - wrapper += state['context_instruct'] + context_instruct = state['context_instruct'] + if state['custom_system_message'].strip() != '': + context_instruct = context_instruct.replace('<|system-message|>', state['custom_system_message']) + else: + context_instruct = context_instruct.replace('<|system-message|>', state['system_message']) + wrapper += context_instruct wrapper += 
all_substrings['instruct']['user_turn'].replace('<|user-message|>', command) wrapper += all_substrings['instruct']['bot_turn_stripped'] if impersonate: From af76fbedb880f380953cf8ebd95d4a6b11eed1ef Mon Sep 17 00:00:00 2001 From: wizd Date: Sun, 19 Nov 2023 07:24:29 +0800 Subject: [PATCH 06/18] Openai embedding fix to support jina-embeddings-v2 (#4642) --- extensions/openai/embeddings.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/extensions/openai/embeddings.py b/extensions/openai/embeddings.py index fcdaab63..1420879c 100644 --- a/extensions/openai/embeddings.py +++ b/extensions/openai/embeddings.py @@ -1,6 +1,7 @@ import os import numpy as np +from transformers import AutoModel from extensions.openai.errors import ServiceUnavailableError from extensions.openai.utils import debug_msg, float_list_to_base64 @@ -41,7 +42,12 @@ def load_embedding_model(model: str): global embeddings_device, embeddings_model try: print(f"Try embedding model: {model} on {embeddings_device}") - embeddings_model = SentenceTransformer(model, device=embeddings_device) + if 'jina-embeddings' in model: + embeddings_model = AutoModel.from_pretrained(model, trust_remote_code=True) # trust_remote_code is needed to use the encode method + embeddings_model = embeddings_model.to(embeddings_device) + else: + embeddings_model = SentenceTransformer(model, device=embeddings_device) + print(f"Loaded embedding model: {model}") except Exception as e: embeddings_model = None From 8f4f4daf8bb7f17bff8e2813053f1aca45e85d8a Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 18 Nov 2023 22:33:27 -0300 Subject: [PATCH 07/18] Add --admin-key flag for API (#4649) --- README.md | 1 + extensions/openai/script.py | 50 ++++++++++++++++++++++++------------- modules/shared.py | 1 + 3 files changed, 34 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 3ffaaf10..8c2679cf 100644 --- a/README.md +++ b/README.md @@ -413,6 +413,7 @@ Optionally, you can use the following command-line flags: | `--public-api-id PUBLIC_API_ID` | Tunnel ID for named Cloudflare Tunnel. Use together with public-api option. | | `--api-port API_PORT` | The listening port for the API. | | `--api-key API_KEY` | API authentication key. | +| `--admin-key ADMIN_KEY` | API authentication key for admin tasks like loading and unloading models. If not set, will be the same as --api-key. 
| #### Multimodal diff --git a/extensions/openai/script.py b/extensions/openai/script.py index 2128444e..43d4b261 100644 --- a/extensions/openai/script.py +++ b/extensions/openai/script.py @@ -60,7 +60,15 @@ def verify_api_key(authorization: str = Header(None)) -> None: raise HTTPException(status_code=401, detail="Unauthorized") -app = FastAPI(dependencies=[Depends(verify_api_key)]) +def verify_admin_key(authorization: str = Header(None)) -> None: + expected_api_key = shared.args.admin_key + if expected_api_key and (authorization is None or authorization != f"Bearer {expected_api_key}"): + raise HTTPException(status_code=401, detail="Unauthorized") + + +app = FastAPI() +check_key = [Depends(verify_api_key)] +check_admin_key = [Depends(verify_admin_key)] # Configure CORS settings to allow all origins, methods, and headers app.add_middleware( @@ -72,12 +80,12 @@ app.add_middleware( ) -@app.options("/") +@app.options("/", dependencies=check_key) async def options_route(): return JSONResponse(content="OK") -@app.post('/v1/completions', response_model=CompletionResponse) +@app.post('/v1/completions', response_model=CompletionResponse, dependencies=check_key) async def openai_completions(request: Request, request_data: CompletionRequest): path = request.url.path is_legacy = "/generate" in path @@ -100,7 +108,7 @@ async def openai_completions(request: Request, request_data: CompletionRequest): return JSONResponse(response) -@app.post('/v1/chat/completions', response_model=ChatCompletionResponse) +@app.post('/v1/chat/completions', response_model=ChatCompletionResponse, dependencies=check_key) async def openai_chat_completions(request: Request, request_data: ChatCompletionRequest): path = request.url.path is_legacy = "/generate" in path @@ -123,8 +131,8 @@ async def openai_chat_completions(request: Request, request_data: ChatCompletion return JSONResponse(response) -@app.get("/v1/models") -@app.get("/v1/models/{model}") +@app.get("/v1/models", dependencies=check_key) +@app.get("/v1/models/{model}", dependencies=check_key) async def handle_models(request: Request): path = request.url.path is_list = request.url.path.split('?')[0].split('#')[0] == '/v1/models' @@ -138,7 +146,7 @@ async def handle_models(request: Request): return JSONResponse(response) -@app.get('/v1/billing/usage') +@app.get('/v1/billing/usage', dependencies=check_key) def handle_billing_usage(): ''' Ex. 
/v1/dashboard/billing/usage?start_date=2023-05-01&end_date=2023-05-31 @@ -146,7 +154,7 @@ def handle_billing_usage(): return JSONResponse(content={"total_usage": 0}) -@app.post('/v1/audio/transcriptions') +@app.post('/v1/audio/transcriptions', dependencies=check_key) async def handle_audio_transcription(request: Request): r = sr.Recognizer() @@ -176,7 +184,7 @@ async def handle_audio_transcription(request: Request): return JSONResponse(content=transcription) -@app.post('/v1/images/generations') +@app.post('/v1/images/generations', dependencies=check_key) async def handle_image_generation(request: Request): if not os.environ.get('SD_WEBUI_URL', params.get('sd_webui_url', '')): @@ -192,7 +200,7 @@ async def handle_image_generation(request: Request): return JSONResponse(response) -@app.post("/v1/embeddings", response_model=EmbeddingsResponse) +@app.post("/v1/embeddings", response_model=EmbeddingsResponse, dependencies=check_key) async def handle_embeddings(request: Request, request_data: EmbeddingsRequest): input = request_data.input if not input: @@ -205,7 +213,7 @@ async def handle_embeddings(request: Request, request_data: EmbeddingsRequest): return JSONResponse(response) -@app.post("/v1/moderations") +@app.post("/v1/moderations", dependencies=check_key) async def handle_moderations(request: Request): body = await request.json() input = body["input"] @@ -216,37 +224,37 @@ async def handle_moderations(request: Request): return JSONResponse(response) -@app.post("/v1/internal/encode", response_model=EncodeResponse) +@app.post("/v1/internal/encode", response_model=EncodeResponse, dependencies=check_key) async def handle_token_encode(request_data: EncodeRequest): response = token_encode(request_data.text) return JSONResponse(response) -@app.post("/v1/internal/decode", response_model=DecodeResponse) +@app.post("/v1/internal/decode", response_model=DecodeResponse, dependencies=check_key) async def handle_token_decode(request_data: DecodeRequest): response = token_decode(request_data.tokens) return JSONResponse(response) -@app.post("/v1/internal/token-count", response_model=TokenCountResponse) +@app.post("/v1/internal/token-count", response_model=TokenCountResponse, dependencies=check_key) async def handle_token_count(request_data: EncodeRequest): response = token_count(request_data.text) return JSONResponse(response) -@app.post("/v1/internal/stop-generation") +@app.post("/v1/internal/stop-generation", dependencies=check_key) async def handle_stop_generation(request: Request): stop_everything_event() return JSONResponse(content="OK") -@app.get("/v1/internal/model/info", response_model=ModelInfoResponse) +@app.get("/v1/internal/model/info", response_model=ModelInfoResponse, dependencies=check_key) async def handle_model_info(): payload = OAImodels.get_current_model_info() return JSONResponse(content=payload) -@app.post("/v1/internal/model/load") +@app.post("/v1/internal/model/load", dependencies=check_admin_key) async def handle_load_model(request_data: LoadModelRequest): ''' This endpoint is experimental and may change in the future. 
@@ -283,7 +291,7 @@ async def handle_load_model(request_data: LoadModelRequest): return HTTPException(status_code=400, detail="Failed to load the model.") -@app.post("/v1/internal/model/unload") +@app.post("/v1/internal/model/unload", dependencies=check_admin_key) async def handle_unload_model(): unload_model() return JSONResponse(content="OK") @@ -308,8 +316,14 @@ def run_server(): logger.info(f'OpenAI-compatible API URL:\n\nhttp://{server_addr}:{port}\n') if shared.args.api_key: + if not shared.args.admin_key: + shared.args.admin_key = shared.args.api_key + logger.info(f'OpenAI API key:\n\n{shared.args.api_key}\n') + if shared.args.admin_key: + logger.info(f'OpenAI API admin key (for loading/unloading models):\n\n{shared.args.admin_key}\n') + uvicorn.run(app, host=server_addr, port=port, ssl_certfile=ssl_certfile, ssl_keyfile=ssl_keyfile) diff --git a/modules/shared.py b/modules/shared.py index 54e72a6c..b139a2cf 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -170,6 +170,7 @@ parser.add_argument('--public-api', action='store_true', help='Create a public U parser.add_argument('--public-api-id', type=str, help='Tunnel ID for named Cloudflare Tunnel. Use together with public-api option.', default=None) parser.add_argument('--api-port', type=int, default=5000, help='The listening port for the API.') parser.add_argument('--api-key', type=str, default='', help='API authentication key.') +parser.add_argument('--admin-key', type=str, default='', help='API authentication key for admin tasks like loading and unloading models. If not set, will be the same as --api-key.') # Multimodal parser.add_argument('--multimodal-pipeline', type=str, default=None, help='The multimodal pipeline to use. Examples: llava-7b, llava-13b.') From 0fa1af296c18854722c348ae068e52551f6efe49 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 18 Nov 2023 23:19:31 -0300 Subject: [PATCH 08/18] Add /v1/internal/logits endpoint (#4650) --- docs/12 - OpenAI API.md | 23 +++++++++++++++++++++++ extensions/openai/script.py | 13 +++++++++++++ extensions/openai/typing.py | 26 ++++++++++++++++++++++---- modules/logits.py | 18 +++++++++++++----- 4 files changed, 71 insertions(+), 9 deletions(-) diff --git a/docs/12 - OpenAI API.md b/docs/12 - OpenAI API.md index 05b4db02..abbd432d 100644 --- a/docs/12 - OpenAI API.md +++ b/docs/12 - OpenAI API.md @@ -97,6 +97,29 @@ curl http://127.0.0.1:5000/v1/chat/completions \ }' ``` +#### Logits + +``` +curl -k http://127.0.0.1:5000/v1/internal/logits \ + -H "Content-Type: application/json" \ + -d '{ + "prompt": "Who is best, Asuka or Rei? Answer:", + "use_samplers": false + }' +``` + +#### Logits after sampling parameters + +``` +curl -k http://127.0.0.1:5000/v1/internal/logits \ + -H "Content-Type: application/json" \ + -d '{ + "prompt": "Who is best, Asuka or Rei? 
Answer:", + "use_samplers": true, + "top_k": 3 + }' +``` + #### Python chat example ```python diff --git a/extensions/openai/script.py b/extensions/openai/script.py index 43d4b261..da56287c 100644 --- a/extensions/openai/script.py +++ b/extensions/openai/script.py @@ -16,6 +16,7 @@ from sse_starlette import EventSourceResponse import extensions.openai.completions as OAIcompletions import extensions.openai.embeddings as OAIembeddings import extensions.openai.images as OAIimages +import extensions.openai.logits as OAIlogits import extensions.openai.models as OAImodels import extensions.openai.moderations as OAImoderations from extensions.openai.errors import ServiceUnavailableError @@ -38,6 +39,8 @@ from .typing import ( EncodeRequest, EncodeResponse, LoadModelRequest, + LogitsRequest, + LogitsResponse, ModelInfoResponse, TokenCountResponse, to_dict @@ -242,6 +245,16 @@ async def handle_token_count(request_data: EncodeRequest): return JSONResponse(response) +@app.post("/v1/internal/logits", response_model=LogitsResponse, dependencies=check_key) +async def handle_logits(request_data: LogitsRequest): + ''' + Given a prompt, returns the top 50 most likely logits as a dict. + The keys are the tokens, and the values are the probabilities. + ''' + response = OAIlogits._get_next_logits(to_dict(request_data)) + return JSONResponse(response) + + @app.post("/v1/internal/stop-generation", dependencies=check_key) async def handle_stop_generation(request: Request): stop_everything_event() diff --git a/extensions/openai/typing.py b/extensions/openai/typing.py index ee8f2ac6..05d3f753 100644 --- a/extensions/openai/typing.py +++ b/extensions/openai/typing.py @@ -126,15 +126,15 @@ class EncodeRequest(BaseModel): text: str -class DecodeRequest(BaseModel): - tokens: List[int] - - class EncodeResponse(BaseModel): tokens: List[int] length: int +class DecodeRequest(BaseModel): + tokens: List[int] + + class DecodeResponse(BaseModel): text: str @@ -143,6 +143,24 @@ class TokenCountResponse(BaseModel): length: int +class LogitsRequestParams(BaseModel): + prompt: str + use_samplers: bool = False + frequency_penalty: float | None = 0 + max_tokens: int | None = 16 + presence_penalty: float | None = 0 + temperature: float | None = 1 + top_p: float | None = 1 + + +class LogitsRequest(GenerationOptions, LogitsRequestParams): + pass + + +class LogitsResponse(BaseModel): + logits: dict + + class ModelInfoResponse(BaseModel): model_name: str lora_names: List[str] diff --git a/modules/logits.py b/modules/logits.py index e356a986..383659e0 100644 --- a/modules/logits.py +++ b/modules/logits.py @@ -8,7 +8,7 @@ from modules.text_generation import generate_reply global_scores = None -def get_next_logits(prompt, state, use_samplers, previous): +def get_next_logits(prompt, state, use_samplers, previous, return_dict=False): if shared.model is None: logger.error("No model is loaded! 
Select one in the Model tab.") return 'Error: No model is loaded1 Select one in the Model tab.', previous @@ -56,8 +56,16 @@ def get_next_logits(prompt, state, use_samplers, previous): topk_indices = [i.expand((1, 1)) for i in topk_indices] tokens = [shared.tokenizer.decode(i) for i in topk_indices] - output = '' - for row in list(zip(topk_values, tokens)): - output += f"{row[0]} - {repr(row[1])}\n" - return output, previous + if return_dict: + output = {} + for row in list(zip(topk_values, tokens)): + output[row[1]] = row[0] + + return output + else: + output = '' + for row in list(zip(topk_values, tokens)): + output += f"{row[0]} - {repr(row[1])}\n" + + return output, previous From ef6feedeb22c6c3d045f491200bd7237735b9f78 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 18 Nov 2023 23:38:39 -0300 Subject: [PATCH 09/18] Add --nowebui flag for pure API mode (#4651) --- README.md | 1 + extensions/openai/script.py | 5 ++++- modules/shared.py | 3 ++- server.py | 24 +++++++++++++++--------- 4 files changed, 22 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 8c2679cf..56e810b2 100644 --- a/README.md +++ b/README.md @@ -414,6 +414,7 @@ Optionally, you can use the following command-line flags: | `--api-port API_PORT` | The listening port for the API. | | `--api-key API_KEY` | API authentication key. | | `--admin-key ADMIN_KEY` | API authentication key for admin tasks like loading and unloading models. If not set, will be the same as --api-key. | +| `--nowebui` | Do not launch the Gradio UI. Useful for launching the API in standalone mode. | #### Multimodal diff --git a/extensions/openai/script.py b/extensions/openai/script.py index da56287c..a516b0f7 100644 --- a/extensions/openai/script.py +++ b/extensions/openai/script.py @@ -341,4 +341,7 @@ def run_server(): def setup(): - Thread(target=run_server, daemon=True).start() + if shared.args.nowebui: + run_server() + else: + Thread(target=run_server, daemon=True).start() diff --git a/modules/shared.py b/modules/shared.py index b139a2cf..344daf1d 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -171,6 +171,7 @@ parser.add_argument('--public-api-id', type=str, help='Tunnel ID for named Cloud parser.add_argument('--api-port', type=int, default=5000, help='The listening port for the API.') parser.add_argument('--api-key', type=str, default='', help='API authentication key.') parser.add_argument('--admin-key', type=str, default='', help='API authentication key for admin tasks like loading and unloading models. If not set, will be the same as --api-key.') +parser.add_argument('--nowebui', action='store_true', help='Do not launch the Gradio UI. Useful for launching the API in standalone mode.') # Multimodal parser.add_argument('--multimodal-pipeline', type=str, default=None, help='The multimodal pipeline to use. Examples: llava-7b, llava-13b.') @@ -201,7 +202,7 @@ for k in ['notebook', 'chat', 'no_stream', 'mul_mat_q', 'use_fast']: # Security warnings if args.trust_remote_code: logger.warning('trust_remote_code is enabled. This is dangerous.') -if 'COLAB_GPU' not in os.environ: +if 'COLAB_GPU' not in os.environ and not args.nowebui: if args.share: logger.warning("The gradio \"share link\" feature uses a proprietary executable to create a reverse tunnel. 
Use it with care.") if any((args.listen, args.share)) and not any((args.gradio_auth, args.gradio_auth_path)): diff --git a/server.py b/server.py index e9605e3b..cdd82e1d 100644 --- a/server.py +++ b/server.py @@ -226,13 +226,19 @@ if __name__ == "__main__": shared.generation_lock = Lock() - # Launch the web UI - create_interface() - while True: - time.sleep(0.5) - if shared.need_restart: - shared.need_restart = False + if shared.args.nowebui: + # Start the API in standalone mode + shared.args.extensions = [x for x in shared.args.extensions if x != 'gallery'] + if shared.args.extensions is not None and len(shared.args.extensions) > 0: + extensions_module.load_extensions() + else: + # Launch the web UI + create_interface() + while True: time.sleep(0.5) - shared.gradio['interface'].close() - time.sleep(0.5) - create_interface() + if shared.need_restart: + shared.need_restart = False + time.sleep(0.5) + shared.gradio['interface'].close() + time.sleep(0.5) + create_interface() From 771e62e4764260dd526a6e3386048a208bf12f87 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 19 Nov 2023 00:35:22 -0300 Subject: [PATCH 10/18] Add /v1/internal/lora endpoints (#4652) --- extensions/openai/models.py | 29 ++++++++++++++++++++--------- extensions/openai/script.py | 32 +++++++++++++++++++++++++++++++- extensions/openai/typing.py | 30 +++++++++++++++++++++--------- 3 files changed, 72 insertions(+), 19 deletions(-) diff --git a/extensions/openai/models.py b/extensions/openai/models.py index 1ff950a2..8a093ebe 100644 --- a/extensions/openai/models.py +++ b/extensions/openai/models.py @@ -1,8 +1,9 @@ from modules import shared from modules.logging_colors import logger +from modules.LoRA import add_lora_to_model from modules.models import load_model, unload_model from modules.models_settings import get_model_metadata, update_model_parameters -from modules.utils import get_available_models +from modules.utils import get_available_loras, get_available_models def get_current_model_info(): @@ -13,12 +14,17 @@ def get_current_model_info(): def list_models(): + return {'model_names': get_available_models()[1:]} + + +def list_dummy_models(): result = { "object": "list", "data": [] } - for model in get_dummy_models() + get_available_models()[1:]: + # these are expected by so much, so include some here as a dummy + for model in ['gpt-3.5-turbo', 'text-embedding-ada-002']: result["data"].append(model_info_dict(model)) return result @@ -33,13 +39,6 @@ def model_info_dict(model_name: str) -> dict: } -def get_dummy_models() -> list: - return [ # these are expected by so much, so include some here as a dummy - 'gpt-3.5-turbo', - 'text-embedding-ada-002', - ] - - def _load_model(data): model_name = data["model_name"] args = data["args"] @@ -67,3 +66,15 @@ def _load_model(data): logger.info(f"TRUNCATION LENGTH (UPDATED): {shared.settings['truncation_length']}") elif k == 'instruction_template': logger.info(f"INSTRUCTION TEMPLATE (UPDATED): {shared.settings['instruction_template']}") + + +def list_loras(): + return {'lora_names': get_available_loras()[1:]} + + +def load_loras(lora_names): + add_lora_to_model(lora_names) + + +def unload_all_loras(): + add_lora_to_model([]) diff --git a/extensions/openai/script.py b/extensions/openai/script.py index a516b0f7..047c339a 100644 --- a/extensions/openai/script.py +++ b/extensions/openai/script.py @@ -38,10 +38,13 @@ from .typing import ( EmbeddingsResponse, EncodeRequest, EncodeResponse, + LoadLorasRequest, LoadModelRequest, 
LogitsRequest, LogitsResponse, + LoraListResponse, ModelInfoResponse, + ModelListResponse, TokenCountResponse, to_dict ) @@ -141,7 +144,7 @@ async def handle_models(request: Request): is_list = request.url.path.split('?')[0].split('#')[0] == '/v1/models' if is_list: - response = OAImodels.list_models() + response = OAImodels.list_dummy_models() else: model_name = path[len('/v1/models/'):] response = OAImodels.model_info_dict(model_name) @@ -267,6 +270,12 @@ async def handle_model_info(): return JSONResponse(content=payload) +@app.get("/v1/internal/model/list", response_model=ModelListResponse, dependencies=check_admin_key) +async def handle_list_models(): + payload = OAImodels.list_models() + return JSONResponse(content=payload) + + @app.post("/v1/internal/model/load", dependencies=check_admin_key) async def handle_load_model(request_data: LoadModelRequest): ''' @@ -307,6 +316,27 @@ async def handle_load_model(request_data: LoadModelRequest): @app.post("/v1/internal/model/unload", dependencies=check_admin_key) async def handle_unload_model(): unload_model() + + +@app.get("/v1/internal/lora/list", response_model=LoraListResponse, dependencies=check_admin_key) +async def handle_list_loras(): + response = OAImodels.list_loras() + return JSONResponse(content=response) + + +@app.post("/v1/internal/lora/load", dependencies=check_admin_key) +async def handle_load_loras(request_data: LoadLorasRequest): + try: + OAImodels.load_loras(request_data.lora_names) + return JSONResponse(content="OK") + except: + traceback.print_exc() + return HTTPException(status_code=400, detail="Failed to apply the LoRA(s).") + + +@app.post("/v1/internal/lora/unload", dependencies=check_admin_key) +async def handle_unload_loras(): + OAImodels.unload_all_loras() return JSONResponse(content="OK") diff --git a/extensions/openai/typing.py b/extensions/openai/typing.py index 05d3f753..5a2d40d5 100644 --- a/extensions/openai/typing.py +++ b/extensions/openai/typing.py @@ -122,6 +122,19 @@ class ChatCompletionResponse(BaseModel): usage: dict +class EmbeddingsRequest(BaseModel): + input: str | List[str] + model: str | None = Field(default=None, description="Unused parameter. To change the model, set the OPENEDAI_EMBEDDING_MODEL and OPENEDAI_EMBEDDING_DEVICE environment variables before starting the server.") + encoding_format: str = Field(default="float", description="Can be float or base64.") + user: str | None = Field(default=None, description="Unused parameter.") + + +class EmbeddingsResponse(BaseModel): + index: int + embedding: List[float] + object: str = "embedding" + + class EncodeRequest(BaseModel): text: str @@ -166,23 +179,22 @@ class ModelInfoResponse(BaseModel): lora_names: List[str] +class ModelListResponse(BaseModel): + model_names: List[str] + + class LoadModelRequest(BaseModel): model_name: str args: dict | None = None settings: dict | None = None -class EmbeddingsRequest(BaseModel): - input: str | List[str] - model: str | None = Field(default=None, description="Unused parameter. 
To change the model, set the OPENEDAI_EMBEDDING_MODEL and OPENEDAI_EMBEDDING_DEVICE environment variables before starting the server.") - encoding_format: str = Field(default="float", description="Can be float or base64.") - user: str | None = Field(default=None, description="Unused parameter.") +class LoraListResponse(BaseModel): + lora_names: List[str] -class EmbeddingsResponse(BaseModel): - index: int - embedding: List[float] - object: str = "embedding" +class LoadLorasRequest(BaseModel): + lora_names: List[str] def to_json(obj): From cb836dd49c0e24b304455a496b63994329781ef8 Mon Sep 17 00:00:00 2001 From: Jordan Tucker Date: Sat, 18 Nov 2023 22:19:10 -0600 Subject: [PATCH 11/18] fix: use shared chat-instruct_command with api (#4653) --- extensions/openai/completions.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py index 99525b66..389466ff 100644 --- a/extensions/openai/completions.py +++ b/extensions/openai/completions.py @@ -203,6 +203,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False) - turn_template = body['turn_template'] or turn_template context_instruct = body['context_instruct'] or context_instruct system_message = body['system_message'] or system_message + chat_instruct_command = body['chat_instruct_command'] or shared.settings['chat-instruct_command'] # Chat character character = body['character'] or shared.settings['character'] @@ -228,7 +229,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False) - 'system_message': system_message, 'custom_system_message': custom_system_message, 'turn_template': turn_template, - 'chat-instruct_command': body['chat_instruct_command'], + 'chat-instruct_command': chat_instruct_command, 'history': history, 'stream': stream }) From 5fcee696ea3b0cc553a39213cdb5fbc2da0314c2 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 19 Nov 2023 02:05:17 -0300 Subject: [PATCH 12/18] New feature: enlarge character pictures on click (#4654) --- css/main.css | 16 ++++++++++++++++ js/main.js | 39 +++++++++++++++++++++++++++++++++++++++ js/update_big_picture.js | 7 +++++++ modules/chat.py | 15 ++++++++++----- modules/html_generator.py | 2 +- modules/ui.py | 2 ++ modules/ui_chat.py | 3 ++- 7 files changed, 77 insertions(+), 7 deletions(-) create mode 100644 js/update_big_picture.js diff --git a/css/main.css b/css/main.css index 47506c5e..b06d809d 100644 --- a/css/main.css +++ b/css/main.css @@ -648,3 +648,19 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { .options { z-index: 100 !important; } + +/* ---------------------------------------------- + Big profile picture for characters +---------------------------------------------- */ +.bigProfilePicture { + position: fixed; + bottom: 0; + left: 0; + width: calc((100vw - 880px - 120px) /2); +} + +@media screen and (width <= 1300px) { + .bigProfilePicture { + display: none; + } +} diff --git a/js/main.js b/js/main.js index 17d1d354..2e0c765b 100644 --- a/js/main.js +++ b/js/main.js @@ -312,6 +312,10 @@ document.addEventListener("click", function (event) { if (!isMouseOverButtonOrMenu() && menu.style.display === "flex") { hideMenu(); } + + if (event.target.classList.contains("pfp_character")) { + toggleBigPicture(); + } }); //------------------------------------------------ @@ -335,3 +339,38 @@ document.getElementById("show-controls").parentNode.style.bottom = "0px"; // Focus on the chat input 
//------------------------------------------------ document.querySelector("#chat-input textarea").focus(); + +//------------------------------------------------ +// Show enlarged character picture when the profile +// picture is clicked on +//------------------------------------------------ +let bigPictureVisible = false; + +function addBigPicture() { + var imgElement = document.createElement("img"); + var timestamp = new Date().getTime(); + imgElement.src = "/file/cache/pfp_character.png?time=" + timestamp; + imgElement.classList.add("bigProfilePicture"); + + var imgElementParent = document.getElementById("chat").parentNode.parentNode.parentNode.parentNode.parentNode.parentNode.parentNode; + imgElementParent.appendChild(imgElement); +} + +function deleteBigPicture() { + var bigProfilePictures = document.querySelectorAll('.bigProfilePicture'); + bigProfilePictures.forEach(function (element) { + element.parentNode.removeChild(element); + }); +} + +function toggleBigPicture() { + if(bigPictureVisible) { + deleteBigPicture(); + bigPictureVisible = false; + } else { + addBigPicture(); + bigPictureVisible = true; + } +} + +showBigPicture(); diff --git a/js/update_big_picture.js b/js/update_big_picture.js new file mode 100644 index 00000000..1984215a --- /dev/null +++ b/js/update_big_picture.js @@ -0,0 +1,7 @@ +function updateBigPicture() { + var existingElement = document.querySelector('.bigProfilePicture'); + if (existingElement) { + var timestamp = new Date().getTime(); + existingElement.src = "/file/cache/pfp_character.png?time=" + timestamp; + } +} diff --git a/modules/chat.py b/modules/chat.py index dda16749..436e3e43 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -544,9 +544,13 @@ def generate_pfp_cache(character): for path in [Path(f"characters/{character}.{extension}") for extension in ['png', 'jpg', 'jpeg']]: if path.exists(): - img = make_thumbnail(Image.open(path)) - img.save(Path('cache/pfp_character.png'), format='PNG') - return img + original_img = Image.open(path) + original_img.save(Path('cache/pfp_character.png'), format='PNG') + + thumb = make_thumbnail(original_img) + thumb.save(Path('cache/pfp_character_thumb.png'), format='PNG') + + return thumb return None @@ -575,8 +579,9 @@ def load_character(character, name1, name2, instruct=False): file_contents = open(filepath, 'r', encoding='utf-8').read() data = json.loads(file_contents) if extension == "json" else yaml.safe_load(file_contents) - if Path("cache/pfp_character.png").exists() and not instruct: - Path("cache/pfp_character.png").unlink() + for path in [Path("cache/pfp_character.png"), Path("cache/pfp_character_thumb.png")]: + if path.exists() and not instruct: + path.unlink() picture = generate_pfp_cache(character) diff --git a/modules/html_generator.py b/modules/html_generator.py index 26e47848..2a6509b3 100644 --- a/modules/html_generator.py +++ b/modules/html_generator.py @@ -225,7 +225,7 @@ def generate_cai_chat_html(history, name1, name2, style, reset_cache=False): output = f'
' # We use ?name2 and ?time.time() to force the browser to reset caches - img_bot = f'' if Path("cache/pfp_character.png").exists() else '' + img_bot = f'' if Path("cache/pfp_character_thumb.png").exists() else '' img_me = f'' if Path("cache/pfp_me.png").exists() else '' for i, _row in enumerate(history): diff --git a/modules/ui.py b/modules/ui.py index 383bc66f..9e2d6b6a 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -20,6 +20,8 @@ with open(Path(__file__).resolve().parent / '../js/switch_tabs.js', 'r') as f: switch_tabs_js = f.read() with open(Path(__file__).resolve().parent / '../js/show_controls.js', 'r') as f: show_controls_js = f.read() +with open(Path(__file__).resolve().parent / '../js/update_big_picture.js', 'r') as f: + update_big_picture_js = f.read() refresh_symbol = '🔄' delete_symbol = '🗑️' diff --git a/modules/ui_chat.py b/modules/ui_chat.py index b3cff3d6..40c8d71c 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -275,7 +275,8 @@ def create_event_handlers(): ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( chat.load_latest_history, gradio('interface_state'), gradio('history')).then( chat.redraw_html, gradio(reload_arr), gradio('display')).then( - lambda x: gr.update(choices=(histories := chat.find_all_histories(x)), value=histories[0]), gradio('interface_state'), gradio('unique_id')) + lambda x: gr.update(choices=(histories := chat.find_all_histories(x)), value=histories[0]), gradio('interface_state'), gradio('unique_id')).then( + lambda: None, None, None, _js=f'() => {{{ui.update_big_picture_js}; updateBigPicture()}}') shared.gradio['mode'].change( lambda x: gr.update(visible=x != 'instruct'), gradio('mode'), gradio('chat_style'), show_progress=False).then( From ab94f0d9bf0dddb8a7465bd40637f1e17e5dd832 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 18 Nov 2023 21:11:04 -0800 Subject: [PATCH 13/18] Minor style change --- modules/chat.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/chat.py b/modules/chat.py index 436e3e43..22b5bf9a 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -96,6 +96,7 @@ def generate_chat_prompt(user_input, state, **kwargs): context_instruct = context_instruct.replace('<|system-message|>', state['custom_system_message']) else: context_instruct = context_instruct.replace('<|system-message|>', state['system_message']) + wrapper += context_instruct wrapper += all_substrings['instruct']['user_turn'].replace('<|user-message|>', command) wrapper += all_substrings['instruct']['bot_turn_stripped'] From a290d1738601cd21ad8ade5b770f220efdc15dac Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 19 Nov 2023 06:53:41 -0800 Subject: [PATCH 14/18] Add hover cursor to bot pfp --- css/main.css | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/css/main.css b/css/main.css index b06d809d..a3480fe0 100644 --- a/css/main.css +++ b/css/main.css @@ -659,6 +659,10 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { width: calc((100vw - 880px - 120px) /2); } +.pfp_character:hover { + cursor: pointer; +} + @media screen and (width <= 1300px) { .bigProfilePicture { display: none; From a6f1e1bcc51ce9b0db62f095f23d329166a6ce9a Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 19 Nov 2023 07:55:25 -0800 Subject: [PATCH 15/18] Fix PEFT LoRA unloading --- modules/LoRA.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/modules/LoRA.py 
b/modules/LoRA.py index 4b119994..9c6edbf3 100644 --- a/modules/LoRA.py +++ b/modules/LoRA.py @@ -149,10 +149,7 @@ def add_lora_transformers(lora_names): # If any LoRA needs to be removed, start over if len(removed_set) > 0: - # shared.model may no longer be PeftModel - if hasattr(shared.model, 'disable_adapter'): - shared.model.disable_adapter() - shared.model = shared.model.base_model.model + shared.model = shared.model.unload() if len(lora_names) > 0: params = {} @@ -172,8 +169,6 @@ def add_lora_transformers(lora_names): if len(lora_names) > 1: merge_loras() - shared.lora_names = lora_names - if not shared.args.load_in_8bit and not shared.args.cpu: shared.model.half() if not hasattr(shared.model, "hf_device_map"): @@ -186,6 +181,8 @@ def add_lora_transformers(lora_names): else: shared.model = shared.model.cuda() + shared.lora_names = lora_names + def merge_loras(): if len(list({shared.model.peft_config[adapter].r for adapter in shared.model.peft_config.keys()})) > 1: From 78af3b0a008d47d549729d0ceb07223b14d002f4 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 19 Nov 2023 07:57:16 -0800 Subject: [PATCH 16/18] Update docs/What Works.md --- docs/What Works.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/What Works.md b/docs/What Works.md index 86936039..dba34a80 100644 --- a/docs/What Works.md +++ b/docs/What Works.md @@ -2,13 +2,13 @@ | Loader | Loading 1 LoRA | Loading 2 or more LoRAs | Training LoRAs | Multimodal extension | Perplexity evaluation | |----------------|----------------|-------------------------|----------------|----------------------|-----------------------| -| Transformers | ✅ | ✅ | ✅* | ✅ | ✅ | +| Transformers | ✅ | ✅*** | ✅* | ✅ | ✅ | | ExLlama_HF | ✅ | ❌ | ❌ | ❌ | ✅ | | ExLlamav2_HF | ✅ | ✅ | ❌ | ❌ | ✅ | | ExLlama | ✅ | ❌ | ❌ | ❌ | use ExLlama_HF | | ExLlamav2 | ✅ | ✅ | ❌ | ❌ | use ExLlamav2_HF | | AutoGPTQ | ✅ | ❌ | ❌ | ✅ | ✅ | -| GPTQ-for-LLaMa | ✅** | ✅ | ✅ | ✅ | ✅ | +| GPTQ-for-LLaMa | ✅** | ✅*** | ✅ | ✅ | ✅ | | llama.cpp | ❌ | ❌ | ❌ | ❌ | use llamacpp_HF | | llamacpp_HF | ❌ | ❌ | ❌ | ❌ | ✅ | | ctransformers | ❌ | ❌ | ❌ | ❌ | ❌ | @@ -21,3 +21,5 @@ \* Training LoRAs with GPTQ models also works with the Transformers loader. Make sure to check "auto-devices" and "disable_exllama" before loading the model. \*\* Requires the monkey-patch. The instructions can be found [here](https://github.com/oobabooga/text-generation-webui/wiki/08-%E2%80%90-Additional-Tips#using-loras-with-gptq-for-llama). + +\*\*\* Multi-LoRA in PEFT is tricky and the current implementation does not work reliably in all cases. 
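The PEFT change in PATCH 15 above removes a LoRA by calling `unload()` on the wrapped model instead of reaching into `base_model.model` directly. As a rough, minimal sketch of that attach/detach pattern outside the web UI (the model and adapter paths below are placeholders, not files from this repository), the flow looks like this:

```python
# Minimal sketch of the PEFT LoRA attach/detach pattern that the fix relies on.
# "models/my-base-model" and "loras/my-lora" are placeholder paths.
from peft import PeftModel
from transformers import AutoModelForCausalLM

# Load the plain transformers model first.
base_model = AutoModelForCausalLM.from_pretrained("models/my-base-model")

# Wrapping it with a LoRA adapter returns a PeftModel.
lora_model = PeftModel.from_pretrained(base_model, "loras/my-lora", adapter_name="my-lora")

# ... run generation with the adapter applied ...

# unload() strips the LoRA layers and hands back the underlying base model,
# which is the object the patch assigns back to shared.model.
base_model = lora_model.unload()
```

Under this reading, reassigning the result of `unload()` leaves the variable pointing at a plain transformers model again, so a later LoRA load starts from a clean state rather than stacking on top of a stale PEFT wrapper.
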
From 9da7bb203d4011cd743f79d144c9c298d4d1b089 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 19 Nov 2023 07:59:29 -0800 Subject: [PATCH 17/18] Minor LoRA bug fix --- modules/LoRA.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/LoRA.py b/modules/LoRA.py index 9c6edbf3..dea476ad 100644 --- a/modules/LoRA.py +++ b/modules/LoRA.py @@ -145,6 +145,7 @@ def add_lora_transformers(lora_names): if len(lora_names) > 1: merge_loras() + shared.lora_names = lora_names return # If any LoRA needs to be removed, start over From 8cf05c1b31cca57653549c2cf67e52ca713c2d62 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 19 Nov 2023 08:31:01 -0800 Subject: [PATCH 18/18] Fix disappearing character gallery --- extensions/gallery/script.py | 14 ++++++++------ js/main.js | 1 - js/show_controls.js | 6 ++++++ 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/extensions/gallery/script.py b/extensions/gallery/script.py index 611a11f4..efe96ba9 100644 --- a/extensions/gallery/script.py +++ b/extensions/gallery/script.py @@ -91,11 +91,13 @@ def ui(): with gr.Accordion("Character gallery", open=False, elem_id='gallery-extension'): update = gr.Button("Refresh") gr.HTML(value="") - gallery = gr.Dataset(components=[gr.HTML(visible=False)], - label="", - samples=generate_html(), - elem_classes=["character-gallery"], - samples_per_page=50 - ) + gallery = gr.Dataset( + components=[gr.HTML(visible=False)], + label="", + samples=generate_html(), + elem_classes=["character-gallery"], + samples_per_page=50 + ) + update.click(generate_html, [], gallery) gallery.select(select_character, None, gradio['character_menu']) diff --git a/js/main.js b/js/main.js index 2e0c765b..1e50e147 100644 --- a/js/main.js +++ b/js/main.js @@ -373,4 +373,3 @@ function toggleBigPicture() { } } -showBigPicture(); diff --git a/js/show_controls.js b/js/show_controls.js index 0173963b..0ba1ecde 100644 --- a/js/show_controls.js +++ b/js/show_controls.js @@ -10,6 +10,12 @@ function toggle_controls(value) { chatParent.classList.remove("bigchat"); document.getElementById("chat-input-row").classList.remove("bigchat"); document.getElementById("chat-col").classList.remove("bigchat"); + + let gallery_element = document.getElementById('gallery-extension'); + if (gallery_element) { + gallery_element.style.display = 'block'; + } + } else { belowChatInput.forEach(element => { element.style.display = "none";