From 8a6d9abb414e41333eebf3234b22677db6253626 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 6 Feb 2024 06:26:27 -0800
Subject: [PATCH] Small fixes

---
 docs/04 - Model Tab.md | 4 ++++
 docs/What Works.md     | 1 +
 modules/shared.py      | 3 +--
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/docs/04 - Model Tab.md b/docs/04 - Model Tab.md
index 2585f544..762f85e8 100644
--- a/docs/04 - Model Tab.md
+++ b/docs/04 - Model Tab.md
@@ -47,6 +47,10 @@ Examples:
 * **no_flash_attn**: Disables flash attention. Otherwise, it is automatically used as long as the library is installed.
 * **cache_8bit**: Create a 8-bit precision cache instead of a 16-bit one. This saves VRAM but increases perplexity (I don't know by how much).
 
+### ExLlamav2
+
+The same as ExLlamav2_HF but using the internal samplers of ExLlamav2 instead of the ones in the Transformers library.
+
 ### AutoGPTQ
 
 Loads: GPTQ models.
diff --git a/docs/What Works.md b/docs/What Works.md
index 343343a1..354da1dd 100644
--- a/docs/What Works.md
+++ b/docs/What Works.md
@@ -6,6 +6,7 @@
 | llama.cpp | ❌ | ❌ | ❌ | ❌ | use llamacpp_HF |
 | llamacpp_HF | ❌ | ❌ | ❌ | ❌ | ✅ |
 | ExLlamav2_HF | ✅ | ✅ | ❌ | ❌ | ✅ |
+| ExLlamav2 | ✅ | ✅ | ❌ | ❌ | use ExLlamav2_HF |
 | AutoGPTQ | ✅ | ❌ | ❌ | ✅ | ✅ |
 | AutoAWQ | ? | ❌ | ? | ? | ✅ |
 | GPTQ-for-LLaMa | ✅\*\* | ✅\*\*\* | ✅ | ✅ | ✅ |
diff --git a/modules/shared.py b/modules/shared.py
index 78966617..5c81c1c7 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -50,7 +50,6 @@ settings = {
     'prompt_lookup_num_tokens': 0,
     'custom_stopping_strings': '',
     'custom_token_bans': '',
-    'sampler_priority': 'temperature,top_k,top_p,typical_p,epsilon_cutoff,eta_cutoff,tfs,top_a,min_p,dynamic_temperature,quadratic_sampling,mirostat',
     'auto_max_new_tokens': False,
     'ban_eos_token': False,
     'add_bos_token': True,
@@ -130,7 +129,7 @@ group.add_argument('--numa', action='store_true', help='Activate NUMA task alloc
 group.add_argument('--logits_all', action='store_true', help='Needs to be set for perplexity evaluation to work. Otherwise, ignore it, as it makes prompt processing slower.')
 group.add_argument('--no_offload_kqv', action='store_true', help='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.')
 group.add_argument('--cache-capacity', type=str, help='Maximum cache capacity (llama-cpp-python). Examples: 2000MiB, 2GiB. When provided without units, bytes will be assumed.')
-group.add_argument('--row_split', action='store_true', help='Split multi-gpu by row instead of layer. Faster on some cards.')
+group.add_argument('--row_split', action='store_true', help='Split the model by rows across GPUs. This may improve multi-gpu performance.')
 
 # ExLlama
 group = parser.add_argument_group('ExLlama')
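
Note: as a quick illustration of the reworded --row_split flag, below is a minimal, standalone Python sketch of how a boolean argparse switch like this one is declared and parsed. The parser/group setup here is simplified for the example and is not the project's full modules/shared.py.

import argparse

# Minimal sketch: reproduces only the llama.cpp flag touched by this patch,
# using the new help text from the diff above.
parser = argparse.ArgumentParser(description='Sketch of the llama.cpp argument group')
group = parser.add_argument_group('llama.cpp')
group.add_argument('--row_split', action='store_true', help='Split the model by rows across GPUs. This may improve multi-gpu performance.')

# store_true flags default to False and become True when passed on the command line.
args = parser.parse_args(['--row_split'])
print(args.row_split)  # True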