mirror of
https://github.com/oobabooga/text-generation-webui.git
synced 2024-10-01 01:26:03 -04:00
Set use_fast=True by default, create --no_use_fast flag
This increases tokens/second for HF loaders.
This commit is contained in:
parent
b2ce8dc7ee
commit
8b66d83aa9
@ -28,7 +28,7 @@ Options:
|
||||
* **disk**: Enable disk offloading for layers that don't fit into the GPU and CPU combined.
|
||||
* **load-in-4bit**: Load the model in 4-bit precision using bitsandbytes.
|
||||
* **trust-remote-code**: Some models use custom Python code to load the model or the tokenizer. For such models, this option needs to be set. It doesn't download any remote content: all it does is execute the .py files that get downloaded with the model. Those files can potentially include malicious code; I have never seen it happen, but it is in principle possible.
|
||||
* **use_fast**: Use the "fast" version of the tokenizer. Especially useful for Llama models, which originally had a "slow" tokenizer that received an update. If your local files are in the old "slow" format, checking this option may trigger a conversion that takes several minutes. The fast tokenizer is mostly useful if you are generating 50+ tokens/second using ExLlama_HF or if you are tokenizing a huge dataset for training.
|
||||
* **no_use_fast**: Do not use the "fast" version of the tokenizer. Can usually be ignored; check this only if your model's tokenizer fails to load otherwise.
|
||||
* **use_flash_attention_2**: Set use_flash_attention_2=True while loading the model. Possibly useful for training.
|
||||
* **disable_exllama**: Only applies when you are loading a GPTQ model through the transformers loader. It needs to be checked if you intend to train LoRAs with the model.
|
||||
|
||||
|
@ -19,7 +19,7 @@ loaders_and_params = OrderedDict({
|
||||
'quant_type',
|
||||
'compute_dtype',
|
||||
'trust_remote_code',
|
||||
'use_fast',
|
||||
'no_use_fast',
|
||||
'use_flash_attention_2',
|
||||
'alpha_value',
|
||||
'rope_freq_base',
|
||||
@ -34,7 +34,7 @@ loaders_and_params = OrderedDict({
|
||||
'rope_freq_base',
|
||||
'compress_pos_emb',
|
||||
'cfg_cache',
|
||||
'use_fast',
|
||||
'no_use_fast',
|
||||
'exllama_HF_info',
|
||||
],
|
||||
'ExLlamav2_HF': [
|
||||
@ -45,7 +45,7 @@ loaders_and_params = OrderedDict({
|
||||
'cache_8bit',
|
||||
'alpha_value',
|
||||
'compress_pos_emb',
|
||||
'use_fast',
|
||||
'no_use_fast',
|
||||
],
|
||||
'ExLlama': [
|
||||
'gpu_split',
|
||||
@ -78,7 +78,7 @@ loaders_and_params = OrderedDict({
|
||||
'disk',
|
||||
'auto_devices',
|
||||
'trust_remote_code',
|
||||
'use_fast',
|
||||
'no_use_fast',
|
||||
'autogptq_info',
|
||||
],
|
||||
'GPTQ-for-LLaMa': [
|
||||
@ -86,7 +86,7 @@ loaders_and_params = OrderedDict({
|
||||
'groupsize',
|
||||
'model_type',
|
||||
'pre_layer',
|
||||
'use_fast',
|
||||
'no_use_fast',
|
||||
'gptq_for_llama_info',
|
||||
],
|
||||
'llama.cpp': [
|
||||
@ -119,7 +119,7 @@ loaders_and_params = OrderedDict({
|
||||
'compress_pos_emb',
|
||||
'numa',
|
||||
'cfg_cache',
|
||||
'use_fast',
|
||||
'no_use_fast',
|
||||
'logits_all',
|
||||
'llamacpp_HF_info',
|
||||
],
|
||||
@ -139,7 +139,7 @@ loaders_and_params = OrderedDict({
|
||||
'max_seq_len',
|
||||
'no_inject_fused_attention',
|
||||
'trust_remote_code',
|
||||
'use_fast',
|
||||
'no_use_fast',
|
||||
]
|
||||
})
|
||||
|
||||
|
@ -114,13 +114,13 @@ def load_tokenizer(model_name, model):
|
||||
if any(s in model_name.lower() for s in ['gpt-4chan', 'gpt4chan']) and Path(f"{shared.args.model_dir}/gpt-j-6B/").exists():
|
||||
tokenizer = AutoTokenizer.from_pretrained(Path(f"{shared.args.model_dir}/gpt-j-6B/"))
|
||||
elif path_to_model.exists():
|
||||
if shared.args.use_fast:
|
||||
logger.info('Loading the tokenizer with use_fast=True.')
|
||||
if shared.args.no_use_fast:
|
||||
logger.info('Loading the tokenizer with use_fast=False.')
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(
|
||||
path_to_model,
|
||||
trust_remote_code=shared.args.trust_remote_code,
|
||||
use_fast=shared.args.use_fast
|
||||
use_fast=not shared.args.no_use_fast
|
||||
)
|
||||
|
||||
return tokenizer
|
||||
@ -262,13 +262,13 @@ def llamacpp_HF_loader(model_name):
|
||||
logger.error("Could not load the model because a tokenizer in transformers format was not found. Please download oobabooga/llama-tokenizer.")
|
||||
return None, None
|
||||
|
||||
if shared.args.use_fast:
|
||||
logger.info('Loading the tokenizer with use_fast=True.')
|
||||
if shared.args.no_use_fast:
|
||||
logger.info('Loading the tokenizer with use_fast=False.')
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(
|
||||
path,
|
||||
trust_remote_code=shared.args.trust_remote_code,
|
||||
use_fast=shared.args.use_fast
|
||||
use_fast=not shared.args.no_use_fast
|
||||
)
|
||||
|
||||
model = LlamacppHF.from_pretrained(model_name)
|
||||
|
@ -93,7 +93,7 @@ parser.add_argument('--xformers', action='store_true', help='Use xformer\'s memo
|
||||
parser.add_argument('--sdp-attention', action='store_true', help='Use PyTorch 2.0\'s SDP attention. Same as above.')
|
||||
parser.add_argument('--trust-remote-code', action='store_true', help='Set trust_remote_code=True while loading the model. Necessary for some models.')
|
||||
parser.add_argument('--force-safetensors', action='store_true', help='Set use_safetensors=True while loading the model. This prevents arbitrary code execution.')
|
||||
parser.add_argument('--use_fast', action='store_true', help='Set use_fast=True while loading the tokenizer.')
|
||||
parser.add_argument('--no_use_fast', action='store_true', help='Set use_fast=False while loading the tokenizer (it\'s True by default). Set this if you have any problems related to use_fast.')
|
||||
parser.add_argument('--use_flash_attention_2', action='store_true', help='Set use_flash_attention_2=True while loading the model.')
|
||||
|
||||
# Accelerate 4-bit
|
||||
@ -182,6 +182,7 @@ parser.add_argument('--mul_mat_q', action='store_true', help='DEPRECATED')
|
||||
parser.add_argument('--api-blocking-port', type=int, default=5000, help='DEPRECATED')
|
||||
parser.add_argument('--api-streaming-port', type=int, default=5005, help='DEPRECATED')
|
||||
parser.add_argument('--llama_cpp_seed', type=int, default=0, help='DEPRECATED')
|
||||
parser.add_argument('--use_fast', action='store_true', help='DEPRECATED')
|
||||
|
||||
args = parser.parse_args()
|
||||
args_defaults = parser.parse_args([])
|
||||
@ -192,7 +193,7 @@ for arg in sys.argv[1:]:
|
||||
provided_arguments.append(arg)
|
||||
|
||||
# Deprecation warnings
|
||||
for k in ['chat', 'notebook', 'no_stream', 'mul_mat_q']:
|
||||
for k in ['notebook', 'chat', 'no_stream', 'mul_mat_q', 'use_fast']:
|
||||
if getattr(args, k):
|
||||
logger.warning(f'The --{k} flag has been deprecated and will be removed soon. Please remove that flag.')
|
||||
|
||||
|
@ -52,7 +52,7 @@ def list_model_elements():
|
||||
'bf16',
|
||||
'load_in_8bit',
|
||||
'trust_remote_code',
|
||||
'use_fast',
|
||||
'no_use_fast',
|
||||
'use_flash_attention_2',
|
||||
'load_in_4bit',
|
||||
'compute_dtype',
|
||||
|
@ -109,7 +109,6 @@ def create_ui():
|
||||
shared.gradio['no_use_cuda_fp16'] = gr.Checkbox(label="no_use_cuda_fp16", value=shared.args.no_use_cuda_fp16, info='This can make models faster on some systems.')
|
||||
shared.gradio['desc_act'] = gr.Checkbox(label="desc_act", value=shared.args.desc_act, info='\'desc_act\', \'wbits\', and \'groupsize\' are used for old models without a quantize_config.json.')
|
||||
shared.gradio['no_mul_mat_q'] = gr.Checkbox(label="no_mul_mat_q", value=shared.args.no_mul_mat_q, info='Disable the mulmat kernels.')
|
||||
shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Create an additional cache for CFG negative prompts.')
|
||||
shared.gradio['no_mmap'] = gr.Checkbox(label="no-mmap", value=shared.args.no_mmap)
|
||||
shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock)
|
||||
shared.gradio['numa'] = gr.Checkbox(label="numa", value=shared.args.numa, info='NUMA support can help on some systems with non-uniform memory access.')
|
||||
@ -122,12 +121,13 @@ def create_ui():
|
||||
shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant)
|
||||
shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='Split the model across multiple GPUs, comma-separated list of proportions, e.g. 18,17')
|
||||
shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='To enable this option, start the web UI with the --trust-remote-code flag. It is necessary for some models.', interactive=shared.args.trust_remote_code)
|
||||
shared.gradio['use_fast'] = gr.Checkbox(label="use_fast", value=shared.args.use_fast, info='Set use_fast=True while loading the tokenizer. May trigger a conversion that takes several minutes.')
|
||||
shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Create an additional cache for CFG negative prompts.')
|
||||
shared.gradio['logits_all'] = gr.Checkbox(label="logits_all", value=shared.args.logits_all, info='Needs to be set for perplexity evaluation to work. Otherwise, ignore it, as it makes prompt processing slower.')
|
||||
shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.')
|
||||
shared.gradio['disable_exllama'] = gr.Checkbox(label="disable_exllama", value=shared.args.disable_exllama, info='Disable ExLlama kernel.')
|
||||
shared.gradio['no_flash_attn'] = gr.Checkbox(label="no_flash_attn", value=shared.args.no_flash_attn, info='Force flash-attention to not be used.')
|
||||
shared.gradio['cache_8bit'] = gr.Checkbox(label="cache_8bit", value=shared.args.cache_8bit, info='Use 8-bit cache to save VRAM.')
|
||||
shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.')
|
||||
shared.gradio['gptq_for_llama_info'] = gr.Markdown('GPTQ-for-LLaMa support is currently only kept for compatibility with older GPUs. AutoGPTQ or ExLlama is preferred when compatible. GPTQ-for-LLaMa is installed by default with the webui on supported systems. Otherwise, it has to be installed manually following the instructions here: [instructions](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md#installation-1).')
|
||||
shared.gradio['exllama_info'] = gr.Markdown('For more information, consult the [docs](https://github.com/oobabooga/text-generation-webui/wiki/04-%E2%80%90-Model-Tab#exllama_hf).')
|
||||
shared.gradio['exllama_HF_info'] = gr.Markdown('ExLlama_HF is a wrapper that lets you use ExLlama like a Transformers model, which means it can use the Transformers samplers. It\'s a bit slower than the regular ExLlama.')
|
||||
|
Loading…
Reference in New Issue
Block a user