Set use_fast=True by default, create --no_use_fast flag

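Using the fast tokenizer increases tokens/second for the HF loaders. The old --use_fast flag is still accepted, but it is deprecated and now only triggers a deprecation warning.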
2024-10-01 01:26:03 -04:00 · 2023-11-16 19:45:05 -08:00 · 2023-11-16 19:45:05 -08:00 · 8b66d83aa9
commit 8b66d83aa9
parent b2ce8dc7ee
6 changed files with 20 additions and 19 deletions
--- a/docs/04
+++ b/docs/04
@ -28,7 +28,7 @@ Options:
 * **disk**: Enable disk offloading for layers that don't fit into the GPU and CPU combined.
 * **load-in-4bit**: Load the model in 4-bit precision using bitsandbytes.
 * **trust-remote-code**: Some models use custom Python code to load the model or the tokenizer. For such models, this option needs to be set. It doesn't download any remote content: all it does is execute the .py files that get downloaded with the model. Those files can potentially include malicious code; I have never seen it happen, but it is in principle possible.
-* **use_fast**: Use the "fast" version of the tokenizer. Especially useful for Llama models, which originally had a "slow" tokenizer that received an update. If your local files are in the old "slow" format, checking this option may trigger a conversion that takes several minutes. The fast tokenizer is mostly useful if you are generating 50+ tokens/second using ExLlama_HF or if you are tokenizing a huge dataset for training.
+* **no_use_fast**: Do not use the "fast" version of the tokenizer. Can usually be ignored; only check this if you can't load the tokenizer for your model otherwise.
 * **use_flash_attention_2**: Set use_flash_attention_2=True while loading the model. Possibly useful for training.
 * **disable_exllama**: Only applies when you are loading a GPTQ model through the transformers loader. It needs to be checked if you intend to train LoRAs with the model.

--- a/modules/loaders.py
+++ b/modules/loaders.py
@ -19,7 +19,7 @@ loaders_and_params = OrderedDict({
        'quant_type',
        'compute_dtype',
        'trust_remote_code',
-        'use_fast',
+        'no_use_fast',
        'use_flash_attention_2',
        'alpha_value',
        'rope_freq_base',
@ -34,7 +34,7 @@ loaders_and_params = OrderedDict({
        'rope_freq_base',
        'compress_pos_emb',
        'cfg_cache',
-        'use_fast',
+        'no_use_fast',
        'exllama_HF_info',
    ],
    'ExLlamav2_HF': [
@ -45,7 +45,7 @@ loaders_and_params = OrderedDict({
        'cache_8bit',
        'alpha_value',
        'compress_pos_emb',
-        'use_fast',
+        'no_use_fast',
    ],
    'ExLlama': [
        'gpu_split',
@ -78,7 +78,7 @@ loaders_and_params = OrderedDict({
        'disk',
        'auto_devices',
        'trust_remote_code',
-        'use_fast',
+        'no_use_fast',
        'autogptq_info',
    ],
    'GPTQ-for-LLaMa': [
@ -86,7 +86,7 @@ loaders_and_params = OrderedDict({
        'groupsize',
        'model_type',
        'pre_layer',
-        'use_fast',
+        'no_use_fast',
        'gptq_for_llama_info',
    ],
    'llama.cpp': [
@ -119,7 +119,7 @@ loaders_and_params = OrderedDict({
        'compress_pos_emb',
        'numa',
        'cfg_cache',
-        'use_fast',
+        'no_use_fast',
        'logits_all',
        'llamacpp_HF_info',
    ],
@ -139,7 +139,7 @@ loaders_and_params = OrderedDict({
        'max_seq_len',
        'no_inject_fused_attention',
        'trust_remote_code',
-        'use_fast',
+        'no_use_fast',
    ]
 })

--- a/modules/models.py
+++ b/modules/models.py
@ -114,13 +114,13 @@ def load_tokenizer(model_name, model):
    if any(s in model_name.lower() for s in ['gpt-4chan', 'gpt4chan']) and Path(f"{shared.args.model_dir}/gpt-j-6B/").exists():
        tokenizer = AutoTokenizer.from_pretrained(Path(f"{shared.args.model_dir}/gpt-j-6B/"))
    elif path_to_model.exists():
-        if shared.args.use_fast:
-            logger.info('Loading the tokenizer with use_fast=True.')
+        if shared.args.no_use_fast:
+            logger.info('Loading the tokenizer with use_fast=False.')

        tokenizer = AutoTokenizer.from_pretrained(
            path_to_model,
            trust_remote_code=shared.args.trust_remote_code,
-            use_fast=shared.args.use_fast
+            use_fast=not shared.args.no_use_fast
        )

    return tokenizer
@ -262,13 +262,13 @@ def llamacpp_HF_loader(model_name):
        logger.error("Could not load the model because a tokenizer in transformers format was not found. Please download oobabooga/llama-tokenizer.")
        return None, None

-    if shared.args.use_fast:
-        logger.info('Loading the tokenizer with use_fast=True.')
+    if shared.args.no_use_fast:
+        logger.info('Loading the tokenizer with use_fast=False.')

    tokenizer = AutoTokenizer.from_pretrained(
        path,
        trust_remote_code=shared.args.trust_remote_code,
-        use_fast=shared.args.use_fast
+        use_fast=not shared.args.no_use_fast
    )

    model = LlamacppHF.from_pretrained(model_name)
--- a/modules/shared.py
+++ b/modules/shared.py
@ -93,7 +93,7 @@ parser.add_argument('--xformers', action='store_true', help='Use xformer\'s memo
 parser.add_argument('--sdp-attention', action='store_true', help='Use PyTorch 2.0\'s SDP attention. Same as above.')
 parser.add_argument('--trust-remote-code', action='store_true', help='Set trust_remote_code=True while loading the model. Necessary for some models.')
 parser.add_argument('--force-safetensors', action='store_true', help='Set use_safetensors=True while loading the model. This prevents arbitrary code execution.')
-parser.add_argument('--use_fast', action='store_true', help='Set use_fast=True while loading the tokenizer.')
+parser.add_argument('--no_use_fast', action='store_true', help='Set use_fast=False while loading the tokenizer (it\'s True by default). Set this if you have any problems related to use_fast.')
 parser.add_argument('--use_flash_attention_2', action='store_true', help='Set use_flash_attention_2=True while loading the model.')

 # Accelerate 4-bit
@ -182,6 +182,7 @@ parser.add_argument('--mul_mat_q', action='store_true', help='DEPRECATED')
 parser.add_argument('--api-blocking-port', type=int, default=5000, help='DEPRECATED')
 parser.add_argument('--api-streaming-port', type=int, default=5005, help='DEPRECATED')
 parser.add_argument('--llama_cpp_seed', type=int, default=0, help='DEPRECATED')
+parser.add_argument('--use_fast', action='store_true', help='DEPRECATED')

 args = parser.parse_args()
 args_defaults = parser.parse_args([])
@ -192,7 +193,7 @@ for arg in sys.argv[1:]:
        provided_arguments.append(arg)

 # Deprecation warnings
-for k in ['chat', 'notebook', 'no_stream', 'mul_mat_q']:
+for k in ['notebook', 'chat', 'no_stream', 'mul_mat_q', 'use_fast']:
    if getattr(args, k):
        logger.warning(f'The --{k} flag has been deprecated and will be removed soon. Please remove that flag.')

--- a/modules/ui.py
+++ b/modules/ui.py
@ -52,7 +52,7 @@ def list_model_elements():
        'bf16',
        'load_in_8bit',
        'trust_remote_code',
-        'use_fast',
+        'no_use_fast',
        'use_flash_attention_2',
        'load_in_4bit',
        'compute_dtype',
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@ -109,7 +109,6 @@ def create_ui():
                            shared.gradio['no_use_cuda_fp16'] = gr.Checkbox(label="no_use_cuda_fp16", value=shared.args.no_use_cuda_fp16, info='This can make models faster on some systems.')
                            shared.gradio['desc_act'] = gr.Checkbox(label="desc_act", value=shared.args.desc_act, info='\'desc_act\', \'wbits\', and \'groupsize\' are used for old models without a quantize_config.json.')
                            shared.gradio['no_mul_mat_q'] = gr.Checkbox(label="no_mul_mat_q", value=shared.args.no_mul_mat_q, info='Disable the mulmat kernels.')
-                            shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Create an additional cache for CFG negative prompts.')
                            shared.gradio['no_mmap'] = gr.Checkbox(label="no-mmap", value=shared.args.no_mmap)
                            shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock)
                            shared.gradio['numa'] = gr.Checkbox(label="numa", value=shared.args.numa, info='NUMA support can help on some systems with non-uniform memory access.')
@ -122,12 +121,13 @@ def create_ui():
                            shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant)
                            shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='Split the model across multiple GPUs, comma-separated list of proportions, e.g. 18,17')
                            shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='To enable this option, start the web UI with the --trust-remote-code flag. It is necessary for some models.', interactive=shared.args.trust_remote_code)
-                            shared.gradio['use_fast'] = gr.Checkbox(label="use_fast", value=shared.args.use_fast, info='Set use_fast=True while loading the tokenizer. May trigger a conversion that takes several minutes.')
+                            shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Create an additional cache for CFG negative prompts.')
                            shared.gradio['logits_all'] = gr.Checkbox(label="logits_all", value=shared.args.logits_all, info='Needs to be set for perplexity evaluation to work. Otherwise, ignore it, as it makes prompt processing slower.')
                            shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.')
                            shared.gradio['disable_exllama'] = gr.Checkbox(label="disable_exllama", value=shared.args.disable_exllama, info='Disable ExLlama kernel.')
                            shared.gradio['no_flash_attn'] = gr.Checkbox(label="no_flash_attn", value=shared.args.no_flash_attn, info='Force flash-attention to not be used.')
                            shared.gradio['cache_8bit'] = gr.Checkbox(label="cache_8bit", value=shared.args.cache_8bit, info='Use 8-bit cache to save VRAM.')
+                            shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.')
                            shared.gradio['gptq_for_llama_info'] = gr.Markdown('GPTQ-for-LLaMa support is currently only kept for compatibility with older GPUs. AutoGPTQ or ExLlama is preferred when compatible. GPTQ-for-LLaMa is installed by default with the webui on supported systems. Otherwise, it has to be installed manually following the instructions here: [instructions](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md#installation-1).')
                            shared.gradio['exllama_info'] = gr.Markdown('For more information, consult the [docs](https://github.com/oobabooga/text-generation-webui/wiki/04-%E2%80%90-Model-Tab#exllama_hf).')
                            shared.gradio['exllama_HF_info'] = gr.Markdown('ExLlama_HF is a wrapper that lets you use ExLlama like a Transformers model, which means it can use the Transformers samplers. It\'s a bit slower than the regular ExLlama.')