mirror of
https://github.com/oobabooga/text-generation-webui.git
synced 2024-10-01 01:26:03 -04:00
Add --use_fast option (closes #3741)
This commit is contained in:
parent
b973b91d73
commit
d0d221df49
@ -269,6 +269,7 @@ Optionally, you can use the following command-line flags:
|
|||||||
| `--xformers` | Use xformer's memory efficient attention. This should increase your tokens/s. |
|
| `--xformers` | Use xformer's memory efficient attention. This should increase your tokens/s. |
|
||||||
| `--sdp-attention` | Use torch 2.0's sdp attention. |
|
| `--sdp-attention` | Use torch 2.0's sdp attention. |
|
||||||
| `--trust-remote-code` | Set trust_remote_code=True while loading a model. Necessary for ChatGLM and Falcon. |
|
| `--trust-remote-code` | Set trust_remote_code=True while loading a model. Necessary for ChatGLM and Falcon. |
|
||||||
|
| `--use_fast` | Set use_fast=True while loading a tokenizer. |
|
||||||
|
|
||||||
#### Accelerate 4-bit
|
#### Accelerate 4-bit
|
||||||
|
|
||||||
|
@ -20,6 +20,7 @@ loaders_and_params = OrderedDict({
|
|||||||
'quant_type',
|
'quant_type',
|
||||||
'compute_dtype',
|
'compute_dtype',
|
||||||
'trust_remote_code',
|
'trust_remote_code',
|
||||||
|
'use_fast',
|
||||||
'alpha_value',
|
'alpha_value',
|
||||||
'rope_freq_base',
|
'rope_freq_base',
|
||||||
'compress_pos_emb',
|
'compress_pos_emb',
|
||||||
@ -33,6 +34,7 @@ loaders_and_params = OrderedDict({
|
|||||||
'rope_freq_base',
|
'rope_freq_base',
|
||||||
'compress_pos_emb',
|
'compress_pos_emb',
|
||||||
'cfg_cache',
|
'cfg_cache',
|
||||||
|
'use_fast',
|
||||||
'exllama_HF_info',
|
'exllama_HF_info',
|
||||||
],
|
],
|
||||||
'ExLlamav2_HF': [
|
'ExLlamav2_HF': [
|
||||||
@ -41,6 +43,7 @@ loaders_and_params = OrderedDict({
|
|||||||
'cfg_cache',
|
'cfg_cache',
|
||||||
'alpha_value',
|
'alpha_value',
|
||||||
'compress_pos_emb',
|
'compress_pos_emb',
|
||||||
|
'use_fast',
|
||||||
],
|
],
|
||||||
'ExLlama': [
|
'ExLlama': [
|
||||||
'gpu_split',
|
'gpu_split',
|
||||||
@ -71,6 +74,7 @@ loaders_and_params = OrderedDict({
|
|||||||
'disk',
|
'disk',
|
||||||
'auto_devices',
|
'auto_devices',
|
||||||
'trust_remote_code',
|
'trust_remote_code',
|
||||||
|
'use_fast',
|
||||||
'autogptq_info',
|
'autogptq_info',
|
||||||
],
|
],
|
||||||
'GPTQ-for-LLaMa': [
|
'GPTQ-for-LLaMa': [
|
||||||
@ -78,6 +82,7 @@ loaders_and_params = OrderedDict({
|
|||||||
'groupsize',
|
'groupsize',
|
||||||
'model_type',
|
'model_type',
|
||||||
'pre_layer',
|
'pre_layer',
|
||||||
|
'use_fast',
|
||||||
'gptq_for_llama_info',
|
'gptq_for_llama_info',
|
||||||
],
|
],
|
||||||
'llama.cpp': [
|
'llama.cpp': [
|
||||||
@ -111,6 +116,7 @@ loaders_and_params = OrderedDict({
|
|||||||
'compress_pos_emb',
|
'compress_pos_emb',
|
||||||
'cpu',
|
'cpu',
|
||||||
'cfg_cache',
|
'cfg_cache',
|
||||||
|
'use_fast',
|
||||||
'llamacpp_HF_info',
|
'llamacpp_HF_info',
|
||||||
],
|
],
|
||||||
'ctransformers': [
|
'ctransformers': [
|
||||||
|
@ -99,18 +99,14 @@ def load_tokenizer(model_name, model):
|
|||||||
if any(s in model_name.lower() for s in ['gpt-4chan', 'gpt4chan']) and Path(f"{shared.args.model_dir}/gpt-j-6B/").exists():
|
if any(s in model_name.lower() for s in ['gpt-4chan', 'gpt4chan']) and Path(f"{shared.args.model_dir}/gpt-j-6B/").exists():
|
||||||
tokenizer = AutoTokenizer.from_pretrained(Path(f"{shared.args.model_dir}/gpt-j-6B/"))
|
tokenizer = AutoTokenizer.from_pretrained(Path(f"{shared.args.model_dir}/gpt-j-6B/"))
|
||||||
elif path_to_model.exists():
|
elif path_to_model.exists():
|
||||||
try:
|
if shared.args.use_fast:
|
||||||
tokenizer = AutoTokenizer.from_pretrained(
|
logger.info('Loading the tokenizer with use_fast=True.')
|
||||||
path_to_model,
|
|
||||||
trust_remote_code=shared.args.trust_remote_code,
|
tokenizer = AutoTokenizer.from_pretrained(
|
||||||
use_fast=False
|
path_to_model,
|
||||||
)
|
trust_remote_code=shared.args.trust_remote_code,
|
||||||
except ValueError:
|
use_fast=shared.args.use_fast
|
||||||
tokenizer = AutoTokenizer.from_pretrained(
|
)
|
||||||
path_to_model,
|
|
||||||
trust_remote_code=shared.args.trust_remote_code,
|
|
||||||
use_fast=True
|
|
||||||
)
|
|
||||||
|
|
||||||
return tokenizer
|
return tokenizer
|
||||||
|
|
||||||
@ -249,10 +245,13 @@ def llamacpp_HF_loader(model_name):
|
|||||||
logger.error("Could not load the model because a tokenizer in transformers format was not found. Please download oobabooga/llama-tokenizer.")
|
logger.error("Could not load the model because a tokenizer in transformers format was not found. Please download oobabooga/llama-tokenizer.")
|
||||||
return None, None
|
return None, None
|
||||||
|
|
||||||
|
if shared.args.use_fast:
|
||||||
|
logger.info('Loading the tokenizer with use_fast=True.')
|
||||||
|
|
||||||
tokenizer = AutoTokenizer.from_pretrained(
|
tokenizer = AutoTokenizer.from_pretrained(
|
||||||
path,
|
path,
|
||||||
trust_remote_code=shared.args.trust_remote_code,
|
trust_remote_code=shared.args.trust_remote_code,
|
||||||
use_fast=False
|
use_fast=shared.args.use_fast
|
||||||
)
|
)
|
||||||
|
|
||||||
model = LlamacppHF.from_pretrained(model_name)
|
model = LlamacppHF.from_pretrained(model_name)
|
||||||
|
@ -105,6 +105,7 @@ parser.add_argument('--no-cache', action='store_true', help='Set use_cache to Fa
|
|||||||
parser.add_argument('--xformers', action='store_true', help="Use xformer's memory efficient attention. This should increase your tokens/s.")
|
parser.add_argument('--xformers', action='store_true', help="Use xformer's memory efficient attention. This should increase your tokens/s.")
|
||||||
parser.add_argument('--sdp-attention', action='store_true', help="Use torch 2.0's sdp attention.")
|
parser.add_argument('--sdp-attention', action='store_true', help="Use torch 2.0's sdp attention.")
|
||||||
parser.add_argument('--trust-remote-code', action='store_true', help="Set trust_remote_code=True while loading a model. Necessary for ChatGLM and Falcon.")
|
parser.add_argument('--trust-remote-code', action='store_true', help="Set trust_remote_code=True while loading a model. Necessary for ChatGLM and Falcon.")
|
||||||
|
parser.add_argument('--use_fast', action='store_true', help="Set use_fast=True while loading a tokenizer.")
|
||||||
|
|
||||||
# Accelerate 4-bit
|
# Accelerate 4-bit
|
||||||
parser.add_argument('--load-in-4bit', action='store_true', help='Load the model with 4-bit precision (using bitsandbytes).')
|
parser.add_argument('--load-in-4bit', action='store_true', help='Load the model with 4-bit precision (using bitsandbytes).')
|
||||||
|
@ -52,6 +52,7 @@ def list_model_elements():
|
|||||||
'bf16',
|
'bf16',
|
||||||
'load_in_8bit',
|
'load_in_8bit',
|
||||||
'trust_remote_code',
|
'trust_remote_code',
|
||||||
|
'use_fast',
|
||||||
'load_in_4bit',
|
'load_in_4bit',
|
||||||
'compute_dtype',
|
'compute_dtype',
|
||||||
'quant_type',
|
'quant_type',
|
||||||
|
@ -115,6 +115,7 @@ def create_ui():
|
|||||||
shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='Split the model across multiple GPUs, comma-separated list of proportions, e.g. 18,17')
|
shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='Split the model across multiple GPUs, comma-separated list of proportions, e.g. 18,17')
|
||||||
shared.gradio['llama_cpp_seed'] = gr.Number(label='Seed (0 for random)', value=shared.args.llama_cpp_seed)
|
shared.gradio['llama_cpp_seed'] = gr.Number(label='Seed (0 for random)', value=shared.args.llama_cpp_seed)
|
||||||
shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Make sure to inspect the .py files inside the model folder before loading it with this option enabled.')
|
shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Make sure to inspect the .py files inside the model folder before loading it with this option enabled.')
|
||||||
|
shared.gradio['use_fast'] = gr.Checkbox(label="use_fast", value=shared.args.use_fast, info='Set use_fast=True while loading the tokenizer. May trigger a conversion that takes several minutes.')
|
||||||
shared.gradio['disable_exllama'] = gr.Checkbox(label="disable_exllama", value=shared.args.disable_exllama, info='Disable ExLlama kernel.')
|
shared.gradio['disable_exllama'] = gr.Checkbox(label="disable_exllama", value=shared.args.disable_exllama, info='Disable ExLlama kernel.')
|
||||||
shared.gradio['gptq_for_llama_info'] = gr.Markdown('GPTQ-for-LLaMa support is currently only kept for compatibility with older GPUs. AutoGPTQ or ExLlama is preferred when compatible. GPTQ-for-LLaMa is installed by default with the webui on supported systems. Otherwise, it has to be installed manually following the instructions here: [instructions](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md#installation-1).')
|
shared.gradio['gptq_for_llama_info'] = gr.Markdown('GPTQ-for-LLaMa support is currently only kept for compatibility with older GPUs. AutoGPTQ or ExLlama is preferred when compatible. GPTQ-for-LLaMa is installed by default with the webui on supported systems. Otherwise, it has to be installed manually following the instructions here: [instructions](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md#installation-1).')
|
||||||
shared.gradio['exllama_info'] = gr.Markdown('For more information, consult the [docs](https://github.com/oobabooga/text-generation-webui/blob/main/docs/ExLlama.md).')
|
shared.gradio['exllama_info'] = gr.Markdown('For more information, consult the [docs](https://github.com/oobabooga/text-generation-webui/blob/main/docs/ExLlama.md).')
|
||||||
|
Loading…
Reference in New Issue
Block a user