Mirror of https://github.com/oobabooga/text-generation-webui.git
Add Support for Static NTK RoPE scaling for exllama/exllama_hf (#2955)

commit 10c8c197bf
parent 1610d5ffb2
@@ -269,6 +269,7 @@ Optionally, you can use the following command-line flags:
 |`--gpu-split`                         | Comma-separated list of VRAM (in GB) to use per GPU device for model layers, e.g. `20,7,7` |
 |`--max_seq_len MAX_SEQ_LEN`           | Maximum sequence length. |
 |`--compress_pos_emb COMPRESS_POS_EMB` | Positional embeddings compression factor. Should typically be set to max_seq_len / 2048. |
+|`--alpha_value ALPHA_VALUE`           | Positional embeddings alpha factor for NTK RoPE scaling. Same as above. Use either this or compress_pos_emb, not both. |
 
 #### GPTQ-for-LLaMa
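Note on the two flags above: compress_pos_emb stretches token positions linearly (positions are divided by the factor before the rotary embedding is applied), while alpha_value leaves positions alone and enlarges the RoPE frequency base instead. A rough standalone sketch of the difference, assuming a head dimension of 128 and the standard base of 10000 (illustration only, not code from this commit; the exponent dim/(dim-2) is the commonly used static-NTK formula):

import numpy as np

head_dim = 128                      # assumed LLaMA head dimension
base = 10000.0                      # standard RoPE base
positions = np.arange(4096, dtype=np.float64)

# Linear scaling (--compress_pos_emb): squeeze 4096 positions into the
# range the model saw during training.
compress_pos_emb = 2
scaled_positions = positions / compress_pos_emb

# Static NTK scaling (--alpha_value): keep positions, grow the base so the
# lowest frequencies stretch while the highest stay almost unchanged.
alpha_value = 2
ntk_base = base * alpha_value ** (head_dim / (head_dim - 2))

inv_freq = 1.0 / ntk_base ** (np.arange(0, head_dim, 2) / head_dim)
angles = np.outer(positions, inv_freq)   # what sin/cos are applied to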
@@ -53,13 +53,17 @@ class ExllamaModel:
         if shared.args.gpu_split:
             config.set_auto_map(shared.args.gpu_split)
             config.gpu_peer_fix = True
 
+        if shared.args.alpha_value:
+            config.alpha_value = shared.args.alpha_value
+            config.calculate_rotary_embedding_base()
+
         if torch.version.hip:
             config.rmsnorm_no_half2 = True
             config.rope_no_half2 = True
             config.matmul_no_half2 = True
             config.silu_no_half2 = True
 
         model = ExLlama(config)
         tokenizer = ExLlamaTokenizer(str(tokenizer_model_path))
         cache = ExLlamaCache(model)
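Note: calculate_rotary_embedding_base() comes from exllama's ExLlamaConfig. To my understanding it derives the effective rotary base from alpha_value roughly as sketched below (assumed formula and default values; the exllama source is authoritative):

# Assumed behaviour of ExLlamaConfig.calculate_rotary_embedding_base():
# scale the default base by alpha_value ** (head_dim / (head_dim - 2)).
def calculate_rotary_embedding_base(alpha_value, head_dim=128, base=10000.0):
    return base * alpha_value ** (head_dim / (head_dim - 2))

print(calculate_rotary_embedding_base(2))   # ~20221, roughly double the default base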
@@ -97,6 +97,11 @@ class ExllamaHF(PreTrainedModel):
         if shared.args.gpu_split:
             config.set_auto_map(shared.args.gpu_split)
             config.gpu_peer_fix = True
+
+        if shared.args.alpha_value:
+            config.alpha_value = shared.args.alpha_value
+            config.calculate_rotary_embedding_base()
+
         if torch.version.hip:
             config.rmsnorm_no_half2 = True
             config.rope_no_half2 = True
@@ -57,12 +57,14 @@ loaders_and_params = {
         'gpu_split',
         'max_seq_len',
         'compress_pos_emb',
+        'alpha_value',
         'exllama_info',
     ],
     'ExLlama_HF' : [
         'gpu_split',
         'max_seq_len',
         'compress_pos_emb',
+        'alpha_value',
         'exllama_HF_info',
     ]
 }
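Note: loaders_and_params maps each loader to the settings that apply to it, which is how 'alpha_value' becomes available for the two ExLlama loaders. A hypothetical helper (not the webui's own code) showing the kind of lookup such a mapping enables:

# Hypothetical helper, not part of this commit: filter a settings dict down
# to the keys relevant for a given loader.
loaders_and_params = {
    'ExLlama': ['gpu_split', 'max_seq_len', 'compress_pos_emb', 'alpha_value', 'exllama_info'],
    'ExLlama_HF': ['gpu_split', 'max_seq_len', 'compress_pos_emb', 'alpha_value', 'exllama_HF_info'],
}

def relevant_settings(loader, all_settings):
    keys = loaders_and_params.get(loader, [])
    return {k: v for k, v in all_settings.items() if k in keys}

print(relevant_settings('ExLlama', {'alpha_value': 2, 'max_seq_len': 4096, 'llama_cpp_seed': 0}))
# -> {'alpha_value': 2, 'max_seq_len': 4096}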
@@ -150,6 +150,7 @@ parser.add_argument('--desc_act', action='store_true', help='For models that don
 parser.add_argument('--gpu-split', type=str, help="Comma-separated list of VRAM (in GB) to use per GPU device for model layers, e.g. 20,7,7")
 parser.add_argument('--max_seq_len', type=int, default=2048, help="Maximum sequence length.")
 parser.add_argument('--compress_pos_emb', type=int, default=1, help="Positional embeddings compression factor. Should typically be set to max_seq_len / 2048.")
+parser.add_argument('--alpha_value', type=int, default=1, help="Positional embeddings alpha factor for NTK RoPE scaling. Same as above. Use either this or compress_pos_emb, not both.")
 
 # FlexGen
 parser.add_argument('--flexgen', action='store_true', help='DEPRECATED')
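Note: the help text says to use either --compress_pos_emb or --alpha_value, not both, but the commit does not add a guard for that. A minimal sketch of how a caller could warn on the conflict (hypothetical check, not part of this commit):

# Hypothetical guard, not part of this commit: both flags default to 1, so
# any value above 1 on both means the two RoPE scaling methods would stack.
args = parser.parse_args()
if args.compress_pos_emb > 1 and args.alpha_value > 1:
    print("Warning: use either --compress_pos_emb or --alpha_value, not both.")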
@@ -63,9 +63,11 @@ def list_model_elements():
         'llama_cpp_seed',
         'gpu_split',
         'max_seq_len',
-        'compress_pos_emb'
+        'compress_pos_emb',
+        'alpha_value'
     ]
 
     for i in range(torch.cuda.device_count()):
         elements.append(f'gpu_memory_{i}')
 
@@ -226,6 +226,7 @@ def create_model_menus():
                     shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
                     shared.gradio['max_seq_len'] = gr.Slider(label='max_seq_len', minimum=2048, maximum=16384, step=256, info='Maximum sequence length.', value=shared.args.max_seq_len)
                     shared.gradio['compress_pos_emb'] = gr.Slider(label='compress_pos_emb', minimum=1, maximum=8, step=1, info='Positional embeddings compression factor. Should typically be set to max_seq_len / 2048.', value=shared.args.compress_pos_emb)
+                    shared.gradio['alpha_value'] = gr.Slider(label='alpha_value', minimum=1, maximum=8, step=1, info='Positional embeddings alpha factor for NTK RoPE scaling. Same as above. Use either this or compress_pos_emb, not both.', value=shared.args.alpha_value)
 
                 with gr.Column():
                     shared.gradio['triton'] = gr.Checkbox(label="triton", value=shared.args.triton)
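Note: each widget mirrors the corresponding shared.args default. A minimal standalone sketch of the new slider's wiring (assumes the gradio package is installed; not the webui's own layout):

import gradio as gr

# Standalone demo of an alpha_value slider like the one added above,
# wired so the chosen value is echoed back when it changes.
with gr.Blocks() as demo:
    alpha = gr.Slider(label='alpha_value', minimum=1, maximum=8, step=1, value=1,
                      info='Positional embeddings alpha factor for NTK RoPE scaling.')
    chosen = gr.Number(label='chosen alpha_value')
    alpha.change(lambda a: a, inputs=alpha, outputs=chosen)

demo.launch()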