2023-08-06 20:49:27 -04:00
import importlib
import math
import re
import traceback
from functools import partial
2023-08-29 22:32:36 -04:00
from pathlib import Path
2023-08-06 20:49:27 -04:00
import gradio as gr
import psutil
import torch
from modules import loaders , shared , ui , utils
from modules . logging_colors import logger
from modules . LoRA import add_lora_to_model
from modules . models import load_model , unload_model
from modules . models_settings import (
apply_model_settings_to_state ,
2023-09-11 17:49:30 -04:00
get_model_metadata ,
2023-08-06 20:49:27 -04:00
save_model_settings ,
update_model_parameters
)
from modules . utils import gradio
def create_ui():
    """Build the model tab and register every component in ``shared.gradio``.

    The function first derives default values for the per-GPU and CPU memory
    sliders from ``shared.args`` and the detected hardware, then instantiates
    the gradio components. Each component is stored in ``shared.gradio`` under
    a string key so that other code can look it up later.
    """
    args = shared.args
    elements = shared.gradio
    bytes_per_mib = 1024 * 1024

    # Total memory of every CUDA device reported by torch, floored to whole MiB.
    total_mem = [
        math.floor(torch.cuda.get_device_properties(device).total_memory / bytes_per_mib)
        for device in range(torch.cuda.device_count())
    ]

    # Defaults for the GPU sliders come from the gpu_memory argument. Entries
    # that mention ' mib ' (case-insensitively) are used as-is; every other
    # entry is multiplied by 1000.
    default_gpu_mem = []
    gpu_memory_args = args.gpu_memory
    if gpu_memory_args is not None and len(gpu_memory_args) > 0:
        for entry in gpu_memory_args:
            given_in_mib = ' mib ' in entry.lower()
            amount = int(re.sub(' [a-zA-Z ] ', ' ', entry))
            default_gpu_mem.append(amount if given_in_mib else amount * 1000)

    # Devices without an explicit value default to 0 (a non-positive repeat
    # count yields an empty list, so nothing is added when already long enough).
    default_gpu_mem.extend([0] * (len(total_mem) - len(default_gpu_mem)))

    total_cpu_mem = math.floor(psutil.virtual_memory().total / bytes_per_mib)
    if args.cpu_memory is None:
        default_cpu_mem = 0
    else:
        default_cpu_mem = re.sub(' [a-zA-Z ] ', ' ', args.cpu_memory)

    with gr.Tab(" Model ", elem_id=" model-tab "):
        # Top strip: model selection on the left, LoRA selection on the right.
        with gr.Row(), gr.Column(), gr.Row():
            with gr.Column(), gr.Row():
                elements[' model_menu '] = gr.Dropdown(
                    choices=utils.get_available_models(),
                    value=shared.model_name,
                    label=' Model ',
                    elem_classes=' slim-dropdown ',
                )
                ui.create_refresh_button(
                    elements[' model_menu '],
                    lambda: None,
                    lambda: {' choices ': utils.get_available_models()},
                    ' refresh-button ',
                )
                elements[' load_model '] = gr.Button(
                    " Load ",
                    visible=not shared.settings[' autoload_model '],
                    elem_classes=' refresh-button ',
                )
                elements[' unload_model '] = gr.Button(" Unload ", elem_classes=' refresh-button ')
                elements[' reload_model '] = gr.Button(" Reload ", elem_classes=' refresh-button ')
                elements[' save_model_settings '] = gr.Button(" Save settings ", elem_classes=' refresh-button ')

            with gr.Column(), gr.Row():
                elements[' lora_menu '] = gr.Dropdown(
                    multiselect=True,
                    choices=utils.get_available_loras(),
                    value=shared.lora_names,
                    label=' LoRA(s) ',
                    elem_classes=' slim-dropdown ',
                )
                ui.create_refresh_button(
                    elements[' lora_menu '],
                    lambda: None,
                    lambda: {' choices ': utils.get_available_loras(), ' value ': shared.lora_names},
                    ' refresh-button ',
                )
                elements[' lora_menu_apply '] = gr.Button(value=' Apply LoRAs ', elem_classes=' refresh-button ')

        with gr.Row():
            # Left: loader selection and all loader parameters.
            with gr.Column():
                elements[' loader '] = gr.Dropdown(
                    label=" Model loader ",
                    choices=loaders.loaders_and_params.keys(),
                    value=None,
                )
                with gr.Box(), gr.Row():
                    # Sliders, dropdowns and text inputs.
                    with gr.Column():
                        for device, (device_total, device_default) in enumerate(zip(total_mem, default_gpu_mem)):
                            elements[f' gpu_memory_ { device } '] = gr.Slider(
                                label=f" gpu-memory in MiB for device : { device } ",
                                maximum=device_total,
                                value=device_default,
                            )

                        elements[' cpu_memory '] = gr.Slider(
                            label=" cpu-memory in MiB ",
                            maximum=total_cpu_mem,
                            value=default_cpu_mem,
                        )
                        elements[' transformers_info '] = gr.Markdown(' load-in-4bit params: ')
                        elements[' compute_dtype '] = gr.Dropdown(
                            label=" compute_dtype ",
                            choices=[" bfloat16 ", " float16 ", " float32 "],
                            value=args.compute_dtype,
                        )
                        elements[' quant_type '] = gr.Dropdown(
                            label=" quant_type ",
                            choices=[" nf4 ", " fp4 "],
                            value=args.quant_type,
                        )
                        elements[' n_gpu_layers '] = gr.Slider(
                            label=" n-gpu-layers ",
                            minimum=0,
                            maximum=128,
                            value=args.n_gpu_layers,
                        )
                        elements[' n_ctx '] = gr.Slider(
                            minimum=0,
                            maximum=16384,
                            step=256,
                            label=" n_ctx ",
                            value=args.n_ctx,
                        )
                        elements[' threads '] = gr.Slider(
                            label=" threads ",
                            minimum=0,
                            step=1,
                            maximum=32,
                            value=args.threads,
                        )
                        elements[' n_batch '] = gr.Slider(
                            label=" n_batch ",
                            minimum=1,
                            maximum=2048,
                            value=args.n_batch,
                        )
                        elements[' wbits '] = gr.Dropdown(
                            label=" wbits ",
                            choices=[" None ", 1, 2, 3, 4, 8],
                            value=str(args.wbits) if args.wbits > 0 else " None ",
                        )
                        elements[' groupsize '] = gr.Dropdown(
                            label=" groupsize ",
                            choices=[" None ", 32, 64, 128, 1024],
                            value=str(args.groupsize) if args.groupsize > 0 else " None ",
                        )
                        elements[' model_type '] = gr.Dropdown(
                            label=" model_type ",
                            choices=[" None "],
                            value=args.model_type or " None ",
                        )
                        elements[' pre_layer '] = gr.Slider(
                            label=" pre_layer ",
                            minimum=0,
                            maximum=100,
                            value=args.pre_layer[0] if args.pre_layer is not None else 0,
                        )
                        elements[' autogptq_info '] = gr.Markdown(' * ExLlama_HF is recommended over AutoGPTQ for models derived from LLaMA. ')
                        elements[' gpu_split '] = gr.Textbox(
                            label=' gpu-split ',
                            info=' Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7 ',
                        )
                        elements[' max_seq_len '] = gr.Slider(
                            label=' max_seq_len ',
                            minimum=0,
                            maximum=16384,
                            step=256,
                            info=' Maximum sequence length. ',
                            value=args.max_seq_len,
                        )
                        elements[' alpha_value '] = gr.Slider(
                            label=' alpha_value ',
                            minimum=1,
                            maximum=8,
                            step=0.1,
                            info=' Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both. ',
                            value=args.alpha_value,
                        )
                        elements[' rope_freq_base '] = gr.Slider(
                            label=' rope_freq_base ',
                            minimum=0,
                            maximum=1000000,
                            step=1000,
                            info=' If greater than 0, will be used instead of alpha_value. Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63) ',
                            value=args.rope_freq_base,
                        )
                        elements[' compress_pos_emb '] = gr.Slider(
                            label=' compress_pos_emb ',
                            minimum=1,
                            maximum=8,
                            step=1,
                            info=' Positional embeddings compression factor. Should be set to (context length) / (model \' s original context length). Equal to 1/rope_freq_scale. ',
                            value=args.compress_pos_emb,
                        )

                    # Checkboxes, remaining inputs and informational notes.
                    with gr.Column():
                        elements[' triton '] = gr.Checkbox(label=" triton ", value=args.triton)
                        elements[' no_inject_fused_attention '] = gr.Checkbox(
                            label=" no_inject_fused_attention ",
                            value=args.no_inject_fused_attention,
                            info=' Disable fused attention. Fused attention improves inference performance but uses more VRAM. Disable if running low on VRAM. ',
                        )
                        elements[' no_inject_fused_mlp '] = gr.Checkbox(
                            label=" no_inject_fused_mlp ",
                            value=args.no_inject_fused_mlp,
                            info=' Affects Triton only. Disable fused MLP. Fused MLP improves performance but uses more VRAM. Disable if running low on VRAM. ',
                        )
                        elements[' no_use_cuda_fp16 '] = gr.Checkbox(
                            label=" no_use_cuda_fp16 ",
                            value=args.no_use_cuda_fp16,
                            info=' This can make models faster on some systems. ',
                        )
                        elements[' desc_act '] = gr.Checkbox(
                            label=" desc_act ",
                            value=args.desc_act,
                            info=' \' desc_act \' , \' wbits \' , and \' groupsize \' are used for old models without a quantize_config.json. ',
                        )
                        elements[' disable_exllama '] = gr.Checkbox(
                            label=" disable_exllama ",
                            value=args.disable_exllama,
                            info=' Disable ExLlama kernel, which can improve inference speed on some systems. ',
                        )
                        elements[' cpu '] = gr.Checkbox(label=" cpu ", value=args.cpu)
                        elements[' load_in_8bit '] = gr.Checkbox(label=" load-in-8bit ", value=args.load_in_8bit)
                        elements[' bf16 '] = gr.Checkbox(label=" bf16 ", value=args.bf16)
                        elements[' auto_devices '] = gr.Checkbox(label=" auto-devices ", value=args.auto_devices)
                        elements[' disk '] = gr.Checkbox(label=" disk ", value=args.disk)
                        elements[' load_in_4bit '] = gr.Checkbox(label=" load-in-4bit ", value=args.load_in_4bit)
                        elements[' use_double_quant '] = gr.Checkbox(label=" use_double_quant ", value=args.use_double_quant)
                        elements[' no_mmap '] = gr.Checkbox(label=" no-mmap ", value=args.no_mmap)
                        elements[' low_vram '] = gr.Checkbox(label=" low-vram ", value=args.low_vram)
                        elements[' mlock '] = gr.Checkbox(label=" mlock ", value=args.mlock)
                        elements[' mul_mat_q '] = gr.Checkbox(label=" mul_mat_q ", value=args.mul_mat_q)
                        elements[' cfg_cache '] = gr.Checkbox(
                            label=" cfg-cache ",
                            value=args.cfg_cache,
                            info=' Create an additional cache for CFG negative prompts. ',
                        )
                        elements[' tensor_split '] = gr.Textbox(
                            label=' tensor_split ',
                            info=' Split the model across multiple GPUs, comma-separated list of proportions, e.g. 18,17 ',
                        )
                        elements[' llama_cpp_seed '] = gr.Number(
                            label=' Seed (0 for random) ',
                            value=args.llama_cpp_seed,
                        )
                        elements[' trust_remote_code '] = gr.Checkbox(
                            label=" trust-remote-code ",
                            value=args.trust_remote_code,
                            info=' Make sure to inspect the .py files inside the model folder before loading it with this option enabled. ',
                        )
                        elements[' gptq_for_llama_info '] = gr.Markdown(' GPTQ-for-LLaMa support is currently only kept for compatibility with older GPUs. AutoGPTQ or ExLlama is preferred when compatible. GPTQ-for-LLaMa is installed by default with the webui on supported systems. Otherwise, it has to be installed manually following the instructions here: [instructions](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md#installation-1). ')
                        elements[' exllama_info '] = gr.Markdown(' For more information, consult the [docs](https://github.com/oobabooga/text-generation-webui/blob/main/docs/ExLlama.md). ')
                        elements[' exllama_HF_info '] = gr.Markdown(' ExLlama_HF is a wrapper that lets you use ExLlama like a Transformers model, which means it can use the Transformers samplers. It \' s a bit slower than the regular ExLlama. ')
                        elements[' llamacpp_HF_info '] = gr.Markdown(' llamacpp_HF loads llama.cpp as a Transformers model. To use it, you need to download a tokenizer. \n \n Option 1: download `oobabooga/llama-tokenizer` under " Download model or LoRA " . That \' s a default Llama tokenizer. \n \n Option 2: place your .gguf in a subfolder of models/ along with these 3 files: tokenizer.model, tokenizer_config.json, and special_tokens_map.json. This takes precedence over Option 1. ')

            # Right: autoload toggle, downloader inputs and the status line.
            with gr.Column():
                with gr.Row():
                    elements[' autoload_model '] = gr.Checkbox(
                        value=shared.settings[' autoload_model '],
                        label=' Autoload the model ',
                        info=' Whether to load the model as soon as it is selected in the Model dropdown. ',
                    )

                elements[' custom_model_menu '] = gr.Textbox(
                    label=" Download model or LoRA ",
                    info=" Enter the Hugging Face username/model path, for instance: facebook/galactica-125m. To specify a branch, add it at the end after a \" : \" character like this: facebook/galactica-125m:main. To download a single file, enter its name in the second box. ",
                )
                elements[' download_specific_file '] = gr.Textbox(
                    placeholder=" File name (for GGUF models) ",
                    show_label=False,
                    max_lines=1,
                )
                with gr.Row():
                    elements[' download_model_button '] = gr.Button(" Download ", variant=' primary ')
                    elements[' get_file_list '] = gr.Button(" Get file list ")

                with gr.Row():
                    elements[' model_status '] = gr.Markdown(
                        ' No model is loaded ' if shared.model_name == ' None ' else ' Ready '
                    )
def create_event_handlers():
    """Attach the callbacks of the model tab to its gradio components.

    Every multi-step reaction is built one step at a time: the event object
    returned by a registration call is kept in ``chain`` and the following
    step is attached to it with ``.then()`` or ``.success()``.
    """
    # Switching the loader toggles the visibility of its parameters and then
    # refreshes the choices offered by the model_type dropdown.
    chain = shared.gradio[' loader '].change(
        loaders.make_loader_params_visible,
        gradio(' loader '),
        gradio(loaders.get_all_params()),
    )
    chain.then(
        lambda value: gr.update(choices=loaders.get_model_types(value)),
        gradio(' loader '),
        gradio(' model_type '),
    )

    # Selecting a model: the interface state is read and updated with the
    # model defaults (if any), and then the model is loaded unless
    # "autoload_model" is unchecked.
    chain = shared.gradio[' model_menu '].change(
        ui.gather_interface_values,
        gradio(shared.input_elements),
        gradio(' interface_state '),
    )
    chain = chain.then(
        apply_model_settings_to_state,
        gradio(' model_menu ', ' interface_state '),
        gradio(' interface_state '),
    )
    chain = chain.then(
        ui.apply_interface_values,
        gradio(' interface_state '),
        gradio(ui.list_interface_input_elements()),
        show_progress=False,
    )
    chain = chain.then(update_model_parameters, gradio(' interface_state '), None)
    chain = chain.then(
        load_model_wrapper,
        gradio(' model_menu ', ' loader ', ' autoload_model '),
        gradio(' model_status '),
        show_progress=False,
    )
    chain.success(
        update_truncation_length,
        gradio(' truncation_length ', ' interface_state '),
        gradio(' truncation_length '),
    )

    # "Load" button: always loads, regardless of the autoload checkbox.
    chain = shared.gradio[' load_model '].click(
        ui.gather_interface_values,
        gradio(shared.input_elements),
        gradio(' interface_state '),
    )
    chain = chain.then(update_model_parameters, gradio(' interface_state '), None)
    chain = chain.then(
        partial(load_model_wrapper, autoload=True),
        gradio(' model_menu ', ' loader '),
        gradio(' model_status '),
        show_progress=False,
    )
    chain.success(
        update_truncation_length,
        gradio(' truncation_length ', ' interface_state '),
        gradio(' truncation_length '),
    )

    # "Unload" button.
    chain = shared.gradio[' unload_model '].click(unload_model, None, None)
    chain.then(lambda: " Model unloaded ", None, gradio(' model_status '))

    # "Reload" button: unload first, then run the same steps as "Load".
    chain = shared.gradio[' reload_model '].click(unload_model, None, None)
    chain = chain.then(
        ui.gather_interface_values,
        gradio(shared.input_elements),
        gradio(' interface_state '),
    )
    chain = chain.then(update_model_parameters, gradio(' interface_state '), None)
    chain = chain.then(
        partial(load_model_wrapper, autoload=True),
        gradio(' model_menu ', ' loader '),
        gradio(' model_status '),
        show_progress=False,
    )
    chain.success(
        update_truncation_length,
        gradio(' truncation_length ', ' interface_state '),
        gradio(' truncation_length '),
    )

    # "Save settings" button.
    chain = shared.gradio[' save_model_settings '].click(
        ui.gather_interface_values,
        gradio(shared.input_elements),
        gradio(' interface_state '),
    )
    chain.then(
        save_model_settings,
        gradio(' model_menu ', ' interface_state '),
        gradio(' model_status '),
        show_progress=False,
    )

    # Single-step handlers.
    shared.gradio[' lora_menu_apply '].click(
        load_lora_wrapper,
        gradio(' lora_menu '),
        gradio(' model_status '),
        show_progress=False,
    )
    shared.gradio[' download_model_button '].click(
        download_model_wrapper,
        gradio(' custom_model_menu ', ' download_specific_file '),
        gradio(' model_status '),
        show_progress=True,
    )
    shared.gradio[' get_file_list '].click(
        partial(download_model_wrapper, return_links=True),
        gradio(' custom_model_menu ', ' download_specific_file '),
        gradio(' model_status '),
        show_progress=True,
    )

    # The "Load" button is only shown while autoload is switched off.
    shared.gradio[' autoload_model '].change(
        lambda x: gr.update(visible=not x),
        gradio(' autoload_model '),
        gradio(' load_model '),
    )
def load_model_wrapper(selected_model, loader, autoload=False):
    """Load ``selected_model`` and yield status messages for the UI.

    This is a generator: each yielded string replaces the status text.

    Args:
        selected_model: Name of the model chosen in the model dropdown.
        loader: Loader name, forwarded to ``load_model``.
        autoload: When false, nothing is loaded; a single message telling the
            user that the settings were updated is yielded instead.

    Yields:
        Progress, success or failure messages. If loading raises, the
        formatted traceback is yielded.
    """
    if not autoload:
        yield f" The settings for ` { selected_model } ` have been updated. \n \n Click on \" Load \" to load it. "
        return

    if selected_model == ' None ':
        yield " No model selected "
        return

    try:
        yield f" Loading ` { selected_model } `... "
        shared.model_name = selected_model
        unload_model()
        if selected_model != ' ':
            shared.model, shared.tokenizer = load_model(shared.model_name, loader)

        if shared.model is not None:
            output = f" Successfully loaded ` { selected_model } `. "
            settings = get_model_metadata(selected_model)
            if ' instruction_template ' in settings:
                output += ' \n \n It seems to be an instruction-following model with template " {} " . In the chat tab, instruct or chat-instruct modes should be used. '.format(settings[' instruction_template '])

            yield output
        else:
            yield f" Failed to load ` { selected_model } `. "
    # Deliberately not a bare ``except``: a bare clause would also intercept
    # GeneratorExit when the generator is closed while suspended at one of the
    # yields above, and yielding from the handler would then raise
    # "RuntimeError: generator ignored GeneratorExit". It would also swallow
    # KeyboardInterrupt and SystemExit.
    except Exception:
        exc = traceback.format_exc()
        logger.error(' Failed to load the model. ')
        print(exc)
        yield exc.replace(' \n ', ' \n \n ')
def load_lora_wrapper(selected_loras):
    """Apply the selected LoRAs to the current model, yielding status text.

    Args:
        selected_loras: Iterable of LoRA names; passed to ``add_lora_to_model``.

    Yields:
        A message listing the LoRAs about to be applied, then a confirmation
        once ``add_lora_to_model`` has returned.
    """
    yield (" Applying the following LoRAs to {} : \n \n {} ".format(shared.model_name, ' \n '.join(selected_loras)))
    add_lora_to_model(selected_loras)
    # Spelling of the user-facing confirmation corrected ("Successfuly").
    yield (" Successfully applied the LoRAs ")
2023-08-29 22:32:36 -04:00
def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), return_links=False):
    """Download a model or LoRA, or list its files, yielding status messages.

    Args:
        repo_id: Repository identifier, optionally followed by a separator and
            a branch name (see the ``split`` call below). Without a branch,
            the default branch string is used.
        specific_file: Forwarded to the downloader as ``specific_file``.
        progress: Gradio progress tracker; called with fractions in [0, 1] and
            handed to the downloader as ``progress_bar``.
        return_links: When true, only the file names behind the download links
            are yielded and nothing is downloaded.

    Yields:
        Status messages for each step. If any step raises, the formatted
        traceback is yielded.
    """
    try:
        # The downloader module is loaded by name through importlib.
        downloader_module = importlib.import_module(" download-model ")
        downloader = downloader_module.ModelDownloader()

        # str.split always returns at least one element, so the first part
        # can be taken unconditionally.
        repo_id_parts = repo_id.split(" : ")
        model = repo_id_parts[0]
        branch = repo_id_parts[1] if len(repo_id_parts) > 1 else " main "

        # Hard-coded switch: when true, previously downloaded files are
        # verified instead of downloading.
        check = False

        progress(0.0)
        yield (" Cleaning up the model/branch names ")
        model, branch = downloader.sanitize_model_and_branch_names(model, branch)

        yield (" Getting the download links from Hugging Face ")
        links, sha256, is_lora, is_llamacpp = downloader.get_download_links_from_huggingface(model, branch, text_only=False, specific_file=specific_file)

        if return_links:
            yield ' \n \n '.join([f" ` { Path ( link ) . name } ` " for link in links])
            return

        yield (" Getting the output folder ")
        base_folder = shared.args.lora_dir if is_lora else shared.args.model_dir
        output_folder = downloader.get_output_folder(model, branch, is_lora, is_llamacpp=is_llamacpp, base_folder=base_folder)

        if check:
            progress(0.5)
            yield (" Checking previously downloaded files ")
            downloader.check_model_files(model, branch, links, sha256, output_folder)
            progress(1.0)
        else:
            yield (f" Downloading file { ' s ' if len ( links ) > 1 else ' ' } to ` { output_folder } /` ")
            downloader.download_model_files(model, branch, links, sha256, output_folder, progress_bar=progress, threads=1, is_llamacpp=is_llamacpp)
            yield (" Done! ")
    # Deliberately not a bare ``except``: a bare clause would also intercept
    # GeneratorExit when the generator is closed while suspended at a yield,
    # and yielding from the handler would then raise
    # "RuntimeError: generator ignored GeneratorExit".
    except Exception:
        progress(1.0)
        yield traceback.format_exc().replace(' \n ', ' \n \n ')
2023-08-27 02:10:45 -04:00
def update_truncation_length(current_length, state):
    """Return the truncation length that matches the loader stored in ``state``.

    Args:
        current_length: Value to keep when the loader is in neither group.
        state: Mapping holding the loader name and, for the loaders listed
            below, the corresponding length value.

    Returns:
        The max_seq_len entry for the first loader group, the n_ctx entry for
        the second group, otherwise ``current_length`` unchanged.
    """
    loader = state[' loader ']

    if loader in [' ExLlama ', ' ExLlama_HF ']:
        source_key = ' max_seq_len '
    elif loader in [' llama.cpp ', ' llamacpp_HF ', ' ctransformers ']:
        source_key = ' n_ctx '
    else:
        # Any other loader leaves the current value untouched.
        return current_length

    return state[source_key]