import importlib
import math
import re
import traceback
from functools import partial
from pathlib import Path

import gradio as gr
import psutil
import torch
from transformers import is_torch_xpu_available

from modules import loaders, shared, ui, utils
from modules.logging_colors import logger
from modules.LoRA import add_lora_to_model
from modules.models import load_model, unload_model
from modules.models_settings import (
    apply_model_settings_to_state,
    get_model_metadata,
    save_model_settings,
    update_model_parameters
)
from modules.utils import gradio


def create_ui():
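    # Populates shared.gradio with the elements of the "Model" tab and computes
    # default memory values for the GPU/CPU sliders below.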
    mu = shared.args.multi_user

    # Finding the default values for the GPU and CPU memories
    total_mem = []
    if is_torch_xpu_available():
        for i in range(torch.xpu.device_count()):
            total_mem.append(math.floor(torch.xpu.get_device_properties(i).total_memory / (1024 * 1024)))
    else:
        for i in range(torch.cuda.device_count()):
            total_mem.append(math.floor(torch.cuda.get_device_properties(i).total_memory / (1024 * 1024)))

    default_gpu_mem = []
    if shared.args.gpu_memory is not None and len(shared.args.gpu_memory) > 0:
        for i in shared.args.gpu_memory:
            if 'mib' in i.lower():
                default_gpu_mem.append(int(re.sub('[a-zA-Z ]', '', i)))
            else:
                default_gpu_mem.append(int(re.sub('[a-zA-Z ]', '', i)) * 1000)

    while len(default_gpu_mem) < len(total_mem):
        default_gpu_mem.append(0)

    total_cpu_mem = math.floor(psutil.virtual_memory().total / (1024 * 1024))
    if shared.args.cpu_memory is not None:
        default_cpu_mem = re.sub('[a-zA-Z ]', '', shared.args.cpu_memory)
    else:
        default_cpu_mem = 0
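    # Layout: model/LoRA selectors at the top, loader-specific parameters on the
    # left, download and autoload controls on the right.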
    with gr.Tab("Model", elem_id="model-tab"):
        with gr.Row():
            with gr.Column():
                with gr.Row():
                    with gr.Column():
                        with gr.Row():
                            shared.gradio['model_menu'] = gr.Dropdown(choices=utils.get_available_models(), value=lambda: shared.model_name, label='Model', elem_classes='slim-dropdown', interactive=not mu)
                            ui.create_refresh_button(shared.gradio['model_menu'], lambda: None, lambda: {'choices': utils.get_available_models()}, 'refresh-button', interactive=not mu)
                            shared.gradio['load_model'] = gr.Button("Load", visible=not shared.settings['autoload_model'], elem_classes='refresh-button', interactive=not mu)
                            shared.gradio['unload_model'] = gr.Button("Unload", elem_classes='refresh-button', interactive=not mu)
                            shared.gradio['reload_model'] = gr.Button("Reload", elem_classes='refresh-button', interactive=not mu)
                            shared.gradio['save_model_settings'] = gr.Button("Save settings", elem_classes='refresh-button', interactive=not mu)

                    with gr.Column():
                        with gr.Row():
                            shared.gradio['lora_menu'] = gr.Dropdown(multiselect=True, choices=utils.get_available_loras(), value=shared.lora_names, label='LoRA(s)', elem_classes='slim-dropdown', interactive=not mu)
                            ui.create_refresh_button(shared.gradio['lora_menu'], lambda: None, lambda: {'choices': utils.get_available_loras(), 'value': shared.lora_names}, 'refresh-button', interactive=not mu)
                            shared.gradio['lora_menu_apply'] = gr.Button(value='Apply LoRAs', elem_classes='refresh-button', interactive=not mu)
        with gr.Row():
            with gr.Column():
                shared.gradio['loader'] = gr.Dropdown(label="Model loader", choices=loaders.loaders_and_params.keys(), value=None)
                with gr.Box():
                    with gr.Row():
                        with gr.Column():
                            for i in range(len(total_mem)):
                                shared.gradio[f'gpu_memory_{i}'] = gr.Slider(label=f"gpu-memory in MiB for device :{i}", maximum=total_mem[i], value=default_gpu_mem[i])

                            shared.gradio['cpu_memory'] = gr.Slider(label="cpu-memory in MiB", maximum=total_cpu_mem, value=default_cpu_mem)
                            shared.gradio['transformers_info'] = gr.Markdown('load-in-4bit params:')
                            shared.gradio['compute_dtype'] = gr.Dropdown(label="compute_dtype", choices=["bfloat16", "float16", "float32"], value=shared.args.compute_dtype)
                            shared.gradio['quant_type'] = gr.Dropdown(label="quant_type", choices=["nf4", "fp4"], value=shared.args.quant_type)
                            shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend)

                            shared.gradio['n_gpu_layers'] = gr.Slider(label="n-gpu-layers", minimum=0, maximum=256, value=shared.args.n_gpu_layers)
                            shared.gradio['n_ctx'] = gr.Slider(minimum=0, maximum=shared.settings['truncation_length_max'], step=256, label="n_ctx", value=shared.args.n_ctx, info='Context length. Try lowering this if you run out of memory while loading the model.')
                            shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=32, value=shared.args.threads)
                            shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=32, value=shared.args.threads_batch)
                            shared.gradio['n_batch'] = gr.Slider(label="n_batch", minimum=1, maximum=2048, step=1, value=shared.args.n_batch)

                            shared.gradio['wbits'] = gr.Dropdown(label="wbits", choices=["None", 1, 2, 3, 4, 8], value=shared.args.wbits if shared.args.wbits > 0 else "None")
                            shared.gradio['groupsize'] = gr.Dropdown(label="groupsize", choices=["None", 32, 64, 128, 1024], value=shared.args.groupsize if shared.args.groupsize > 0 else "None")
                            shared.gradio['model_type'] = gr.Dropdown(label="model_type", choices=["None"], value=shared.args.model_type or "None")
                            shared.gradio['pre_layer'] = gr.Slider(label="pre_layer", minimum=0, maximum=100, value=shared.args.pre_layer[0] if shared.args.pre_layer is not None else 0)
                            shared.gradio['autogptq_info'] = gr.Markdown('ExLlamav2_HF is recommended over AutoGPTQ for models derived from Llama.')
                            shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
                            shared.gradio['max_seq_len'] = gr.Slider(label='max_seq_len', minimum=0, maximum=shared.settings['truncation_length_max'], step=256, info='Context length. Try lowering this if you run out of memory while loading the model.', value=shared.args.max_seq_len)
                            shared.gradio['alpha_value'] = gr.Slider(label='alpha_value', minimum=1, maximum=8, step=0.05, info='Positional embeddings alpha factor for NTK RoPE scaling. Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. Use either this or compress_pos_emb, not both.', value=shared.args.alpha_value)
                            shared.gradio['rope_freq_base'] = gr.Slider(label='rope_freq_base', minimum=0, maximum=1000000, step=1000, info='If greater than 0, will be used instead of alpha_value. Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63)', value=shared.args.rope_freq_base)
                            shared.gradio['compress_pos_emb'] = gr.Slider(label='compress_pos_emb', minimum=1, maximum=8, step=1, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.', value=shared.args.compress_pos_emb)
                            shared.gradio['quipsharp_info'] = gr.Markdown('QuIP# has to be installed manually at the moment.')
                        with gr.Column():
                            shared.gradio['tensorcores'] = gr.Checkbox(label="tensorcores", value=shared.args.tensorcores, info='Use llama-cpp-python compiled with tensor cores support. This increases performance on RTX cards. NVIDIA only.')
                            shared.gradio['no_offload_kqv'] = gr.Checkbox(label="no_offload_kqv", value=shared.args.no_offload_kqv, info='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.')
                            shared.gradio['row_split'] = gr.Checkbox(label="row_split", value=shared.args.row_split, info='Split the model by rows across GPUs. Improves performance on some cards.')
                            shared.gradio['triton'] = gr.Checkbox(label="triton", value=shared.args.triton)
                            shared.gradio['no_inject_fused_attention'] = gr.Checkbox(label="no_inject_fused_attention", value=shared.args.no_inject_fused_attention, info='Disable fused attention. Fused attention improves inference performance but uses more VRAM. Fuses layers for AutoAWQ. Disable if running low on VRAM.')
                            shared.gradio['no_inject_fused_mlp'] = gr.Checkbox(label="no_inject_fused_mlp", value=shared.args.no_inject_fused_mlp, info='Affects Triton only. Disable fused MLP. Fused MLP improves performance but uses more VRAM. Disable if running low on VRAM.')
                            shared.gradio['no_use_cuda_fp16'] = gr.Checkbox(label="no_use_cuda_fp16", value=shared.args.no_use_cuda_fp16, info='This can make models faster on some systems.')
                            shared.gradio['desc_act'] = gr.Checkbox(label="desc_act", value=shared.args.desc_act, info='\'desc_act\', \'wbits\', and \'groupsize\' are used for old models without a quantize_config.json.')
                            shared.gradio['no_mul_mat_q'] = gr.Checkbox(label="no_mul_mat_q", value=shared.args.no_mul_mat_q, info='Disable the mulmat kernels.')
                            shared.gradio['no_mmap'] = gr.Checkbox(label="no-mmap", value=shared.args.no_mmap)
                            shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock)
                            shared.gradio['numa'] = gr.Checkbox(label="numa", value=shared.args.numa, info='NUMA support can help on some systems with non-uniform memory access.')
                            shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu)
                            shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit)
                            shared.gradio['bf16'] = gr.Checkbox(label="bf16", value=shared.args.bf16)
                            shared.gradio['auto_devices'] = gr.Checkbox(label="auto-devices", value=shared.args.auto_devices)
                            shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk)
                            shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit)
                            shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant)
                            shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='Split the model across multiple GPUs. Comma-separated list of proportions, e.g. 18,17')
                            shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='To enable this option, start the web UI with the --trust-remote-code flag. It is necessary for some models.', interactive=shared.args.trust_remote_code)
                            shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Create an additional cache for CFG negative prompts.')
                            shared.gradio['logits_all'] = gr.Checkbox(label="logits_all", value=shared.args.logits_all, info='Needs to be set for perplexity evaluation to work. Otherwise, ignore it, as it makes prompt processing slower.')
                            shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.')
                            shared.gradio['disable_exllama'] = gr.Checkbox(label="disable_exllama", value=shared.args.disable_exllama, info='Disable ExLlama kernel.')
                            shared.gradio['disable_exllamav2'] = gr.Checkbox(label="disable_exllamav2", value=shared.args.disable_exllamav2, info='Disable ExLlamav2 kernel.')
                            shared.gradio['no_flash_attn'] = gr.Checkbox(label="no_flash_attn", value=shared.args.no_flash_attn, info='Force flash-attention to not be used.')
                            shared.gradio['cache_8bit'] = gr.Checkbox(label="cache_8bit", value=shared.args.cache_8bit, info='Use 8-bit cache to save VRAM.')
                            shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.')
                            shared.gradio['num_experts_per_token'] = gr.Number(label="Number of experts per token", value=shared.args.num_experts_per_token, info='Only applies to MoE models like Mixtral.')
                            shared.gradio['gptq_for_llama_info'] = gr.Markdown('Legacy loader for compatibility with older GPUs. ExLlamav2_HF or AutoGPTQ are preferred for GPTQ models when supported.')
                            shared.gradio['llamacpp_HF_info'] = gr.Markdown('llamacpp_HF loads llama.cpp as a Transformers model. To use it, you need to download a tokenizer.\n\nOption 1 (recommended): place your .gguf in a subfolder of models/ along with these 4 files: special_tokens_map.json, tokenizer_config.json, tokenizer.json, tokenizer.model.\n\nOption 2: download `oobabooga/llama-tokenizer` under "Download model or LoRA". That\'s a default Llama tokenizer that will work for some (but not all) models.')
            with gr.Column():
                with gr.Row():
                    shared.gradio['autoload_model'] = gr.Checkbox(value=shared.settings['autoload_model'], label='Autoload the model', info='Whether to load the model as soon as it is selected in the Model dropdown.', interactive=not mu)

                shared.gradio['custom_model_menu'] = gr.Textbox(label="Download model or LoRA", info="Enter the Hugging Face username/model path, for instance: facebook/galactica-125m. To specify a branch, add it at the end after a \":\" character like this: facebook/galactica-125m:main. To download a single file, enter its name in the second box.", interactive=not mu)
                shared.gradio['download_specific_file'] = gr.Textbox(placeholder="File name (for GGUF models)", show_label=False, max_lines=1, interactive=not mu)
                with gr.Row():
                    shared.gradio['download_model_button'] = gr.Button("Download", variant='primary', interactive=not mu)
                    shared.gradio['get_file_list'] = gr.Button("Get file list", interactive=not mu)

        with gr.Row():
            shared.gradio['model_status'] = gr.Markdown('No model is loaded' if shared.model_name == 'None' else 'Ready')


def create_event_handlers():
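    # Wire the elements created in create_ui() to the loading, unloading,
    # saving, and downloading functions below. Each handler is a chain of
    # .then()/.success() calls that keeps the interface state in sync.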
    shared.gradio['loader'].change(
        loaders.make_loader_params_visible, gradio('loader'), gradio(loaders.get_all_params())).then(
        lambda value: gr.update(choices=loaders.get_model_types(value)), gradio('loader'), gradio('model_type'))

    # In this event handler, the interface state is read and updated
    # with the model defaults (if any), and then the model is loaded
    # unless "autoload_model" is unchecked
    shared.gradio['model_menu'].change(
        ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
        apply_model_settings_to_state, gradio('model_menu', 'interface_state'), gradio('interface_state')).then(
        ui.apply_interface_values, gradio('interface_state'), gradio(ui.list_interface_input_elements()), show_progress=False).then(
        update_model_parameters, gradio('interface_state'), None).then(
        load_model_wrapper, gradio('model_menu', 'loader', 'autoload_model'), gradio('model_status'), show_progress=False).success(
        update_truncation_length, gradio('truncation_length', 'interface_state'), gradio('truncation_length')).then(
        lambda x: x, gradio('loader'), gradio('filter_by_loader'))
    shared.gradio['load_model'].click(
        ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
        update_model_parameters, gradio('interface_state'), None).then(
        partial(load_model_wrapper, autoload=True), gradio('model_menu', 'loader'), gradio('model_status'), show_progress=False).success(
        update_truncation_length, gradio('truncation_length', 'interface_state'), gradio('truncation_length')).then(
        lambda x: x, gradio('loader'), gradio('filter_by_loader'))

    shared.gradio['reload_model'].click(
        unload_model, None, None).then(
        ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
        update_model_parameters, gradio('interface_state'), None).then(
        partial(load_model_wrapper, autoload=True), gradio('model_menu', 'loader'), gradio('model_status'), show_progress=False).success(
        update_truncation_length, gradio('truncation_length', 'interface_state'), gradio('truncation_length')).then(
        lambda x: x, gradio('loader'), gradio('filter_by_loader'))

    shared.gradio['unload_model'].click(
        unload_model, None, None).then(
        lambda: "Model unloaded", None, gradio('model_status'))

    shared.gradio['save_model_settings'].click(
        ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
        save_model_settings, gradio('model_menu', 'interface_state'), gradio('model_status'), show_progress=False)

    shared.gradio['lora_menu_apply'].click(load_lora_wrapper, gradio('lora_menu'), gradio('model_status'), show_progress=False)
    shared.gradio['download_model_button'].click(download_model_wrapper, gradio('custom_model_menu', 'download_specific_file'), gradio('model_status'), show_progress=True)
    shared.gradio['get_file_list'].click(partial(download_model_wrapper, return_links=True), gradio('custom_model_menu', 'download_specific_file'), gradio('model_status'), show_progress=True)
    shared.gradio['autoload_model'].change(lambda x: gr.update(visible=not x), gradio('autoload_model'), gradio('load_model'))


def load_model_wrapper(selected_model, loader, autoload=False):
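    # Generator: yields status strings that Gradio streams into the
    # 'model_status' Markdown element while the model is being loaded.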
    if not autoload:
        yield f"The settings for `{selected_model}` have been updated.\n\nClick on \"Load\" to load it."
        return

    if selected_model == 'None':
        yield "No model selected"
    else:
        try:
            yield f"Loading `{selected_model}`..."
            unload_model()
            if selected_model != '':
                shared.model, shared.tokenizer = load_model(selected_model, loader)

            if shared.model is not None:
                output = f"Successfully loaded `{selected_model}`."

                settings = get_model_metadata(selected_model)
                if 'instruction_template' in settings:
                    output += '\n\nIt seems to be an instruction-following model with template "{}". In the chat tab, instruct or chat-instruct modes should be used.'.format(settings['instruction_template'])

                yield output
            else:
                yield f"Failed to load `{selected_model}`."
        except:
            exc = traceback.format_exc()
            logger.error('Failed to load the model.')
            print(exc)
            yield exc.replace('\n', '\n\n')


def load_lora_wrapper(selected_loras):
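    # Generator: applies the selected LoRAs to the currently loaded model and
    # reports progress through 'model_status'.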
yield ( " Applying the following LoRAs to {} : \n \n {} " . format ( shared . model_name , ' \n ' . join ( selected_loras ) ) )
add_lora_to_model ( selected_loras )
yield ( " Successfuly applied the LoRAs " )


def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), return_links=False, check=False):
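    # Generator: downloads a model from Hugging Face (or only lists its files
    # when return_links=True) using the ModelDownloader class from
    # download-model.py, reporting progress through 'model_status'.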
    try:
        progress(0.0)
        downloader = importlib.import_module("download-model").ModelDownloader()
        model, branch = downloader.sanitize_model_and_branch_names(repo_id, None)

        yield ("Getting the download links from Hugging Face")
        links, sha256, is_lora, is_llamacpp = downloader.get_download_links_from_huggingface(model, branch, text_only=False, specific_file=specific_file)
        if return_links:
            yield '\n\n'.join([f"`{Path(link).name}`" for link in links])
            return

        yield ("Getting the output folder")
        base_folder = shared.args.lora_dir if is_lora else shared.args.model_dir
        output_folder = downloader.get_output_folder(model, branch, is_lora, is_llamacpp=is_llamacpp, base_folder=base_folder)
        if check:
            progress(0.5)
            yield ("Checking previously downloaded files")
            downloader.check_model_files(model, branch, links, sha256, output_folder)
            progress(1.0)
        else:
            yield (f"Downloading file{'s' if len(links) > 1 else ''} to `{output_folder}/`")
            downloader.download_model_files(model, branch, links, sha256, output_folder, progress_bar=progress, threads=4, is_llamacpp=is_llamacpp)
            yield ("Done!")
    except:
        progress(1.0)
        yield traceback.format_exc().replace('\n', '\n\n')


def update_truncation_length(current_length, state):
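    # After a successful load, sync the truncation length with the context
    # length chosen for the loader: max_seq_len for ExLlama-based loaders,
    # n_ctx for llama.cpp-based ones. Otherwise keep the current value.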
    if 'loader' in state:
        if state['loader'].lower().startswith('exllama'):
            return state['max_seq_len']
        elif state['loader'] in ['llama.cpp', 'llamacpp_HF', 'ctransformers']:
            return state['n_ctx']

    return current_length