import argparse
import sys
from collections import OrderedDict
from pathlib import Path

import yaml

from modules.logging_colors import logger
# Model variables: shared, mutable module-level state for the currently
# loaded model. Other modules import this module and read/write these.
model = None
tokenizer = None
model_name = 'None'  # display name; the string 'None' (not the singleton) means "no model"
is_seq2seq = False
model_dirty_from_training = False
lora_names = []

# Generation variables
stop_everything = False  # set True to interrupt any in-progress generation
generation_lock = None   # initialized elsewhere; serializes generations
processing_message = '*Is typing...*'

# UI variables
gradio = {}                      # registry of gradio components by name
persistent_interface_state = {}  # UI state preserved across interactions
need_restart = False
# UI defaults: baseline interface settings, overridable via a settings.yaml
# file or the --settings command-line flag.
settings = {
    'dark_theme': True,
    'show_controls': True,
    'start_with': '',
    'mode': 'chat',
    'chat_style': 'cai-chat',
    'prompt-default': 'QA',
    'prompt-notebook': 'QA',
    'preset': 'simple-1',
    'max_new_tokens': 200,
    'max_new_tokens_min': 1,
    'max_new_tokens_max': 4096,
    'seed': -1,  # -1 means "random seed"
    'negative_prompt': '',
    'truncation_length': 2048,
    'truncation_length_min': 0,
    'truncation_length_max': 32768,
    'custom_stopping_strings': '',
    'auto_max_new_tokens': False,
    'max_tokens_second': 0,  # 0 disables the tokens/second throttle
    'ban_eos_token': False,
    'custom_token_bans': '',
    'add_bos_token': True,
    'skip_special_tokens': True,
    'stream': True,
    'name1': 'You',
    'character': 'Assistant',
    'instruction_template': 'Alpaca',
    'chat-instruct_command': 'Continue the chat dialogue below. Write a single reply for the character "<|character|>".\n\n<|prompt|>',
    'autoload_model': False,
    'default_extensions': ['gallery'],
}
def str2bool(v):
    """Parse a human-friendly boolean for use as an argparse ``type=``.

    Real bools pass through unchanged; common yes/no spellings are matched
    case-insensitively. Anything else raises ``argparse.ArgumentTypeError``
    so argparse reports a clean usage error instead of a traceback.
    """
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    else:
        raise argparse.ArgumentTypeError('Boolean value expected.')
# Command-line interface. max_help_position widens the option column so long
# flag names and their help text stay on one line.
parser = argparse.ArgumentParser(formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=54))

# Basic settings
parser.add_argument('--notebook', action='store_true', help='DEPRECATED')
parser.add_argument('--chat', action='store_true', help='DEPRECATED')
parser.add_argument('--multi-user', action='store_true', help='Multi-user mode. Chat histories are not saved or automatically loaded. WARNING: this is highly experimental.')
parser.add_argument('--character', type=str, help='The name of the character to load in chat mode by default.')
parser.add_argument('--model', type=str, help='Name of the model to load by default.')
parser.add_argument('--lora', type=str, nargs="+", help='The list of LoRAs to load. If you want to load more than one LoRA, write the names separated by spaces.')
parser.add_argument("--model-dir", type=str, default='models/', help="Path to directory with all the models")
parser.add_argument("--lora-dir", type=str, default='loras/', help="Path to directory with all the loras")
parser.add_argument('--model-menu', action='store_true', help='Show a model menu in the terminal when the web UI is first launched.')
parser.add_argument('--no-stream', action='store_true', help='DEPRECATED')
parser.add_argument('--settings', type=str, help='Load the default interface settings from this yaml file. See settings-template.yaml for an example. If you create a file called settings.yaml, this file will be loaded by default without the need to use the --settings flag.')
parser.add_argument('--extensions', type=str, nargs="+", help='The list of extensions to load. If you want to load more than one extension, write the names separated by spaces.')
parser.add_argument('--verbose', action='store_true', help='Print the prompts to the terminal.')
parser.add_argument('--chat-buttons', action='store_true', help='Show buttons on chat tab instead of hover menu.')
# Model loader
parser.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: transformers, autogptq, gptq-for-llama, exllama, exllama_hf, llamacpp, rwkv')

# Accelerate/transformers
parser.add_argument('--cpu', action='store_true', help='Use the CPU to generate text. Warning: Training on CPU is extremely slow.')
parser.add_argument('--auto-devices', action='store_true', help='Automatically split the model across the available GPU(s) and CPU.')
parser.add_argument('--gpu-memory', type=str, nargs="+", help='Maximum GPU memory in GiB to be allocated per GPU. Example: --gpu-memory 10 for a single GPU, --gpu-memory 10 5 for two GPUs. You can also set values in MiB like --gpu-memory 3500MiB.')
parser.add_argument('--cpu-memory', type=str, help='Maximum CPU memory in GiB to allocate for offloaded weights. Same as above.')
parser.add_argument('--disk', action='store_true', help='If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk.')
parser.add_argument('--disk-cache-dir', type=str, default="cache", help='Directory to save the disk cache to. Defaults to "cache".')
parser.add_argument('--load-in-8bit', action='store_true', help='Load the model with 8-bit precision (using bitsandbytes).')
parser.add_argument('--bf16', action='store_true', help='Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU.')
parser.add_argument('--no-cache', action='store_true', help='Set use_cache to False while generating text. This reduces the VRAM usage a bit at a performance cost.')
parser.add_argument('--xformers', action='store_true', help="Use xformer's memory efficient attention. This should increase your tokens/s.")
parser.add_argument('--sdp-attention', action='store_true', help="Use torch 2.0's sdp attention.")
parser.add_argument('--trust-remote-code', action='store_true', help="Set trust_remote_code=True while loading a model. Necessary for ChatGLM and Falcon.")
parser.add_argument('--use_fast', action='store_true', help="Set use_fast=True while loading a tokenizer.")

# Accelerate 4-bit
parser.add_argument('--load-in-4bit', action='store_true', help='Load the model with 4-bit precision (using bitsandbytes).')
parser.add_argument('--compute_dtype', type=str, default="float16", help="compute dtype for 4-bit. Valid options: bfloat16, float16, float32.")
parser.add_argument('--quant_type', type=str, default="nf4", help='quant_type for 4-bit. Valid options: nf4, fp4.')
parser.add_argument('--use_double_quant', action='store_true', help='use_double_quant for 4-bit.')

# llama.cpp
parser.add_argument('--threads', type=int, default=0, help='Number of threads to use.')
parser.add_argument('--threads-batch', type=int, default=0, help='Number of threads to use for batches/prompt processing.')
parser.add_argument('--n_batch', type=int, default=512, help='Maximum number of prompt tokens to batch together when calling llama_eval.')
parser.add_argument('--no-mmap', action='store_true', help='Prevent mmap from being used.')
parser.add_argument('--mlock', action='store_true', help='Force the system to keep the model in RAM.')
parser.add_argument('--mul_mat_q', action='store_true', help='Activate new mulmat kernels.')
parser.add_argument('--cache-capacity', type=str, help='Maximum cache capacity. Examples: 2000MiB, 2GiB. When provided without units, bytes will be assumed.')
parser.add_argument('--n-gpu-layers', type=int, default=0, help='Number of layers to offload to the GPU.')
parser.add_argument('--tensor_split', type=str, default=None, help="Split the model across multiple GPUs, comma-separated list of proportions, e.g. 18,17")
parser.add_argument('--n_ctx', type=int, default=2048, help='Size of the prompt context.')
parser.add_argument('--llama_cpp_seed', type=int, default=0, help='Seed for llama-cpp models. Default 0 (random)')
parser.add_argument('--numa', action='store_true', help='Activate NUMA task allocation for llama.cpp')
# GPTQ
parser.add_argument('--wbits', type=int, default=0, help='Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported.')
parser.add_argument('--model_type', type=str, help='Model type of pre-quantized model. Currently LLaMA, OPT, and GPT-J are supported.')
parser.add_argument('--groupsize', type=int, default=-1, help='Group size.')
parser.add_argument('--pre_layer', type=int, nargs="+", help='The number of layers to allocate to the GPU. Setting this parameter enables CPU offloading for 4-bit models. For multi-gpu, write the numbers separated by spaces, eg --pre_layer 30 60.')
parser.add_argument('--checkpoint', type=str, help='The path to the quantized checkpoint file. If not specified, it will be automatically detected.')
parser.add_argument('--monkey-patch', action='store_true', help='Apply the monkey patch for using LoRAs with quantized models.')

# AutoGPTQ
parser.add_argument('--triton', action='store_true', help='Use triton.')
parser.add_argument('--no_inject_fused_attention', action='store_true', help='Do not use fused attention (lowers VRAM requirements).')
parser.add_argument('--no_inject_fused_mlp', action='store_true', help='Triton mode only: Do not use fused MLP (lowers VRAM requirements).')
parser.add_argument('--no_use_cuda_fp16', action='store_true', help='This can make models faster on some systems.')
parser.add_argument('--desc_act', action='store_true', help='For models that don\'t have a quantize_config.json, this parameter is used to define whether to set desc_act or not in BaseQuantizeConfig.')
parser.add_argument('--disable_exllama', action='store_true', help='Disable ExLlama kernel, which can improve inference speed on some systems.')

# ExLlama
parser.add_argument('--gpu-split', type=str, help="Comma-separated list of VRAM (in GB) to use per GPU device for model layers, e.g. 20,7,7")
parser.add_argument('--max_seq_len', type=int, default=2048, help="Maximum sequence length.")
parser.add_argument('--cfg-cache', action='store_true', help="ExLlama_HF: Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader, but not necessary for CFG with base ExLlama.")

# DeepSpeed
parser.add_argument('--deepspeed', action='store_true', help='Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration.')
parser.add_argument('--nvme-offload-dir', type=str, help='DeepSpeed: Directory to use for ZeRO-3 NVME offloading.')
parser.add_argument('--local_rank', type=int, default=0, help='DeepSpeed: Optional argument for distributed setups.')

# RWKV
parser.add_argument('--rwkv-strategy', type=str, default=None, help='RWKV: The strategy to use while loading the model. Examples: "cpu fp32", "cuda fp16", "cuda fp16i8".')
parser.add_argument('--rwkv-cuda-on', action='store_true', help='RWKV: Compile the CUDA kernel for better performance.')

# RoPE
parser.add_argument('--alpha_value', type=float, default=1, help="Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both.")
parser.add_argument('--rope_freq_base', type=int, default=0, help="If greater than 0, will be used instead of alpha_value. Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63).")
parser.add_argument('--compress_pos_emb', type=int, default=1, help="Positional embeddings compression factor. Should be set to (context length) / (model's original context length). Equal to 1/rope_freq_scale.")
# Gradio
parser.add_argument('--listen', action='store_true', help='Make the web UI reachable from your local network.')
parser.add_argument('--listen-host', type=str, help='The hostname that the server will use.')
parser.add_argument('--listen-port', type=int, help='The listening port that the server will use.')
parser.add_argument('--share', action='store_true', help='Create a public URL. This is useful for running the web UI on Google Colab or similar.')
parser.add_argument('--auto-launch', action='store_true', default=False, help='Open the web UI in the default browser upon launch.')
parser.add_argument("--gradio-auth", type=str, help='set gradio authentication like "username:password"; or comma-delimit multiple like "u1:p1,u2:p2,u3:p3"', default=None)
parser.add_argument("--gradio-auth-path", type=str, help='Set the gradio authentication file path. The file should contain one or more user:password pairs in this format: "u1:p1,u2:p2,u3:p3"', default=None)
parser.add_argument("--ssl-keyfile", type=str, help='The path to the SSL certificate key file.', default=None)
parser.add_argument("--ssl-certfile", type=str, help='The path to the SSL certificate cert file.', default=None)

# API
parser.add_argument('--api', action='store_true', help='Enable the API extension.')
parser.add_argument('--api-blocking-port', type=int, default=5000, help='The listening port for the blocking API.')
parser.add_argument('--api-streaming-port', type=int, default=5005, help='The listening port for the streaming API.')
parser.add_argument('--public-api', action='store_true', help='Create a public URL for the API using Cloudfare.')
parser.add_argument('--public-api-id', type=str, help='Tunnel ID for named Cloudflare Tunnel. Use together with public-api option.', default=None)

# Multimodal
parser.add_argument('--multimodal-pipeline', type=str, default=None, help='The multimodal pipeline to use. Examples: llava-7b, llava-13b.')
args = parser.parse_args()
# Defaults-only namespace, used elsewhere to tell "user set this" apart
# from "this is the default value".
args_defaults = parser.parse_args([])

# Record which flags were explicitly passed on the command line, normalized
# to attribute form (strip leading dashes, dashes -> underscores).
provided_arguments = []
for arg in sys.argv[1:]:
    arg = arg.lstrip('-').replace('-', '_')
    if hasattr(args, arg):
        provided_arguments.append(arg)
# Deprecation warnings
for k in ['chat', 'notebook', 'no_stream']:
    if getattr(args, k):
        logger.warning(f'The --{k} flag has been deprecated and will be removed soon. Please remove that flag.')

# Security warnings
if args.trust_remote_code:
    logger.warning("trust_remote_code is enabled. This is dangerous.")
if args.share:
    logger.warning("The gradio \"share link\" feature uses a proprietary executable to create a reverse tunnel. Use it with care.")
if any((args.listen, args.share)) and not any((args.gradio_auth, args.gradio_auth_path)):
    logger.warning("\nYou are potentially exposing the web UI to the entire internet without any access password.\nYou can create one with the \"--gradio-auth\" flag like this:\n\n--gradio-auth username:password\n\nMake sure to replace username:password with your own.")
    # Nested on purpose: the multi-user warning only matters when the UI is
    # exposed without authentication.
    if args.multi_user:
        logger.warning("\nThe multi-user mode is highly experimental and should not be shared publicly.")
def fix_loader_name(name):
    """Normalize a user-supplied loader name to its canonical spelling.

    Matching is case-insensitive and tolerant of common separator variants
    (and a few known typos). Falsy input is returned unchanged; an
    unrecognized name implicitly returns None.
    """
    if not name:
        return name

    name = name.lower()
    if name in ['llamacpp', 'llama.cpp', 'llama-cpp', 'llama cpp']:
        return 'llama.cpp'
    if name in ['llamacpp_hf', 'llama.cpp_hf', 'llama-cpp-hf', 'llamacpp-hf', 'llama.cpp-hf']:
        return 'llamacpp_HF'
    elif name in ['transformers', 'huggingface', 'hf', 'hugging_face', 'hugging face']:
        return 'Transformers'
    elif name in ['autogptq', 'auto-gptq', 'auto_gptq', 'auto gptq']:
        return 'AutoGPTQ'
    elif name in ['gptq-for-llama', 'gptqforllama', 'gptqllama', 'gptq for llama', 'gptq_for_llama']:
        return 'GPTQ-for-LLaMa'
    elif name in ['exllama', 'ex-llama', 'ex_llama', 'exlama']:
        return 'ExLlama'
    elif name in ['exllama-hf', 'exllama_hf', 'exllama hf', 'ex-llama-hf', 'ex_llama_hf']:
        return 'ExLlama_HF'
    elif name in ['exllamav2', 'exllama-v2', 'ex_llama-v2', 'exlamav2', 'exlama-v2', 'exllama2', 'exllama-2']:
        return 'ExLlamav2'
    elif name in ['exllamav2-hf', 'exllamav2_hf', 'exllama-v2-hf', 'exllama_v2_hf', 'exllama-v2_hf', 'exllama2-hf', 'exllama2_hf', 'exllama-2-hf', 'exllama_2_hf', 'exllama-2_hf']:
        return 'ExLlamav2_HF'
    elif name in ['ctransformers', 'ctranforemrs', 'ctransformer']:
        return 'ctransformers'
    elif name in ['autoawq', 'awq', 'auto-awq']:
        return 'AutoAWQ'
def add_extension(name):
    """Append *name* to args.extensions, creating the list if needed.

    Fixed: the duplicate check previously tested the hardcoded string
    'api' instead of *name*, so once the api extension was active no other
    extension could be auto-added, and other names could be appended twice.
    """
    if args.extensions is None:
        args.extensions = [name]
    elif name not in args.extensions:
        args.extensions.append(name)
def is_chat():
    """Legacy compatibility shim: the interface is now always chat-capable."""
    return True
args.loader = fix_loader_name(args.loader)

# Activate the API extension
if args.api or args.public_api:
    add_extension('api')

# Activate the multimodal extension
if args.multimodal_pipeline is not None:
    add_extension('multimodal')

# Load model-specific settings.
# Fixed: the previous `with Path(...) as p:` relied on pathlib's context
# manager support, which was deprecated in Python 3.11 and removed in 3.13;
# `open(p).read()` also leaked a file handle. `or {}` guards against
# yaml.safe_load returning None for an empty file.
p = Path(f'{args.model_dir}/config.yaml')
if p.exists():
    model_config = yaml.safe_load(p.read_text(encoding='utf-8')) or {}
else:
    model_config = {}

# Load custom model-specific settings
p = Path(f'{args.model_dir}/config-user.yaml')
if p.exists():
    user_config = yaml.safe_load(p.read_text(encoding='utf-8')) or {}
else:
    user_config = {}

model_config = OrderedDict(model_config)
user_config = OrderedDict(user_config)