# General options: multi-user mode, LoRA selection, model/LoRA directories,
# the interface settings file, and extensions to load.
parser.add_argument(
    '--multi-user',
    action='store_true',
    help='Multi-user mode. Chat histories are not saved or automatically loaded. Warning: this is likely not safe for sharing publicly.',
)
parser.add_argument(
    '--lora',
    type=str,
    nargs='+',
    help='The list of LoRAs to load. If you want to load more than one LoRA, write the names separated by spaces.',
)
parser.add_argument(
    '--model-dir',
    type=str,
    default='models/',
    help='Path to directory with all the models.',
)
parser.add_argument(
    '--lora-dir',
    type=str,
    default='loras/',
    help='Path to directory with all the loras.',
)
parser.add_argument(
    '--settings',
    type=str,
    help='Load the default interface settings from this yaml file. See settings-template.yaml for an example. If you create a file called settings.yaml, this file will be loaded by default without the need to use the --settings flag.',
)
parser.add_argument(
    '--extensions',
    type=str,
    nargs='+',
    help='The list of extensions to load. If you want to load more than one extension, write the names separated by spaces.',
)
# Loader selection plus options that control how the model and tokenizer are
# loaded (memory limits, disk offload, caching, attention variants, safety).
parser.add_argument(
    '--loader',
    type=str,
    help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: transformers, exllama_hf, exllamav2_hf, exllama, exllamav2, autogptq, gptq-for-llama, llama.cpp, llamacpp_hf, ctransformers, autoawq.',
)
parser.add_argument(
    '--gpu-memory',
    type=str,
    nargs='+',
    help='Maximum GPU memory in GiB to be allocated per GPU. Example: --gpu-memory 10 for a single GPU, --gpu-memory 10 5 for two GPUs. You can also set values in MiB like --gpu-memory 3500MiB.',
)
parser.add_argument(
    '--disk',
    action='store_true',
    help='If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk.',
)
parser.add_argument(
    '--no-cache',
    action='store_true',
    help='Set use_cache to False while generating text. This reduces VRAM usage slightly, but it comes at a performance cost.',
)
parser.add_argument(
    '--xformers',
    action='store_true',
    help='Use xformer\'s memory efficient attention. This is really old and probably doesn\'t do anything.',
)
parser.add_argument(
    '--sdp-attention',
    action='store_true',
    help='Use PyTorch 2.0\'s SDP attention. Same as above.',
)
parser.add_argument(
    '--trust-remote-code',
    action='store_true',
    help='Set trust_remote_code=True while loading the model. Necessary for some models.',
)
parser.add_argument(
    '--force-safetensors',
    action='store_true',
    help='Set use_safetensors=True while loading the model. This prevents arbitrary code execution.',
)
# The underscore spelling of this flag is part of the command-line interface
# and is kept unchanged.
parser.add_argument(
    '--no_use_fast',
    action='store_true',
    help='Set use_fast=False while loading the tokenizer (it\'s True by default). Use this if you have any problems related to use_fast.',
)
# Multi-GPU split, NUMA, logits and prompt-cache options (the NUMA and
# cache-capacity help texts name llama.cpp / llama-cpp-python), followed by
# the ExLlama_HF CFG cache switch.
parser.add_argument(
    '--tensor_split',
    type=str,
    default=None,
    help='Split the model across multiple GPUs. Comma-separated list of proportions. Example: 18,17.',
)
parser.add_argument(
    '--numa',
    action='store_true',
    help='Activate NUMA task allocation for llama.cpp.',
)
parser.add_argument(
    '--logits_all',
    action='store_true',
    help='Needs to be set for perplexity evaluation to work. Otherwise, ignore it, as it makes prompt processing slower.',
)
parser.add_argument(
    '--cache-capacity',
    type=str,
    help='Maximum cache capacity (llama-cpp-python). Examples: 2000MiB, 2GiB. When provided without units, bytes will be assumed.',
)
parser.add_argument(
    '--cfg-cache',
    action='store_true',
    help='ExLlama_HF: Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader, but not necessary for CFG with base ExLlama.',
)
# Options for quantized models: fused attention/MLP toggles, desc_act,
# GPU layer allocation, checkpoint path and the LoRA monkey patch.
parser.add_argument(
    '--no_inject_fused_attention',
    action='store_true',
    help='Disable the use of fused attention, which will use less VRAM at the cost of slower inference.',
)
parser.add_argument(
    '--no_inject_fused_mlp',
    action='store_true',
    help='Triton mode only: disable the use of fused MLP, which will use less VRAM at the cost of slower inference.',
)
parser.add_argument(
    '--desc_act',
    action='store_true',
    help='For models that do not have a quantize_config.json, this parameter is used to define whether to set desc_act or not in BaseQuantizeConfig.',
)
# nargs='+' yields a list of ints, one entry per GPU.
parser.add_argument(
    '--pre_layer',
    type=int,
    nargs='+',
    help='The number of layers to allocate to the GPU. Setting this parameter enables CPU offloading for 4-bit models. For multi-gpu, write the numbers separated by spaces, eg --pre_layer 30 60.',
)
parser.add_argument(
    '--checkpoint',
    type=str,
    help='The path to the quantized checkpoint file. If not specified, it will be automatically detected.',
)
parser.add_argument(
    '--monkey-patch',
    action='store_true',
    help='Apply the monkey patch for using LoRAs with quantized models.',
)
# RWKV-specific options: loading strategy and CUDA kernel compilation.
parser.add_argument(
    '--rwkv-strategy',
    type=str,
    default=None,
    help='RWKV: The strategy to use while loading the model. Examples: "cpu fp32", "cuda fp16", "cuda fp16i8".',
)
parser.add_argument(
    '--rwkv-cuda-on',
    action='store_true',
    help='RWKV: Compile the CUDA kernel for better performance.',
)
# RoPE / positional-embedding scaling options.
# Note: argparse applies `type` only to string defaults, so when the flag is
# omitted args.alpha_value is the int 1 given here, not a float.
parser.add_argument(
    '--alpha_value',
    type=float,
    default=1,
    help='Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both.',
)
parser.add_argument(
    '--rope_freq_base',
    type=int,
    default=0,
    help='If greater than 0, will be used instead of alpha_value. Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63).',
)
parser.add_argument(
    '--compress_pos_emb',
    type=int,
    default=1,
    help="Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.",
)
# Gradio authentication and SSL options, followed by the admin API key.
parser.add_argument(
    '--gradio-auth',
    type=str,
    default=None,
    help='Set Gradio authentication password in the format "username:password". Multiple credentials can also be supplied with "u1:p1,u2:p2,u3:p3".',
)
parser.add_argument(
    '--gradio-auth-path',
    type=str,
    default=None,
    help='Set the Gradio authentication file path. The file should contain one or more user:password pairs in the same format as above.',
)
parser.add_argument(
    '--ssl-keyfile',
    type=str,
    default=None,
    help='The path to the SSL certificate key file.',
)
parser.add_argument(
    '--ssl-certfile',
    type=str,
    default=None,
    help='The path to the SSL certificate cert file.',
)
# Defaults to the empty string rather than None.
parser.add_argument(
    '--admin-key',
    type=str,
    default='',
    help='API authentication key for admin tasks like loading and unloading models. If not set, will be the same as --api-key.',
)
# Startup security warnings.
# NOTE(review): any condition guarding this first warning is not visible in
# this excerpt; as written here it is emitted unconditionally — confirm
# against the full file before relying on that.
logger.warning("\nYou are potentially exposing the web UI to the entire internet without any access password.\nYou can create one with the \"--gradio-auth\" flag like this:\n\n--gradio-auth username:password\n\nMake sure to replace username:password with your own.")

# Extra caution when multi-user mode is enabled. The original text read
# `ifargs.multi_user:` with an unindented body, which is not valid Python;
# the conditional is restored here.
if args.multi_user:
    logger.warning('\nThe multi-user mode is highly experimental and should not be shared publicly.')