# Context-length and RoPE-scaling sliders. Each slider takes its initial
# value from the same-named attribute of shared.args, and the two context
# sliders are capped at shared.settings['truncation_length_max'].
shared.gradio['n_ctx'] = gr.Slider(
    label="n_ctx",
    minimum=0,
    maximum=shared.settings['truncation_length_max'],
    step=256,
    value=shared.args.n_ctx,
    info=(
        'Context length. Try lowering this if you run out of memory '
        'while loading the model.'
    ),
)
shared.gradio['max_seq_len'] = gr.Slider(
    label='max_seq_len',
    minimum=0,
    maximum=shared.settings['truncation_length_max'],
    step=256,
    value=shared.args.max_seq_len,
    info=(
        'Context length. Try lowering this if you run out of memory '
        'while loading the model.'
    ),
)
shared.gradio['alpha_value'] = gr.Slider(
    label='alpha_value',
    minimum=1,
    maximum=8,
    step=0.05,
    value=shared.args.alpha_value,
    info=(
        'Positional embeddings alpha factor for NTK RoPE scaling. '
        'Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. '
        'Use either this or compress_pos_emb, not both.'
    ),
)
shared.gradio['rope_freq_base'] = gr.Slider(
    label='rope_freq_base',
    minimum=0,
    maximum=1000000,
    step=1000,
    value=shared.args.rope_freq_base,
    info=(
        'If greater than 0, will be used instead of alpha_value. '
        'Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63)'
    ),
)
shared.gradio['compress_pos_emb'] = gr.Slider(
    label='compress_pos_emb',
    minimum=1,
    maximum=8,
    step=1,
    value=shared.args.compress_pos_emb,
    info=(
        'Positional embeddings compression factor. '
        "Should be set to (context length) / (model's original context length). "
        'Equal to 1/rope_freq_scale.'
    ),
)
# Loader option widgets. Every widget that passes `value` seeds it from the
# same-named attribute of shared.args; tensor_split is created without an
# initial value.
shared.gradio['tensorcores'] = gr.Checkbox(
    label="tensorcores",
    value=shared.args.tensorcores,
    info=(
        'Use llama-cpp-python compiled with tensor cores support. '
        'This increases performance on RTX cards. NVIDIA only.'
    ),
)
shared.gradio['no_inject_fused_attention'] = gr.Checkbox(
    label="no_inject_fused_attention",
    value=shared.args.no_inject_fused_attention,
    info=(
        'Disable fused attention. '
        'Fused attention improves inference performance but uses more VRAM. '
        'Fuses layers for AutoAWQ. Disable if running low on VRAM.'
    ),
)
shared.gradio['no_inject_fused_mlp'] = gr.Checkbox(
    label="no_inject_fused_mlp",
    value=shared.args.no_inject_fused_mlp,
    info=(
        'Affects Triton only. Disable fused MLP. '
        'Fused MLP improves performance but uses more VRAM. '
        'Disable if running low on VRAM.'
    ),
)
shared.gradio['no_use_cuda_fp16'] = gr.Checkbox(
    label="no_use_cuda_fp16",
    value=shared.args.no_use_cuda_fp16,
    info='This can make models faster on some systems.',
)
shared.gradio['desc_act'] = gr.Checkbox(
    label="desc_act",
    value=shared.args.desc_act,
    info=(
        "'desc_act', 'wbits', and 'groupsize' are used for old models "
        "without a quantize_config.json."
    ),
)
shared.gradio['no_offload_kqv'] = gr.Checkbox(
    label="no_offload_kqv",
    value=shared.args.no_offload_kqv,
    info=(
        'Do not offload the K, Q, V to the GPU. '
        'This saves VRAM but reduces the performance.'
    ),
)
shared.gradio['tensor_split'] = gr.Textbox(
    label='tensor_split',
    info=(
        'Split the model across multiple GPUs, '
        'comma-separated list of proportions, e.g. 18,17'
    ),
)
# The checkbox is only interactive when shared.args.trust_remote_code is
# already truthy, so it cannot be switched on from the UI alone.
shared.gradio['trust_remote_code'] = gr.Checkbox(
    label="trust-remote-code",
    value=shared.args.trust_remote_code,
    interactive=shared.args.trust_remote_code,
    info=(
        'To enable this option, start the web UI with the '
        '--trust-remote-code flag. It is necessary for some models.'
    ),
)
shared.gradio['logits_all'] = gr.Checkbox(
    label="logits_all",
    value=shared.args.logits_all,
    info=(
        'Needs to be set for perplexity evaluation to work. '
        'Otherwise, ignore it, as it makes prompt processing slower.'
    ),
)
shared.gradio['use_flash_attention_2'] = gr.Checkbox(
    label="use_flash_attention_2",
    value=shared.args.use_flash_attention_2,
    info='Set use_flash_attention_2=True while loading the model.',
)
shared.gradio['num_experts_per_token'] = gr.Number(
    label="Number of experts per token",
    value=shared.args.num_experts_per_token,
    info='Only applies to MoE models like Mixtral.',
)
# Static Markdown notes, one per loader, registered under '<loader>_info' keys.
shared.gradio['gptq_for_llama_info'] = gr.Markdown(
    'Legacy loader for compatibility with older GPUs. '
    'ExLlama_HF or AutoGPTQ are preferred for GPTQ models when supported.'
)
shared.gradio['exllama_info'] = gr.Markdown(
    "ExLlama_HF is recommended over ExLlama for better integration with "
    "extensions and more consistent sampling behavior across loaders."
)
shared.gradio['exllamav2_info'] = gr.Markdown(
    "ExLlamav2_HF is recommended over ExLlamav2 for better integration with "
    "extensions and more consistent sampling behavior across loaders."
)
# The '\n\n' sequences separate the introduction and the two tokenizer
# options into distinct Markdown paragraphs.
shared.gradio['llamacpp_HF_info'] = gr.Markdown(
    'llamacpp_HF loads llama.cpp as a Transformers model. '
    'To use it, you need to download a tokenizer.\n\n'
    'Option 1 (recommended): place your .gguf in a subfolder of models/ '
    'along with these 4 files: special_tokens_map.json, '
    'tokenizer_config.json, tokenizer.json, tokenizer.model.\n\n'
    'Option 2: download `oobabooga/llama-tokenizer` under '
    '"Download model or LoRA". '
    "That's a default Llama tokenizer that will work for some "
    '(but not all) models.'
)
# Autoload toggle and model-download inputs. All three widgets take their
# `interactive` state from `notmu`, which is defined outside this excerpt.
shared.gradio['autoload_model'] = gr.Checkbox(
    label='Autoload the model',
    value=shared.settings['autoload_model'],
    interactive=notmu,
    info=(
        'Whether to load the model as soon as it is selected '
        'in the Model dropdown.'
    ),
)
shared.gradio['custom_model_menu'] = gr.Textbox(
    label="Download model or LoRA",
    interactive=notmu,
    info=(
        'Enter the Hugging Face username/model path, for instance: '
        'facebook/galactica-125m. To specify a branch, add it at the end '
        'after a ":" character like this: facebook/galactica-125m:main. '
        'To download a single file, enter its name in the second box.'
    ),
)
# Unlabelled single-line box for an optional file name.
shared.gradio['download_specific_file'] = gr.Textbox(
    placeholder="File name (for GGUF models)",
    show_label=False,
    max_lines=1,
    interactive=notmu,
)
# Append a note that names settings['instruction_template'] and points the
# user at the instruct / chat-instruct modes.
output += (
    '\n\nIt seems to be an instruction-following model with template "{}". '
    'In the chat tab, instruct or chat-instruct modes should be used.'
).format(settings['instruction_template'])