mirror of
https://github.com/oobabooga/text-generation-webui.git
synced 2024-10-01 01:26:03 -04:00
Update README
This commit is contained in:
parent
7e73058943
commit
5c5e7264ec
44
README.md
44
README.md
@ -204,16 +204,16 @@ List of command-line flags
|
|||||||
usage: server.py [-h] [--multi-user] [--character CHARACTER] [--model MODEL] [--lora LORA [LORA ...]] [--model-dir MODEL_DIR] [--lora-dir LORA_DIR] [--model-menu] [--settings SETTINGS]
|
usage: server.py [-h] [--multi-user] [--character CHARACTER] [--model MODEL] [--lora LORA [LORA ...]] [--model-dir MODEL_DIR] [--lora-dir LORA_DIR] [--model-menu] [--settings SETTINGS]
|
||||||
[--extensions EXTENSIONS [EXTENSIONS ...]] [--verbose] [--chat-buttons] [--idle-timeout IDLE_TIMEOUT] [--loader LOADER] [--cpu] [--auto-devices]
|
[--extensions EXTENSIONS [EXTENSIONS ...]] [--verbose] [--chat-buttons] [--idle-timeout IDLE_TIMEOUT] [--loader LOADER] [--cpu] [--auto-devices]
|
||||||
[--gpu-memory GPU_MEMORY [GPU_MEMORY ...]] [--cpu-memory CPU_MEMORY] [--disk] [--disk-cache-dir DISK_CACHE_DIR] [--load-in-8bit] [--bf16] [--no-cache] [--trust-remote-code]
|
[--gpu-memory GPU_MEMORY [GPU_MEMORY ...]] [--cpu-memory CPU_MEMORY] [--disk] [--disk-cache-dir DISK_CACHE_DIR] [--load-in-8bit] [--bf16] [--no-cache] [--trust-remote-code]
|
||||||
[--force-safetensors] [--no_use_fast] [--use_flash_attention_2] [--load-in-4bit] [--use_double_quant] [--compute_dtype COMPUTE_DTYPE] [--quant_type QUANT_TYPE] [--flash-attn]
|
[--force-safetensors] [--no_use_fast] [--use_flash_attention_2] [--use_eager_attention] [--load-in-4bit] [--use_double_quant] [--compute_dtype COMPUTE_DTYPE] [--quant_type QUANT_TYPE]
|
||||||
[--tensorcores] [--n_ctx N_CTX] [--threads THREADS] [--threads-batch THREADS_BATCH] [--no_mul_mat_q] [--n_batch N_BATCH] [--no-mmap] [--mlock] [--n-gpu-layers N_GPU_LAYERS]
|
[--flash-attn] [--tensorcores] [--n_ctx N_CTX] [--threads THREADS] [--threads-batch THREADS_BATCH] [--no_mul_mat_q] [--n_batch N_BATCH] [--no-mmap] [--mlock]
|
||||||
[--tensor_split TENSOR_SPLIT] [--numa] [--logits_all] [--no_offload_kqv] [--cache-capacity CACHE_CAPACITY] [--row_split] [--streaming-llm] [--attention-sink-size ATTENTION_SINK_SIZE]
|
[--n-gpu-layers N_GPU_LAYERS] [--tensor_split TENSOR_SPLIT] [--numa] [--logits_all] [--no_offload_kqv] [--cache-capacity CACHE_CAPACITY] [--row_split] [--streaming-llm]
|
||||||
[--gpu-split GPU_SPLIT] [--autosplit] [--max_seq_len MAX_SEQ_LEN] [--cfg-cache] [--no_flash_attn] [--cache_8bit] [--cache_4bit] [--num_experts_per_token NUM_EXPERTS_PER_TOKEN]
|
[--attention-sink-size ATTENTION_SINK_SIZE] [--gpu-split GPU_SPLIT] [--autosplit] [--max_seq_len MAX_SEQ_LEN] [--cfg-cache] [--no_flash_attn] [--no_xformers] [--no_sdpa]
|
||||||
[--triton] [--no_inject_fused_mlp] [--no_use_cuda_fp16] [--desc_act] [--disable_exllama] [--disable_exllamav2] [--wbits WBITS] [--groupsize GROUPSIZE] [--no_inject_fused_attention]
|
[--cache_8bit] [--cache_4bit] [--num_experts_per_token NUM_EXPERTS_PER_TOKEN] [--triton] [--no_inject_fused_mlp] [--no_use_cuda_fp16] [--desc_act] [--disable_exllama]
|
||||||
[--hqq-backend HQQ_BACKEND] [--deepspeed] [--nvme-offload-dir NVME_OFFLOAD_DIR] [--local_rank LOCAL_RANK] [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE]
|
[--disable_exllamav2] [--wbits WBITS] [--groupsize GROUPSIZE] [--no_inject_fused_attention] [--hqq-backend HQQ_BACKEND] [--cpp-runner] [--deepspeed]
|
||||||
[--compress_pos_emb COMPRESS_POS_EMB] [--listen] [--listen-port LISTEN_PORT] [--listen-host LISTEN_HOST] [--share] [--auto-launch] [--gradio-auth GRADIO_AUTH]
|
[--nvme-offload-dir NVME_OFFLOAD_DIR] [--local_rank LOCAL_RANK] [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE] [--compress_pos_emb COMPRESS_POS_EMB] [--listen]
|
||||||
[--gradio-auth-path GRADIO_AUTH_PATH] [--ssl-keyfile SSL_KEYFILE] [--ssl-certfile SSL_CERTFILE] [--api] [--public-api] [--public-api-id PUBLIC_API_ID] [--api-port API_PORT]
|
[--listen-port LISTEN_PORT] [--listen-host LISTEN_HOST] [--share] [--auto-launch] [--gradio-auth GRADIO_AUTH] [--gradio-auth-path GRADIO_AUTH_PATH] [--ssl-keyfile SSL_KEYFILE]
|
||||||
[--api-key API_KEY] [--admin-key ADMIN_KEY] [--nowebui] [--multimodal-pipeline MULTIMODAL_PIPELINE] [--model_type MODEL_TYPE] [--pre_layer PRE_LAYER [PRE_LAYER ...]]
|
[--ssl-certfile SSL_CERTFILE] [--subpath SUBPATH] [--api] [--public-api] [--public-api-id PUBLIC_API_ID] [--api-port API_PORT] [--api-key API_KEY] [--admin-key ADMIN_KEY] [--nowebui]
|
||||||
[--checkpoint CHECKPOINT] [--monkey-patch]
|
[--multimodal-pipeline MULTIMODAL_PIPELINE] [--model_type MODEL_TYPE] [--pre_layer PRE_LAYER [PRE_LAYER ...]] [--checkpoint CHECKPOINT] [--monkey-patch]
|
||||||
|
|
||||||
Text generation web UI
|
Text generation web UI
|
||||||
|
|
||||||
@ -254,6 +254,7 @@ Transformers/Accelerate:
|
|||||||
--force-safetensors Set use_safetensors=True while loading the model. This prevents arbitrary code execution.
|
--force-safetensors Set use_safetensors=True while loading the model. This prevents arbitrary code execution.
|
||||||
--no_use_fast Set use_fast=False while loading the tokenizer (it's True by default). Use this if you have any problems related to use_fast.
|
--no_use_fast Set use_fast=False while loading the tokenizer (it's True by default). Use this if you have any problems related to use_fast.
|
||||||
--use_flash_attention_2 Set use_flash_attention_2=True while loading the model.
|
--use_flash_attention_2 Set use_flash_attention_2=True while loading the model.
|
||||||
|
--use_eager_attention Set attn_implementation=eager while loading the model.
|
||||||
|
|
||||||
bitsandbytes 4-bit:
|
bitsandbytes 4-bit:
|
||||||
--load-in-4bit Load the model with 4-bit precision (using bitsandbytes).
|
--load-in-4bit Load the model with 4-bit precision (using bitsandbytes).
|
||||||
@ -263,7 +264,7 @@ bitsandbytes 4-bit:
|
|||||||
|
|
||||||
llama.cpp:
|
llama.cpp:
|
||||||
--flash-attn Use flash-attention.
|
--flash-attn Use flash-attention.
|
||||||
--tensorcores Use llama-cpp-python compiled with tensor cores support. This increases performance on RTX cards. NVIDIA only.
|
--tensorcores NVIDIA only: use llama-cpp-python compiled with tensor cores support. This may increase performance on newer cards.
|
||||||
--n_ctx N_CTX Size of the prompt context.
|
--n_ctx N_CTX Size of the prompt context.
|
||||||
--threads THREADS Number of threads to use.
|
--threads THREADS Number of threads to use.
|
||||||
--threads-batch THREADS_BATCH Number of threads to use for batches/prompt processing.
|
--threads-batch THREADS_BATCH Number of threads to use for batches/prompt processing.
|
||||||
@ -272,7 +273,7 @@ llama.cpp:
|
|||||||
--no-mmap Prevent mmap from being used.
|
--no-mmap Prevent mmap from being used.
|
||||||
--mlock Force the system to keep the model in RAM.
|
--mlock Force the system to keep the model in RAM.
|
||||||
--n-gpu-layers N_GPU_LAYERS Number of layers to offload to the GPU.
|
--n-gpu-layers N_GPU_LAYERS Number of layers to offload to the GPU.
|
||||||
--tensor_split TENSOR_SPLIT Split the model across multiple GPUs. Comma-separated list of proportions. Example: 18,17.
|
--tensor_split TENSOR_SPLIT Split the model across multiple GPUs. Comma-separated list of proportions. Example: 60,40.
|
||||||
--numa Activate NUMA task allocation for llama.cpp.
|
--numa Activate NUMA task allocation for llama.cpp.
|
||||||
--logits_all Needs to be set for perplexity evaluation to work. Otherwise, ignore it, as it makes prompt processing slower.
|
--logits_all Needs to be set for perplexity evaluation to work. Otherwise, ignore it, as it makes prompt processing slower.
|
||||||
--no_offload_kqv Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.
|
--no_offload_kqv Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.
|
||||||
@ -287,6 +288,8 @@ ExLlamaV2:
|
|||||||
--max_seq_len MAX_SEQ_LEN Maximum sequence length.
|
--max_seq_len MAX_SEQ_LEN Maximum sequence length.
|
||||||
--cfg-cache ExLlamav2_HF: Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader.
|
--cfg-cache ExLlamav2_HF: Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader.
|
||||||
--no_flash_attn Force flash-attention to not be used.
|
--no_flash_attn Force flash-attention to not be used.
|
||||||
|
--no_xformers Force xformers to not be used.
|
||||||
|
--no_sdpa Force Torch SDPA to not be used.
|
||||||
--cache_8bit Use 8-bit cache to save VRAM.
|
--cache_8bit Use 8-bit cache to save VRAM.
|
||||||
--cache_4bit Use Q4 cache to save VRAM.
|
--cache_4bit Use Q4 cache to save VRAM.
|
||||||
--num_experts_per_token NUM_EXPERTS_PER_TOKEN Number of experts to use for generation. Applies to MoE models like Mixtral.
|
--num_experts_per_token NUM_EXPERTS_PER_TOKEN Number of experts to use for generation. Applies to MoE models like Mixtral.
|
||||||
@ -307,6 +310,9 @@ AutoAWQ:
|
|||||||
HQQ:
|
HQQ:
|
||||||
--hqq-backend HQQ_BACKEND Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN.
|
--hqq-backend HQQ_BACKEND Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN.
|
||||||
|
|
||||||
|
TensorRT-LLM:
|
||||||
|
--cpp-runner Use the ModelRunnerCpp runner, which is faster than the default ModelRunner but doesn't support streaming yet.
|
||||||
|
|
||||||
DeepSpeed:
|
DeepSpeed:
|
||||||
--deepspeed Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration.
|
--deepspeed Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration.
|
||||||
--nvme-offload-dir NVME_OFFLOAD_DIR DeepSpeed: Directory to use for ZeRO-3 NVME offloading.
|
--nvme-offload-dir NVME_OFFLOAD_DIR DeepSpeed: Directory to use for ZeRO-3 NVME offloading.
|
||||||
@ -327,6 +333,7 @@ Gradio:
|
|||||||
--gradio-auth-path GRADIO_AUTH_PATH Set the Gradio authentication file path. The file should contain one or more user:password pairs in the same format as above.
|
--gradio-auth-path GRADIO_AUTH_PATH Set the Gradio authentication file path. The file should contain one or more user:password pairs in the same format as above.
|
||||||
--ssl-keyfile SSL_KEYFILE The path to the SSL certificate key file.
|
--ssl-keyfile SSL_KEYFILE The path to the SSL certificate key file.
|
||||||
--ssl-certfile SSL_CERTFILE The path to the SSL certificate cert file.
|
--ssl-certfile SSL_CERTFILE The path to the SSL certificate cert file.
|
||||||
|
--subpath SUBPATH Customize the subpath for Gradio; intended for use behind a reverse proxy.
|
||||||
|
|
||||||
API:
|
API:
|
||||||
--api Enable the API extension.
|
--api Enable the API extension.
|
||||||
@ -392,18 +399,11 @@ Run `python download-model.py --help` to see all the options.
|
|||||||
|
|
||||||
https://colab.research.google.com/github/oobabooga/text-generation-webui/blob/main/Colab-TextGen-GPU.ipynb
|
https://colab.research.google.com/github/oobabooga/text-generation-webui/blob/main/Colab-TextGen-GPU.ipynb
|
||||||
|
|
||||||
## Acknowledgment
|
## Community
|
||||||
|
|
||||||
In August 2023, [Andreessen Horowitz](https://a16z.com/) (a16z) provided a generous grant to encourage and support my independent work on this project. I am **extremely** grateful for their trust and recognition.
|
|
||||||
|
|
||||||
## Links
|
|
||||||
|
|
||||||
#### Community
|
|
||||||
|
|
||||||
* Subreddit: https://www.reddit.com/r/oobabooga/
|
* Subreddit: https://www.reddit.com/r/oobabooga/
|
||||||
* Discord: https://discord.gg/jwZCF2dPQN
|
* Discord: https://discord.gg/jwZCF2dPQN
|
||||||
|
|
||||||
#### Support
|
## Acknowledgment
|
||||||
|
|
||||||
* ko-fi: https://ko-fi.com/oobabooga
|
In August 2023, [Andreessen Horowitz](https://a16z.com/) (a16z) provided a generous grant to encourage and support my independent work on this project. I am **extremely** grateful for their trust and recognition.
|
||||||
* GitHub Sponsors: https://github.com/sponsors/oobabooga
|
|
||||||
|
Loading…
Reference in New Issue
Block a user