mirror of
https://github.com/oobabooga/text-generation-webui.git
synced 2024-10-01 01:26:03 -04:00
Implement sending layers to disk with --disk (#10)
This commit is contained in:
parent
1ce95ee817
commit
7ace04864a
@@ -130,8 +130,9 @@ Optionally, you can use the following command-line flags:
|
||||
| `--chat` | Launch the web UI in chat mode.|
|
||||
| `--cai-chat` | Launch the web UI in chat mode with a style similar to Character.AI's. If the file profile.png or profile.jpg exists in the same folder as server.py, this image will be used as the bot's profile picture. |
|
||||
| `--cpu` | Use the CPU to generate text.|
|
||||
| `--auto-devices` | Automatically split the model across the available GPU(s) and CPU.|
|
||||
| `--load-in-8bit` | Load the model with 8-bit precision.|
|
||||
| `--auto-devices` | Automatically split the model across the available GPU(s) and CPU.|
|
||||
| `--disk` | If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk. |
|
||||
| `--max-gpu-memory MAX_GPU_MEMORY` | Maximum memory in GiB to allocate to the GPU when loading the model. This is useful if you get out of memory errors while trying to generate text. Must be an integer number. |
|
||||
| `--no-listen` | Make the web UI unreachable from your local network.|
|
||||
| `--no-stream` | Don't stream the text output in real time. This slightly improves the text generation performance.|
|
||||
|
@@ -21,8 +21,9 @@ parser.add_argument('--notebook', action='store_true', help='Launch the web UI i
|
||||
parser.add_argument('--chat', action='store_true', help='Launch the web UI in chat mode.')
|
||||
parser.add_argument('--cai-chat', action='store_true', help='Launch the web UI in chat mode with a style similar to Character.AI\'s. If the file profile.png or profile.jpg exists in the same folder as server.py, this image will be used as the bot\'s profile picture.')
|
||||
parser.add_argument('--cpu', action='store_true', help='Use the CPU to generate text.')
|
||||
parser.add_argument('--auto-devices', action='store_true', help='Automatically split the model across the available GPU(s) and CPU.')
|
||||
parser.add_argument('--load-in-8bit', action='store_true', help='Load the model with 8-bit precision.')
|
||||
parser.add_argument('--auto-devices', action='store_true', help='Automatically split the model across the available GPU(s) and CPU.')
|
||||
parser.add_argument('--disk', action='store_true', help='If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk.')
|
||||
parser.add_argument('--max-gpu-memory', type=int, help='Maximum memory in GiB to allocate to the GPU when loading the model. This is useful if you get out of memory errors while trying to generate text. Must be an integer number.')
|
||||
parser.add_argument('--no-listen', action='store_true', help='Make the web UI unreachable from your local network.')
|
||||
parser.add_argument('--no-stream', action='store_true', help='Don\'t stream the text output in real time. This slightly improves the text generation performance.')
|
||||
@@ -81,6 +82,8 @@ def load_model(model_name):
|
||||
settings.append("device_map='auto'")
|
||||
if args.max_gpu_memory is not None:
|
||||
settings.append(f"max_memory={{0: '{args.max_gpu_memory}GiB', 'cpu': '99GiB'}}")
|
||||
if args.disk:
|
||||
settings.append("offload_folder='cache'")
|
||||
if args.load_in_8bit:
|
||||
settings.append("load_in_8bit=True")
|
||||
else:
|
||||
@@ -167,6 +170,7 @@ def generate_reply(question, tokens, inference_settings, selected_model, eos_tok
|
||||
|
||||
# Generate the reply 1 token at a time
|
||||
else:
|
||||
yield formatted_outputs(question, model_name)
|
||||
input_ids = encode(question, 1)
|
||||
preset = preset.replace('max_new_tokens=tokens', 'max_new_tokens=1')
|
||||
for i in range(tokens):
|
||||
|
Loading…
Reference in New Issue
Block a user