add n_batch support for llama.cpp (#1115)

2024-10-01 01:26:03 -04:00 · 2023-04-24 02:46:18 -04:00 · 2023-04-24 02:46:18 -04:00 · 78d1977ebf
commit 78d1977ebf
parent 2f6e2ddeac
3 changed files with 4 additions and 1 deletion
--- a/README.md
+++ b/README.md
@@ -220,6 +220,7 @@ Optionally, you can use the following command-line flags:
 | Flag        | Description |
 |-------------|-------------|
 | `--threads` | Number of threads to use in llama.cpp. |
+| `--n_batch` | Processing batch size for llama.cpp. |

 #### GPTQ

--- a/modules/llamacpp_model_alternative.py
+++ b/modules/llamacpp_model_alternative.py
@@ -24,7 +24,8 @@ class LlamaCppModel:
            'model_path': str(path),
            'n_ctx': 2048,
            'seed': 0,
-            'n_threads': shared.args.threads or None
+            'n_threads': shared.args.threads or None,
+            'n_batch': shared.args.n_batch
        }
        self.model = Llama(**params)
        self.model.set_cache(LlamaCache)
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -119,6 +119,7 @@ parser.add_argument('--trust-remote-code', action='store_true', help="Set trust_

 # llama.cpp
 parser.add_argument('--threads', type=int, default=0, help='Number of threads to use in llama.cpp.')
+parser.add_argument('--n_batch', type=int, default=8, help='Processing batch size for llama.cpp.')

 # GPTQ
 parser.add_argument('--wbits', type=int, default=0, help='Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported.')