From e6f44d6d192a4be8bed953a76fbd0130d04ca48b Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 15 Nov 2023 16:00:51 -0800
Subject: [PATCH] Print context length / instruction template to terminal when
 loading models

---
 extensions/openai/completions.py | 7 +------
 modules/models.py                | 8 ++++++++
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py
index 40a4cab6..6fd533b0 100644
--- a/extensions/openai/completions.py
+++ b/extensions/openai/completions.py
@@ -78,12 +78,7 @@ def process_parameters(body, is_legacy=False):
     max_tokens_str = 'length' if is_legacy else 'max_tokens'
     generate_params['max_new_tokens'] = body.pop(max_tokens_str)
     if generate_params['truncation_length'] == 0:
-        if shared.args.loader and shared.args.loader.lower().startswith('exllama'):
-            generate_params['truncation_length'] = shared.args.max_seq_len
-        elif shared.args.loader and shared.args.loader in ['llama.cpp', 'llamacpp_HF', 'ctransformers']:
-            generate_params['truncation_length'] = shared.args.n_ctx
-        else:
-            generate_params['truncation_length'] = shared.settings['truncation_length']
+        generate_params['truncation_length'] = shared.settings['truncation_length']

     if body['preset'] is not None:
         preset = load_preset_memoized(body['preset'])
diff --git a/modules/models.py b/modules/models.py
index e58d5770..70e14361 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -97,6 +97,13 @@ def load_model(model_name, loader=None):
             llama_attn_hijack.hijack_llama_attention()

     shared.settings.update({k: v for k, v in metadata.items() if k in shared.settings})
+    if loader.lower().startswith('exllama'):
+        shared.settings['truncation_length'] = shared.args.max_seq_len
+    elif loader in ['llama.cpp', 'llamacpp_HF', 'ctransformers']:
+        shared.settings['truncation_length'] = shared.args.n_ctx
+
+    logger.info(f"CONTEXT LENGTH: {shared.settings['truncation_length']}")
+    logger.info(f"INSTRUCTION TEMPLATE: {shared.settings['instruction_template']}")
     logger.info(f"Loaded the model in {(time.time()-t0):.2f} seconds.")

     return model, tokenizer
@@ -395,6 +402,7 @@ def get_max_memory_dict():
             total_mem = (torch.xpu.get_device_properties(0).total_memory / (1024 * 1024))
         else:
             total_mem = (torch.cuda.get_device_properties(0).total_memory / (1024 * 1024))
+
         suggestion = round((total_mem - 1000) / 1000) * 1000
         if total_mem - suggestion < 800:
             suggestion -= 1000