From f01b9aa71fb85ce59e10019c4e57005e4d66e970 Mon Sep 17 00:00:00 2001
From: saltacc
Date: Fri, 15 Sep 2023 14:27:27 -0700
Subject: [PATCH] Add customizable ban tokens (#3899)

---
 api-examples/api-example-chat-stream.py  |  1 +
 api-examples/api-example-chat.py         |  1 +
 api-examples/api-example-stream.py       |  1 +
 api-examples/api-example.py              |  1 +
 extensions/api/util.py                   |  1 +
 extensions/openai/defaults.py            |  1 +
 modules/exllama.py                       |  5 +++++
 modules/exllamav2.py                     |  7 ++++++-
 modules/llamacpp_model.py                | 20 +++++++++++++++++---
 modules/loaders.py                       |  9 +++++++++
 modules/presets.py                       |  1 +
 modules/shared.py                        |  1 +
 modules/text_generation.py               |  8 ++++++++
 modules/ui.py                            |  1 +
 modules/ui_parameters.py                 |  2 +-
 settings-template.yaml                   |  1 +
 16 files changed, 56 insertions(+), 5 deletions(-)

diff --git a/api-examples/api-example-chat-stream.py b/api-examples/api-example-chat-stream.py
index 5670d4cf..bf4201ca 100644
--- a/api-examples/api-example-chat-stream.py
+++ b/api-examples/api-example-chat-stream.py
@@ -70,6 +70,7 @@ async def run(user_input, history):
         'add_bos_token': True,
         'truncation_length': 2048,
         'ban_eos_token': False,
+        'custom_token_bans': '',
         'skip_special_tokens': True,
         'stopping_strings': []
     }
diff --git a/api-examples/api-example-chat.py b/api-examples/api-example-chat.py
index 26c69b73..42ba0a62 100644
--- a/api-examples/api-example-chat.py
+++ b/api-examples/api-example-chat.py
@@ -64,6 +64,7 @@ def run(user_input, history):
         'add_bos_token': True,
         'truncation_length': 2048,
         'ban_eos_token': False,
+        'custom_token_bans': '',
         'skip_special_tokens': True,
         'stopping_strings': []
     }
diff --git a/api-examples/api-example-stream.py b/api-examples/api-example-stream.py
index c042a50b..53822162 100644
--- a/api-examples/api-example-stream.py
+++ b/api-examples/api-example-stream.py
@@ -53,6 +53,7 @@ async def run(context):
         'add_bos_token': True,
         'truncation_length': 2048,
         'ban_eos_token': False,
+        'custom_token_bans': '',
         'skip_special_tokens': True,
         'stopping_strings': []
     }
diff --git a/api-examples/api-example.py b/api-examples/api-example.py
index 47362754..e6d79f9b 100644
--- a/api-examples/api-example.py
+++ b/api-examples/api-example.py
@@ -45,6 +45,7 @@ def run(prompt):
         'add_bos_token': True,
         'truncation_length': 2048,
         'ban_eos_token': False,
+        'custom_token_bans': '',
         'skip_special_tokens': True,
         'stopping_strings': []
     }
diff --git a/extensions/api/util.py b/extensions/api/util.py
index 6d0cb170..499706ca 100644
--- a/extensions/api/util.py
+++ b/extensions/api/util.py
@@ -49,6 +49,7 @@ def build_parameters(body, chat=False):
         'seed': int(body.get('seed', -1)),
         'add_bos_token': bool(body.get('add_bos_token', True)),
         'truncation_length': int(body.get('truncation_length', body.get('max_context_length', 2048))),
+        'custom_token_bans': str(body.get('custom_token_bans', '')),
         'ban_eos_token': bool(body.get('ban_eos_token', False)),
         'skip_special_tokens': bool(body.get('skip_special_tokens', True)),
         'custom_stopping_strings': '',  # leave this blank
diff --git a/extensions/openai/defaults.py b/extensions/openai/defaults.py
index c6a6adfd..052862f7 100644
--- a/extensions/openai/defaults.py
+++ b/extensions/openai/defaults.py
@@ -37,6 +37,7 @@ default_req_params = {
     'guidance_scale': 1,
     'negative_prompt': '',
     'ban_eos_token': False,
+    'custom_token_bans': '',
     'skip_special_tokens': True,
     'custom_stopping_strings': '',
     # 'logits_processor' - conditionally passed
diff --git a/modules/exllama.py b/modules/exllama.py
index c9ff1228..177f028f 100644
--- a/modules/exllama.py
+++ b/modules/exllama.py
@@ -108,6 +108,11 @@ class ExllamaModel:
         else:
             self.generator.disallow_tokens(None)
 
+        if state['custom_token_bans']:
+            to_ban = [int(x) for x in state['custom_token_bans'].split(',')]
+            if len(to_ban) > 0:
+                self.generator.disallow_tokens(self.tokenizer, to_ban)
+
         # Case 1: no CFG
         if state['guidance_scale'] == 1:
             self.generator.end_beam_search()
diff --git a/modules/exllamav2.py b/modules/exllamav2.py
index 6d4603c5..a325a4d3 100644
--- a/modules/exllamav2.py
+++ b/modules/exllamav2.py
@@ -30,7 +30,7 @@ class Exllamav2Model:
         config.max_seq_len = shared.args.max_seq_len
         config.scale_pos_emb = shared.args.compress_pos_emb
         config.scale_alpha_value = shared.args.alpha_value
-
+
         model = ExLlamaV2(config)
 
         split = None
@@ -60,6 +60,11 @@ class Exllamav2Model:
         if state['ban_eos_token']:
             settings.disallow_tokens(self.tokenizer, [self.tokenizer.eos_token_id])
 
+        if state['custom_token_bans']:
+            to_ban = [int(x) for x in state['custom_token_bans'].split(',')]
+            if len(to_ban) > 0:
+                settings.disallow_tokens(self.tokenizer, to_ban)
+
         ids = self.tokenizer.encode(prompt)
         ids = ids[:, -get_max_prompt_length(state):]
         initial_len = ids.shape[-1]
diff --git a/modules/llamacpp_model.py b/modules/llamacpp_model.py
index f09ca505..5db6e27e 100644
--- a/modules/llamacpp_model.py
+++ b/modules/llamacpp_model.py
@@ -31,6 +31,13 @@ def ban_eos_logits_processor(eos_token, input_ids, logits):
     return logits
 
 
+def custom_token_ban_logits_processor(token_ids, input_ids, logits):
+    for token_id in token_ids:
+        logits[token_id] = -float('inf')
+
+    return logits
+
+
 class LlamaCppModel:
     def __init__(self):
         self.initialized = False
@@ -104,6 +111,15 @@ class LlamaCppModel:
         prompt = prompt[-get_max_prompt_length(state):]
         prompt = self.decode(prompt).decode('utf-8')
 
+        logit_processors = LogitsProcessorList()
+        if state['ban_eos_token']:
+            logit_processors.append(partial(ban_eos_logits_processor, self.model.tokenizer.eos_token_id))
+
+        if state['custom_token_bans']:
+            to_ban = [int(x) for x in state['custom_token_bans'].split(',')]
+            if len(to_ban) > 0:
+                logit_processors.append(partial(custom_token_ban_logits_processor, to_ban))
+
         completion_chunks = self.model.create_completion(
             prompt=prompt,
             max_tokens=state['max_new_tokens'],
@@ -116,9 +132,7 @@ class LlamaCppModel:
             mirostat_tau=state['mirostat_tau'],
             mirostat_eta=state['mirostat_eta'],
             stream=True,
-            logits_processor=LogitsProcessorList([
-                partial(ban_eos_logits_processor, self.model.token_eos()),
-            ]) if state['ban_eos_token'] else None,
+            logits_processor=logit_processors,
         )
 
         output = ""
diff --git a/modules/loaders.py b/modules/loaders.py
index ff2f5050..b7187e5f 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -150,6 +150,7 @@ loaders_samplers = {
         'guidance_scale',
         'negative_prompt',
         'ban_eos_token',
+        'custom_token_bans',
         'add_bos_token',
         'skip_special_tokens',
         'auto_max_new_tokens',
@@ -176,6 +177,7 @@ loaders_samplers = {
         'guidance_scale',
         'negative_prompt',
         'ban_eos_token',
+        'custom_token_bans',
         'add_bos_token',
         'skip_special_tokens',
         'auto_max_new_tokens',
@@ -191,6 +193,7 @@ loaders_samplers = {
         'guidance_scale',
         'negative_prompt',
         'ban_eos_token',
+        'custom_token_bans',
         'auto_max_new_tokens',
     },
     'ExLlamav2': {
@@ -201,6 +204,7 @@ loaders_samplers = {
         'repetition_penalty_range',
         'seed',
         'ban_eos_token',
+        'custom_token_bans',
         'auto_max_new_tokens',
     },
     'ExLlamav2_HF': {
@@ -225,6 +229,7 @@ loaders_samplers = {
         'guidance_scale',
         'negative_prompt',
         'ban_eos_token',
+        'custom_token_bans',
         'add_bos_token',
         'skip_special_tokens',
         'auto_max_new_tokens',
@@ -255,6 +260,7 @@ loaders_samplers = {
         'guidance_scale',
         'negative_prompt',
         'ban_eos_token',
+        'custom_token_bans',
         'add_bos_token',
         'skip_special_tokens',
         'auto_max_new_tokens',
@@ -285,6 +291,7 @@ loaders_samplers = {
         'guidance_scale',
         'negative_prompt',
         'ban_eos_token',
+        'custom_token_bans',
         'add_bos_token',
         'skip_special_tokens',
         'auto_max_new_tokens',
@@ -299,6 +306,7 @@ loaders_samplers = {
         'mirostat_tau',
         'mirostat_eta',
         'ban_eos_token',
+        'custom_token_bans',
     },
     'llamacpp_HF': {
         'temperature',
@@ -322,6 +330,7 @@ loaders_samplers = {
         'guidance_scale',
         'negative_prompt',
         'ban_eos_token',
+        'custom_token_bans',
         'add_bos_token',
         'skip_special_tokens',
         'auto_max_new_tokens',
diff --git a/modules/presets.py b/modules/presets.py
index 32b7f71c..96d6e994 100644
--- a/modules/presets.py
+++ b/modules/presets.py
@@ -28,6 +28,7 @@ def default_preset():
         'num_beams': 1,
         'length_penalty': 1,
         'early_stopping': False,
+        'custom_token_bans': '',
     }
 
 
diff --git a/modules/shared.py b/modules/shared.py
index 2555eca4..30fa1393 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -49,6 +49,7 @@ settings = {
     'auto_max_new_tokens': False,
     'max_tokens_second': 0,
     'ban_eos_token': False,
+    'custom_token_bans': '',
     'add_bos_token': True,
     'skip_special_tokens': True,
     'stream': True,
diff --git a/modules/text_generation.py b/modules/text_generation.py
index 67833d8c..98682bb2 100644
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -266,6 +266,14 @@ def generate_reply_HF(question, original_question, seed, state, stopping_strings
     if state['ban_eos_token']:
         generate_params['suppress_tokens'] = [shared.tokenizer.eos_token_id]
 
+    if state['custom_token_bans']:
+        to_ban = [int(x) for x in state['custom_token_bans'].split(',')]
+        if len(to_ban) > 0:
+            if generate_params.get('suppress_tokens', None):
+                generate_params['suppress_tokens'] += to_ban
+            else:
+                generate_params['suppress_tokens'] = to_ban
+
     generate_params.update({'use_cache': not shared.args.no_cache})
     if shared.args.deepspeed:
         generate_params.update({'synced_gpus': True})
diff --git a/modules/ui.py b/modules/ui.py
index 790bc3b5..0a19b231 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -118,6 +118,7 @@ def list_interface_input_elements():
        'guidance_scale',
        'add_bos_token',
        'ban_eos_token',
+       'custom_token_bans',
        'truncation_length',
        'custom_stopping_strings',
        'skip_special_tokens',
diff --git a/modules/ui_parameters.py b/modules/ui_parameters.py
index 169ab500..32fb1c02 100644
--- a/modules/ui_parameters.py
+++ b/modules/ui_parameters.py
@@ -118,8 +118,8 @@ def create_ui(default_preset):
                 with gr.Column():
                     shared.gradio['auto_max_new_tokens'] = gr.Checkbox(value=shared.settings['auto_max_new_tokens'], label='auto_max_new_tokens', info='Expand max_new_tokens to the available context length.')
                     shared.gradio['ban_eos_token'] = gr.Checkbox(value=shared.settings['ban_eos_token'], label='Ban the eos_token', info='Forces the model to never end the generation prematurely.')
+                    shared.gradio['custom_token_bans'] = gr.Textbox(value=shared.settings['custom_token_bans'] or None, label='Custom token bans', info='Specific token IDs to ban from generating, comma-separated. The IDs can be found in a tokenizer.json file.')
                     shared.gradio['add_bos_token'] = gr.Checkbox(value=shared.settings['add_bos_token'], label='Add the bos_token to the beginning of prompts', info='Disabling this can make the replies more creative.')
-                    shared.gradio['skip_special_tokens'] = gr.Checkbox(value=shared.settings['skip_special_tokens'], label='Skip special tokens', info='Some specific models need this unset.')
                     shared.gradio['stream'] = gr.Checkbox(value=shared.settings['stream'], label='Activate text streaming')
diff --git a/settings-template.yaml b/settings-template.yaml
index d4a3c709..66d98d39 100644
--- a/settings-template.yaml
+++ b/settings-template.yaml
@@ -19,6 +19,7 @@ custom_stopping_strings: ''
 auto_max_new_tokens: false
 max_tokens_second: 0
 ban_eos_token: false
+custom_token_bans: ''
 add_bos_token: true
 skip_special_tokens: true
 stream: true
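
Usage note: a minimal sketch of how the new 'custom_token_bans' field can be sent through the blocking API, in the style of api-examples/api-example.py. The endpoint URL, the response shape, and the token IDs 2 and 13 below are illustrative assumptions rather than values taken from this patch; real IDs can be looked up in the model's tokenizer.json.

import requests

# Only the fields relevant to token banning are set here; every other generation
# parameter falls back to the server-side defaults filled in by build_parameters().
request = {
    'prompt': 'Write a haiku about autumn.',
    'max_new_tokens': 80,
    'ban_eos_token': False,
    'custom_token_bans': '2,13',  # comma-separated token IDs (placeholders) to exclude from sampling
    'skip_special_tokens': True,
    'stopping_strings': []
}

# Assumes the webui's blocking API is listening on its default local address.
response = requests.post('http://localhost:5000/api/v1/generate', json=request)
if response.status_code == 200:
    print(response.json()['results'][0]['text'])

Because the server splits the string on commas and converts each piece with int(), a value such as '2, 13' (with a space) is accepted as well, and an empty string leaves token banning disabled.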