diff --git a/modules/models.py b/modules/models.py index 2ec2a579..ca292ab0 100644 --- a/modules/models.py +++ b/modules/models.py @@ -15,7 +15,7 @@ from transformers import (AutoConfig, AutoModel, AutoModelForCausalLM, BitsAndBytesConfig, LlamaTokenizer) import modules.shared as shared -from modules import llama_attn_hijack +from modules import llama_attn_hijack, sampler_hijack from modules.logging_colors import logger transformers.logging.set_verbosity_error() @@ -36,6 +36,8 @@ if shared.args.deepspeed: ds_config = generate_ds_config(shared.args.bf16, 1 * world_size, shared.args.nvme_offload_dir) dschf = HfDeepSpeedConfig(ds_config) # Keep this object alive for the Transformers integration +sampler_hijack.hijack_samplers() + # Some models require special treatment in various parts of the code. # This function detects those models diff --git a/modules/sampler_hijack.py b/modules/sampler_hijack.py new file mode 100644 index 00000000..f02bea4f --- /dev/null +++ b/modules/sampler_hijack.py @@ -0,0 +1,102 @@ +import torch +import transformers +from transformers import LogitsWarper +from transformers.generation.logits_process import LogitNormalization, LogitsProcessorList + + +class TailFreeLogitsWarper(LogitsWarper): + def __init__(self, tfs: float, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1): + tfs = float(tfs) + if tfs < 0 or tfs > 1.0: + raise ValueError(f"`tfs` has to be a float >= 0 and <= 1, but is {tfs}") + self.tfs = tfs + self.filter_value = filter_value + self.min_tokens_to_keep = min_tokens_to_keep + + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: + sorted_logits, sorted_indices = torch.sort(scores, descending=True) + probs = sorted_logits.softmax(dim=-1) + + # Compute second derivative normalized CDF + d2 = probs.diff().diff().abs() + normalized_d2 = d2 / d2.sum(dim=-1, keepdim=True) + normalized_d2_cdf = normalized_d2.cumsum(dim=-1) + + # Remove tokens with CDF value above the threshold (token with 0 are kept) + sorted_indices_to_remove = normalized_d2_cdf > self.tfs + + # Centre the distribution around the cutoff as in the original implementation of the algorithm + sorted_indices_to_remove = torch.cat( + ( + torch.zeros(scores.shape[0], 1, dtype=torch.bool, device=scores.device), + sorted_indices_to_remove, + torch.ones(scores.shape[0], 1, dtype=torch.bool, device=scores.device), + ), + dim=-1, + ) + + if self.min_tokens_to_keep > 1: + # Keep at least min_tokens_to_keep + sorted_indices_to_remove[..., : self.min_tokens_to_keep] = 0 + + indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove) + scores = scores.masked_fill(indices_to_remove, self.filter_value) + return scores + + +class TopALogitsWarper(LogitsWarper): + def __init__(self, top_a: float, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1): + top_a = float(top_a) + if top_a < 0 or top_a > 1.0: + raise ValueError(f"`top_a` has to be a float >= 0 and <= 1, but is {top_a}") + self.top_a = top_a + self.filter_value = filter_value + self.min_tokens_to_keep = min_tokens_to_keep + + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: + sorted_logits, sorted_indices = torch.sort(scores, descending=True) + probs = sorted_logits.softmax(dim=-1) + + # Remove tokens with probability less than top_a*(max(probs))^2 (token with 0 are kept) + probs_max = probs[..., 0, None] + sorted_indices_to_remove = probs < probs_max * probs_max * self.top_a + + if self.min_tokens_to_keep > 1: + # Keep at least min_tokens_to_keep + sorted_indices_to_remove[..., : self.min_tokens_to_keep] = 0 + + indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove) + scores = scores.masked_fill(indices_to_remove, self.filter_value) + return scores + + +def get_logits_warper_patch(self, generation_config): + warpers = self._get_logits_warper_old(generation_config) + warpers_to_add = LogitsProcessorList() + min_tokens_to_keep = 2 if generation_config.num_beams > 1 else 1 + + if generation_config.tfs is not None and 0.0 <= generation_config.tfs <= 1.0: + warpers_to_add.append(TailFreeLogitsWarper(tfs=generation_config.tfs, min_tokens_to_keep=min_tokens_to_keep)) + if generation_config.top_a is not None and 0.0 <= generation_config.top_a <= 1.0: + warpers_to_add.append(TopALogitsWarper(top_a=generation_config.top_a, min_tokens_to_keep=min_tokens_to_keep)) + + if warpers and isinstance(warpers[-1], LogitNormalization): + warpers = warpers[:-1] + warpers_to_add + [warpers[-1]] + else: + warpers += warpers_to_add + + return warpers + + +def generation_config_init_patch(self, **kwargs): + self.__init___old(**kwargs) + self.tfs = kwargs.pop("tfs", 1.0) + self.top_a = kwargs.pop("top_a", 0.0) + + +def hijack_samplers(): + transformers.GenerationMixin._get_logits_warper_old = transformers.GenerationMixin._get_logits_warper + transformers.GenerationMixin._get_logits_warper = get_logits_warper_patch + + transformers.GenerationConfig.__init___old = transformers.GenerationConfig.__init__ + transformers.GenerationConfig.__init__ = generation_config_init_patch diff --git a/modules/text_generation.py b/modules/text_generation.py index 1e4f9cb8..71decb0c 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -194,7 +194,7 @@ def _generate_reply(question, state, eos_token=None, stopping_strings=None, is_c def generate_reply_HF(question, original_question, seed, state, eos_token=None, stopping_strings=None, is_chat=False): generate_params = {} - for k in ['max_new_tokens', 'do_sample', 'temperature', 'top_p', 'typical_p', 'repetition_penalty', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping']: + for k in ['max_new_tokens', 'do_sample', 'temperature', 'top_p', 'typical_p', 'repetition_penalty', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping', 'tfs', 'top_a']: generate_params[k] = state[k] for k in ['epsilon_cutoff', 'eta_cutoff']: diff --git a/modules/ui.py b/modules/ui.py index aed96198..628684fb 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -38,7 +38,7 @@ def list_model_elements(): def list_interface_input_elements(chat=False): - elements = ['max_new_tokens', 'seed', 'temperature', 'top_p', 'top_k', 'typical_p', 'epsilon_cutoff', 'eta_cutoff', 'repetition_penalty', 'encoder_repetition_penalty', 'no_repeat_ngram_size', 'min_length', 'do_sample', 'penalty_alpha', 'num_beams', 'length_penalty', 'early_stopping', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', 'add_bos_token', 'ban_eos_token', 'truncation_length', 'custom_stopping_strings', 'skip_special_tokens', 'preset_menu', 'stream'] + elements = ['max_new_tokens', 'seed', 'temperature', 'top_p', 'top_k', 'typical_p', 'epsilon_cutoff', 'eta_cutoff', 'repetition_penalty', 'encoder_repetition_penalty', 'no_repeat_ngram_size', 'min_length', 'do_sample', 'penalty_alpha', 'num_beams', 'length_penalty', 'early_stopping', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', 'add_bos_token', 'ban_eos_token', 'truncation_length', 'custom_stopping_strings', 'skip_special_tokens', 'preset_menu', 'stream', 'tfs', 'top_a'] if chat: elements += ['name1', 'name2', 'greeting', 'context', 'chat_prompt_size', 'chat_generation_attempts', 'stop_at_newline', 'mode', 'instruction_template', 'character_menu', 'name1_instruct', 'name2_instruct', 'context_instruct', 'turn_template', 'chat_style', 'chat-instruct_command'] diff --git a/server.py b/server.py index 198283b7..7fb29d7f 100644 --- a/server.py +++ b/server.py @@ -101,6 +101,8 @@ def load_preset_values(preset_menu, state, return_dict=False): 'mirostat_mode': 0, 'mirostat_tau': 5.0, 'mirostat_eta': 0.1, + 'tfs': 1, + 'top_a': 0, } with open(Path(f'presets/{preset_menu}.yaml'), 'r') as infile: @@ -114,7 +116,7 @@ def load_preset_values(preset_menu, state, return_dict=False): return generate_params else: state.update(generate_params) - return state, *[generate_params[k] for k in ['do_sample', 'temperature', 'top_p', 'typical_p', 'epsilon_cutoff', 'eta_cutoff', 'repetition_penalty', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta']] + return state, *[generate_params[k] for k in ['do_sample', 'temperature', 'top_p', 'typical_p', 'epsilon_cutoff', 'eta_cutoff', 'repetition_penalty', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', 'tfs', 'top_a']] def upload_soft_prompt(file): @@ -476,6 +478,8 @@ def create_settings_menus(default_preset): shared.gradio['typical_p'] = gr.Slider(0.0, 1.0, value=generate_params['typical_p'], step=0.01, label='typical_p', info='If not set to 1, select only tokens that are at least this much more likely to appear than random tokens, given the prior text.') shared.gradio['epsilon_cutoff'] = gr.Slider(0, 9, value=generate_params['epsilon_cutoff'], step=0.01, label='epsilon_cutoff', info='In units of 1e-4; a reasonable value is 3. This sets a probability floor below which tokens are excluded from being sampled. Should be used with top_p, top_k, and eta_cutoff set to 0.') shared.gradio['eta_cutoff'] = gr.Slider(0, 20, value=generate_params['eta_cutoff'], step=0.01, label='eta_cutoff', info='In units of 1e-4; a reasonable value is 3. Should be used with top_p, top_k, and epsilon_cutoff set to 0.') + shared.gradio['tfs'] = gr.Slider(0.0, 1.0, value=generate_params['tfs'], step=0.01, label='tfs') + shared.gradio['top_a'] = gr.Slider(0.0, 1.0, value=generate_params['top_a'], step=0.01, label='top_a') with gr.Column(): shared.gradio['repetition_penalty'] = gr.Slider(1.0, 1.5, value=generate_params['repetition_penalty'], step=0.01, label='repetition_penalty', info='Exponential penalty factor for repeating prior tokens. 1 means no penalty, higher value = less repetition, lower value = more repetition.') @@ -526,7 +530,7 @@ def create_settings_menus(default_preset): gr.Markdown('[Click here for more information.](https://github.com/oobabooga/text-generation-webui/blob/main/docs/Generation-parameters.md)') - shared.gradio['preset_menu'].change(load_preset_values, [shared.gradio[k] for k in ['preset_menu', 'interface_state']], [shared.gradio[k] for k in ['interface_state', 'do_sample', 'temperature', 'top_p', 'typical_p', 'epsilon_cutoff', 'eta_cutoff', 'repetition_penalty', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta']]) + shared.gradio['preset_menu'].change(load_preset_values, [shared.gradio[k] for k in ['preset_menu', 'interface_state']], [shared.gradio[k] for k in ['interface_state', 'do_sample', 'temperature', 'top_p', 'typical_p', 'epsilon_cutoff', 'eta_cutoff', 'repetition_penalty', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', 'tfs', 'top_a']]) shared.gradio['softprompts_menu'].change(load_soft_prompt, shared.gradio['softprompts_menu'], shared.gradio['softprompts_menu'], show_progress=True) shared.gradio['upload_softprompt'].upload(upload_soft_prompt, shared.gradio['upload_softprompt'], shared.gradio['softprompts_menu'])