text-generation-webui/modules/evaluate.py

import datetime
from pathlib import Path

import pandas as pd
import torch
from datasets import load_dataset
from tqdm import tqdm

from modules import shared
from modules.logging_colors import logger
from modules.models import clear_torch_cache, load_model, unload_model
from modules.models_settings import get_model_metadata, update_model_parameters
from modules.text_generation import encode


def load_past_evaluations():
    '''
    Return the table of previously recorded perplexity evaluations.

    When logs/evaluations.csv exists, it is read with every column as a
    string and the 'Perplexity' column is then converted to a numeric dtype.
    Otherwise an empty DataFrame with the expected columns is returned.
    '''
    csv_path = Path('logs/evaluations.csv')

    # Nothing recorded yet: hand back an empty table with the known schema.
    if not csv_path.exists():
        return pd.DataFrame(columns=['Model', 'LoRAs', 'Dataset', 'Perplexity', 'stride', 'max_length', 'Date', 'Comment'])

    evaluations = pd.read_csv(csv_path, dtype=str)
    evaluations['Perplexity'] = pd.to_numeric(evaluations['Perplexity'])
    return evaluations


# Module-level table of recorded evaluations, loaded once at import time.
# It is rebound by save_past_evaluations() and add_entry_to_past_evaluations().
past_evaluations = load_past_evaluations()


def save_past_evaluations(df):
    '''
    Make `df` the module-level evaluations table and write it to
    logs/evaluations.csv (without the index), creating the parent
    directory when it does not exist yet.
    '''
    global past_evaluations
    past_evaluations = df

    target = Path('logs/evaluations.csv')
    target.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(target, index=False)


def calculate_perplexity(models, input_dataset, stride, _max_length):
    '''
    Generator that evaluates the perplexity of each entry in `models` on
    `input_dataset`, yielding a progressively updated log string.

    Based on:
    https://huggingface.co/docs/transformers/perplexity#calculating-ppl-with-fixedlength-models

    Parameters:
        models: iterable of model names. The special name 'current model'
            evaluates whatever is already in shared.model without reloading.
        input_dataset: 'wikitext', 'ptb', 'ptb_new', or the stem of a text
            file located at training/datasets/<input_dataset>.txt.
        stride: step (in tokens) between the starts of consecutive windows.
        _max_length: window length in tokens. When falsy, the model config's
            max_position_embeddings is used if present, otherwise 2048.

    Yields:
        str: the cumulative log, plus a transient progress line while
        loading, tokenizing and evaluating.

    Raises:
        ValueError: if the active loader/options cannot be used for
            perplexity evaluation (raised when iteration starts).
    '''

    if shared.args.loader == "llama.cpp":
        message = "llamacpp_HF is required for perplexity evaluation with GGUF models. Please reload the model with llamacpp_HF instead of llama.cpp."
        logger.error(message)
        raise ValueError(message)

    if shared.args.loader == "ExLlamav2":
        message = "ExLlamav2_HF is required for perplexity evaluation with EXL2 models. Please reload the model with ExLlamav2_HF instead of ExLlamav2."
        logger.error(message)
        raise ValueError(message)

    if shared.args.loader == "llamacpp_HF" and not shared.args.logits_all:
        message = "--logits_all is required for perplexity evaluation with GGUF models. Please reload the model with that option set/checked."
        logger.error(message)
        raise ValueError(message)

    if not shared.args.no_use_fast:
        logger.warning("--no_use_fast is not set. If tokenizing the input dataset takes a long time, try reloading the model with that option set/checked.")

    global past_evaluations
    cumulative_log = ''
    cumulative_log += "Loading the input dataset...\n\n"
    yield cumulative_log

    # Copied from https://github.com/qwopqwop200/GPTQ-for-LLaMa/blob/triton/utils/datautils.py
    if input_dataset == 'wikitext':
        data = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test')
        text = "\n\n".join(data['text'])
    elif input_dataset == 'ptb':
        data = load_dataset('ptb_text_only', 'penn_treebank', split='validation')
        text = "\n\n".join(data['sentence'])
    elif input_dataset == 'ptb_new':
        data = load_dataset('ptb_text_only', 'penn_treebank', split='test')
        text = " ".join(data['sentence'])
    else:
        # Any other name is treated as a local text file.
        with open(Path(f'training/datasets/{input_dataset}.txt'), 'r', encoding='utf-8') as f:
            text = f.read()

    for model in models:
        # NOTE(review): for 'current model' this lookup uses that literal
        # string, while results are stored under shared.model_name, so the
        # check probably never matches in that case -- confirm it is intended.
        if is_in_past_evaluations(model, input_dataset, stride, _max_length):
            cumulative_log += f"`{model}` has already been tested. Ignoring.\n\n"
            yield cumulative_log
            continue

        if model != 'current model':
            # The progress yield stays outside the try block: if the consumer
            # closes the generator while it is suspended here, GeneratorExit
            # must propagate instead of being treated as a load failure.
            yield cumulative_log + f"Loading `{model}`...\n\n"
            try:
                model_settings = get_model_metadata(model)
                shared.settings.update({k: v for k, v in model_settings.items() if k in shared.settings})  # hijacking the interface defaults
                update_model_parameters(model_settings)  # hijacking the command-line arguments
                unload_model()
                shared.model, shared.tokenizer = load_model(model)
            except Exception as exc:
                # Best effort: report the failure and carry on with the next
                # model. KeyboardInterrupt/SystemExit are not swallowed.
                logger.error(f"Failed to load `{model}`: {exc}")
                cumulative_log += f"Failed to load `{model}`. Moving on.\n\n"
                yield cumulative_log
                continue

        cumulative_log += f"Processing `{shared.model_name}`...\n\n"
        yield cumulative_log + "Tokenizing the input dataset...\n\n"
        encodings = encode(text, add_special_tokens=False)
        seq_len = encodings.shape[1]

        # With no tokens there are no windows, and torch.stack() below would
        # fail on an empty list; skip this model with a clear message instead.
        if seq_len == 0:
            cumulative_log += "The input dataset is empty after tokenization. Skipping.\n\n"
            yield cumulative_log
            continue

        if _max_length:
            max_length = _max_length
        elif hasattr(shared.model.config, 'max_position_embeddings'):
            max_length = shared.model.config.max_position_embeddings
        else:
            max_length = 2048

        nlls = []
        prev_end_loc = 0
        for begin_loc in tqdm(range(0, seq_len, stride)):
            yield cumulative_log + f"Evaluating... {100*begin_loc/seq_len:.2f}%"
            end_loc = min(begin_loc + max_length, seq_len)
            trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
            input_ids = encodings[:, begin_loc:end_loc]
            target_ids = input_ids.clone()
            # Only the last trg_len positions are scored; earlier tokens
            # serve purely as context (-100 labels are excluded from the loss).
            target_ids[:, :-trg_len] = -100
            clear_torch_cache()
            with torch.no_grad():
                outputs = shared.model(input_ids=input_ids, labels=target_ids)

                # loss is calculated using CrossEntropyLoss which averages over valid labels
                # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
                # to the left by 1.
                neg_log_likelihood = outputs.loss

            nlls.append(neg_log_likelihood)
            prev_end_loc = end_loc
            if end_loc == seq_len:
                break

        # Perplexity is the exponential of the mean per-window loss.
        ppl = float(torch.exp(torch.stack(nlls).mean()))

        add_entry_to_past_evaluations(ppl, shared.model_name, input_dataset, stride, _max_length)
        save_past_evaluations(past_evaluations)

        message = f"The perplexity for `{shared.model_name}` is: {ppl}"
        logger.info(message)

        cumulative_log += f"{message}\n\n"
        yield cumulative_log


def add_entry_to_past_evaluations(perplexity, model, dataset, stride, max_length):
    '''
    Append one evaluation result as a new row of the module-level
    past_evaluations table. Only the in-memory table is updated here;
    nothing is written to disk.

    `stride` and `max_length` are stored as strings, the LoRA column lists
    shared.lora_names joined by ', ' (or '-' when there are none), and the
    row is stamped with the current local time.
    '''
    global past_evaluations

    lora_label = ', '.join(shared.lora_names) or '-'
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    new_row = pd.DataFrame([{
        'Model': model,
        'LoRAs': lora_label,
        'Dataset': dataset,
        'Perplexity': perplexity,
        'stride': str(stride),
        'max_length': str(max_length),
        'Date': timestamp,
        'Comment': '',
    }])

    past_evaluations = pd.concat([past_evaluations, new_row], ignore_index=True)


def is_in_past_evaluations(model, dataset, stride, max_length):
    '''
    Return True when past_evaluations already holds a row with the same
    model, dataset, max_length and stride; False otherwise.

    `stride` and `max_length` are compared in their string form, which is
    how they are stored in the table.
    '''
    table = past_evaluations

    same_model = table['Model'] == model
    same_dataset = table['Dataset'] == dataset
    same_max_length = table['max_length'] == str(max_length)
    same_stride = table['stride'] == str(stride)

    matches = same_model & same_dataset & same_max_length & same_stride
    return bool(matches.any())


def generate_markdown_table():
    '''
    Return the recorded evaluations sorted by Dataset, stride, Perplexity
    and Date, in that order of priority.

    Despite its name, this returns a sorted DataFrame rather than a
    markdown string.
    '''
    sort_keys = ['Dataset', 'stride', 'Perplexity', 'Date']
    return past_evaluations.sort_values(by=sort_keys)
Add an "Evaluate" tab to calculate the perplexities of models (#1322) 2023-04-20 23:20:33 -04:00			`import datetime`
			`from pathlib import Path`

			`import pandas as pd`
			`import torch`
			`from datasets import load_dataset`
			`from tqdm import tqdm`

			`from modules import shared`
Add a warning about ppl evaluation without --no_use_fast 2023-12-18 21:09:24 -05:00			`from modules.logging_colors import logger`
Clear the torch cache while evaluating 2023-10-16 13:52:50 -04:00			`from modules.models import clear_torch_cache, load_model, unload_model`
Read GGUF metadata (#3873) 2023-09-11 17:49:30 -04:00			`from modules.models_settings import get_model_metadata, update_model_parameters`
Add an "Evaluate" tab to calculate the perplexities of models (#1322) 2023-04-20 23:20:33 -04:00			`from modules.text_generation import encode`


			`def load_past_evaluations():`
			`if Path('logs/evaluations.csv').exists():`
			`df = pd.read_csv(Path('logs/evaluations.csv'), dtype=str)`
			`df['Perplexity'] = pd.to_numeric(df['Perplexity'])`
			`return df`
			`else:`
			`return pd.DataFrame(columns=['Model', 'LoRAs', 'Dataset', 'Perplexity', 'stride', 'max_length', 'Date', 'Comment'])`
Style improvements (#1957) 2023-05-09 21:49:39 -04:00

Add an "Evaluate" tab to calculate the perplexities of models (#1322) 2023-04-20 23:20:33 -04:00			`past_evaluations = load_past_evaluations()`


			`def save_past_evaluations(df):`
Fix evaluate comment saving 2023-04-21 11:34:08 -04:00			`global past_evaluations`
			`past_evaluations = df`
Fix "perplexity evaluation" progress messages 2023-05-23 00:54:52 -04:00			`filepath = Path('logs/evaluations.csv')`
			`filepath.parent.mkdir(parents=True, exist_ok=True)`
			`df.to_csv(filepath, index=False)`
Add an "Evaluate" tab to calculate the perplexities of models (#1322) 2023-04-20 23:20:33 -04:00

			`def calculate_perplexity(models, input_dataset, stride, _max_length):`
			`'''`
			`Based on:`
			`https://huggingface.co/docs/transformers/perplexity#calculating-ppl-with-fixedlength-models`
			`'''`

Perplexity evaluation: add some informative error messages 2024-02-21 23:19:47 -05:00			`if shared.args.loader == "llama.cpp":`
			`logger.error("llamacpp_HF is required for perplexity evaluation with GGUF models. Please reload the model with llamacpp_HF instead of llama.cpp.")`
			`raise ValueError`

			`if shared.args.loader == "ExLlamav2":`
			`logger.error("ExLlamav2_HF is required for perplexity evaluation with EXL2 models. Please reload the model with ExLlamav2_HF instead of ExLlamav2.")`
			`raise ValueError`

			`if shared.args.loader == "llamacpp_HF" and not shared.args.logits_all:`
			`logger.error("--logits_all is required for perplexity evaluation with GGUF models. Please reload the model with that option set/checked.")`
			`raise ValueError`

Add a warning about ppl evaluation without --no_use_fast 2023-12-18 21:09:24 -05:00			`if not shared.args.no_use_fast:`
Perplexity evaluation: add some informative error messages 2024-02-21 23:19:47 -05:00			`logger.warning("--no_use_fast is not set. If tokenizing the input dataset takes a long time, try reloading the model with that option set/checked.")`
Add a warning about ppl evaluation without --no_use_fast 2023-12-18 21:09:24 -05:00
Add an "Evaluate" tab to calculate the perplexities of models (#1322) 2023-04-20 23:20:33 -04:00			`global past_evaluations`
			`cumulative_log = ''`
Fix "perplexity evaluation" progress messages 2023-05-23 00:54:52 -04:00			`cumulative_log += "Loading the input dataset...\n\n"`
Add an "Evaluate" tab to calculate the perplexities of models (#1322) 2023-04-20 23:20:33 -04:00			`yield cumulative_log`

			`# Copied from https://github.com/qwopqwop200/GPTQ-for-LLaMa/blob/triton/utils/datautils.py`
			`if input_dataset == 'wikitext':`
			`data = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test')`
			`text = "\n\n".join(data['text'])`
			`elif input_dataset == 'ptb':`
			`data = load_dataset('ptb_text_only', 'penn_treebank', split='validation')`
			`text = "\n\n".join(data['sentence'])`
			`elif input_dataset == 'ptb_new':`
			`data = load_dataset('ptb_text_only', 'penn_treebank', split='test')`
			`text = " ".join(data['sentence'])`
			`else:`
			`with open(Path(f'training/datasets/{input_dataset}.txt'), 'r', encoding='utf-8') as f:`
			`text = f.read()`

			`for model in models:`
			`if is_in_past_evaluations(model, input_dataset, stride, _max_length):`
Minor improvements to evaluation logs 2023-10-15 23:51:43 -04:00			cumulative_log += f"`{model}` has already been tested. Ignoring.\n\n"
Add an "Evaluate" tab to calculate the perplexities of models (#1322) 2023-04-20 23:20:33 -04:00			`yield cumulative_log`
			`continue`

			`if model != 'current model':`
			`try:`
Minor improvements to evaluation logs 2023-10-15 23:51:43 -04:00			yield cumulative_log + f"Loading `{model}`...\n\n"
Read GGUF metadata (#3873) 2023-09-11 17:49:30 -04:00			`model_settings = get_model_metadata(model)`
Prevent extra keys from being saved to settings.yaml 2023-09-11 23:13:10 -04:00			`shared.settings.update({k: v for k, v in model_settings.items() if k in shared.settings}) # hijacking the interface defaults`
Add an "Evaluate" tab to calculate the perplexities of models (#1322) 2023-04-20 23:20:33 -04:00			`update_model_parameters(model_settings) # hijacking the command-line arguments`
			`unload_model()`
Cleanup: set shared.model_name only once 2023-12-08 09:35:23 -05:00			`shared.model, shared.tokenizer = load_model(model)`
Add an "Evaluate" tab to calculate the perplexities of models (#1322) 2023-04-20 23:20:33 -04:00			`except:`
Minor improvements to evaluation logs 2023-10-15 23:51:43 -04:00			cumulative_log += f"Failed to load `{model}`. Moving on.\n\n"
Add an "Evaluate" tab to calculate the perplexities of models (#1322) 2023-04-20 23:20:33 -04:00			`yield cumulative_log`
			`continue`

Minor improvements to evaluation logs 2023-10-15 23:51:43 -04:00			cumulative_log += f"Processing `{shared.model_name}`...\n\n"
Fix "perplexity evaluation" progress messages 2023-05-23 00:54:52 -04:00			`yield cumulative_log + "Tokenizing the input dataset...\n\n"`
Add an "Evaluate" tab to calculate the perplexities of models (#1322) 2023-04-20 23:20:33 -04:00			`encodings = encode(text, add_special_tokens=False)`
			`seq_len = encodings.shape[1]`
Minor fix 2023-05-29 12:31:17 -04:00			`if _max_length:`
			`max_length = _max_length`
			`elif hasattr(shared.model.config, 'max_position_embeddings'):`
			`max_length = shared.model.config.max_position_embeddings`
			`else:`
			`max_length = 2048`
Attempt at evaluating falcon perplexity (failed) 2023-05-29 12:28:25 -04:00
Add an "Evaluate" tab to calculate the perplexities of models (#1322) 2023-04-20 23:20:33 -04:00			`nlls = []`
			`prev_end_loc = 0`
			`for begin_loc in tqdm(range(0, seq_len, stride)):`
			`yield cumulative_log + f"Evaluating... {100*begin_loc/seq_len:.2f}%"`
			`end_loc = min(begin_loc + max_length, seq_len)`
			`trg_len = end_loc - prev_end_loc # may be different from stride on last loop`
			`input_ids = encodings[:, begin_loc:end_loc]`
			`target_ids = input_ids.clone()`
			`target_ids[:, :-trg_len] = -100`
Clear the torch cache while evaluating 2023-10-16 13:52:50 -04:00			`clear_torch_cache()`
Add an "Evaluate" tab to calculate the perplexities of models (#1322) 2023-04-20 23:20:33 -04:00			`with torch.no_grad():`
Failed attempt at evaluating exllama_hf perplexity 2023-06-24 11:02:25 -04:00			`outputs = shared.model(input_ids=input_ids, labels=target_ids)`
Add an "Evaluate" tab to calculate the perplexities of models (#1322) 2023-04-20 23:20:33 -04:00
			`# loss is calculated using CrossEntropyLoss which averages over valid labels`
			`# N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels`
			`# to the left by 1.`
			`neg_log_likelihood = outputs.loss`

			`nlls.append(neg_log_likelihood)`
			`prev_end_loc = end_loc`
			`if end_loc == seq_len:`
			`break`

			`ppl = torch.exp(torch.stack(nlls).mean())`
Perplexity evaluation: print to terminal after calculation is finished 2024-02-28 22:58:21 -05:00
Add an "Evaluate" tab to calculate the perplexities of models (#1322) 2023-04-20 23:20:33 -04:00			`add_entry_to_past_evaluations(float(ppl), shared.model_name, input_dataset, stride, _max_length)`
			`save_past_evaluations(past_evaluations)`
Perplexity evaluation: print to terminal after calculation is finished 2024-02-28 22:58:21 -05:00
			message = f"The perplexity for `{shared.model_name}` is: {float(ppl)}"
			`logger.info(message)`

			`cumulative_log += f"{message}\n\n"`
Add an "Evaluate" tab to calculate the perplexities of models (#1322) 2023-04-20 23:20:33 -04:00			`yield cumulative_log`


			`def add_entry_to_past_evaluations(perplexity, model, dataset, stride, max_length):`
			`global past_evaluations`
			`entry = {`
			`'Model': model,`
			`'LoRAs': ', '.join(shared.lora_names) or '-',`
			`'Dataset': dataset,`
			`'Perplexity': perplexity,`
			`'stride': str(stride),`
			`'max_length': str(max_length),`
			`'Date': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),`
			`'Comment': ''`
			`}`
			`past_evaluations = pd.concat([past_evaluations, pd.DataFrame([entry])], ignore_index=True)`


			`def is_in_past_evaluations(model, dataset, stride, max_length):`
			`entries = past_evaluations[(past_evaluations['Model'] == model) &`
			`(past_evaluations['Dataset'] == dataset) &`
			`(past_evaluations['max_length'] == str(max_length)) &`
			`(past_evaluations['stride'] == str(stride))]`

			`if entries.shape[0] > 0:`
			`return True`
			`else:`
			`return False`


			`def generate_markdown_table():`
			`sorted_df = past_evaluations.sort_values(by=['Dataset', 'stride', 'Perplexity', 'Date'])`
			`return sorted_df`