Mirror of https://github.com/nomic-ai/gpt4all.git, synced 2024-10-01 01:06:10 -04:00
commit 8e28a33731

data.py (7 changes)
@@ -31,7 +31,7 @@ def tokenize_inputs(config, tokenizer, examples):
         # add target tokens, remove bos
         input_ids[i, newline_plus_inputs: newline_plus_inputs + len(target_tokens)] = target_tokens
-        # add eos token, enforce stopping if we don't truncate
+        # add eos token; ensure generation stops if inputs aren't truncated
         # we don't want long code to stop generating if truncated during training
         if newline_plus_inputs + len(target_tokens) < max_length:
             input_ids[i, newline_plus_inputs + len(target_tokens)] = tokenizer.eos_token_id
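The hunk above only rewords a comment; the EOS logic it documents is unchanged. A minimal runnable sketch of that logic, with a hypothetical helper name and toy values (only the indexing expressions come from the diff):

import numpy as np

def write_target_with_eos(input_ids, i, newline_plus_inputs, target_tokens,
                          max_length, eos_token_id):
    # Mirrors the diffed logic: copy the target tokens in after the prompt...
    n = len(target_tokens)
    input_ids[i, newline_plus_inputs:newline_plus_inputs + n] = target_tokens
    if newline_plus_inputs + n < max_length:
        # ...and append EOS only when nothing was truncated, so the model
        # never learns to stop part-way through a long (truncated) sample.
        input_ids[i, newline_plus_inputs + n] = eos_token_id

# Toy usage: a 1x10 buffer whose prompt occupies the first 3 positions.
ids = np.zeros((1, 10), dtype=np.int64)
write_target_with_eos(ids, 0, 3, np.array([5, 6, 7]), max_length=10, eos_token_id=2)
print(ids[0])  # [0 0 0 5 6 7 2 0 0 0]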
@@ -57,7 +57,6 @@ def load_data(config, tokenizer):
     dataset_path = config["dataset_path"]
-
     if os.path.exists(dataset_path):
         # check if path is a directory
         if os.path.isdir(dataset_path):
             files = glob.glob(os.path.join(dataset_path, "*_clean.jsonl"))
         else:
@@ -68,7 +67,7 @@ def load_data(config, tokenizer):
         dataset = load_dataset("json", data_files=files, split="train")

     else:
-        dataset = load_dataset(dataset_path,split='train')
+        dataset = load_dataset(dataset_path, split="train")

     dataset = dataset.train_test_split(test_size=.05, seed=config["seed"])

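Taken together, the two load_data hunks leave this flow: an existing local path is read as JSON shards, anything else is treated as a dataset name on the Hugging Face Hub, and 5% is split off for validation. A minimal sketch assuming the Hugging Face datasets library; the path and seed values are illustrative, and the single-file branch is an assumption since the diff truncates it:

import glob
import os
from datasets import load_dataset

dataset_path = "data/"  # illustrative stand-in for config["dataset_path"]

if os.path.exists(dataset_path):
    # Local path: gather cleaned jsonl shards from a directory...
    if os.path.isdir(dataset_path):
        files = glob.glob(os.path.join(dataset_path, "*_clean.jsonl"))
    else:
        files = [dataset_path]  # assumed single-file case (not shown in diff)
    dataset = load_dataset("json", data_files=files, split="train")
else:
    # ...otherwise treat the string as a Hub dataset name.
    dataset = load_dataset(dataset_path, split="train")

# Hold out 5% for validation, seeded for reproducibility.
dataset = dataset.train_test_split(test_size=0.05, seed=42)
train_dataset, val_dataset = dataset["train"], dataset["test"]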
@@ -87,7 +86,7 @@ def load_data(config, tokenizer):
         **kwargs
     )
     val_dataset = val_dataset.map(
-        lambda ele: tokenize_inputs(config, tokenizer, ele),
+        lambda ele: tokenize_inputs(config, tokenizer, ele),
         batched=True,
         remove_columns=["source", "prompt"],
         **kwargs
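For reference, a self-contained toy of the batched map(...) pattern used here, with a stand-in tokenize_inputs (the repo's real function builds token tensors from config and tokenizer; the column contents below are illustrative):

from datasets import Dataset

def tokenize_inputs(config, tokenizer, examples):
    # Stand-in: with batched=True, `examples` is a dict of column -> list.
    return {"length": [len(p) for p in examples["prompt"]]}

val_dataset = Dataset.from_dict({"source": ["a", "b"], "prompt": ["hi", "hello"]})
val_dataset = val_dataset.map(
    lambda ele: tokenize_inputs(None, None, ele),
    batched=True,                         # process examples a batch at a time
    remove_columns=["source", "prompt"],  # drop raw text once tokenized
)
print(val_dataset.column_names)  # ['length']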