From b3be94a0ef1b1fb9d670f48b8d68ffae9c05acf6 Mon Sep 17 00:00:00 2001
From: MalikMAlna
Date: Thu, 6 Apr 2023 19:56:49 -0400
Subject: [PATCH 1/4] Slight cleanup of superfluous comment and space after comma

---
 data.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/data.py b/data.py
index ff79924c..72dc4574 100644
--- a/data.py
+++ b/data.py
@@ -57,7 +57,6 @@ def load_data(config, tokenizer):
     dataset_path = config["dataset_path"]
 
     if os.path.exists(dataset_path):
-        # check if path is a directory
         if os.path.isdir(dataset_path):
             files = glob.glob(os.path.join(dataset_path, "*_clean.jsonl"))
         else:
@@ -68,7 +67,7 @@ def load_data(config, tokenizer):
 
         dataset = load_dataset("json", data_files=files, split="train")
     else:
-        dataset = load_dataset(dataset_path,split='train')
+        dataset = load_dataset(dataset_path, split='train')
 
     dataset = dataset.train_test_split(test_size=.05, seed=config["seed"])


From 604176ace84c07fac9ddfade84851c63835a5d86 Mon Sep 17 00:00:00 2001
From: MalikMAlna
Date: Thu, 6 Apr 2023 19:57:46 -0400
Subject: [PATCH 2/4] Slight cleanup of superfluous comment and space after commas

---
 data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/data.py b/data.py
index 72dc4574..4457d93e 100644
--- a/data.py
+++ b/data.py
@@ -86,7 +86,7 @@ def load_data(config, tokenizer):
         **kwargs
     )
     val_dataset = val_dataset.map(
-        lambda ele: tokenize_inputs(config, tokenizer, ele), 
+        lambda ele: tokenize_inputs(config, tokenizer, ele),
         batched=True,
         remove_columns=["source", "prompt"],
         **kwargs


From 0689c2e9744e3a870da6e44b3880f49996b0dcde Mon Sep 17 00:00:00 2001
From: MalikMAlna
Date: Thu, 6 Apr 2023 20:07:08 -0400
Subject: [PATCH 3/4] Changing single to double quotes for quote consistency

---
 data.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/data.py b/data.py
index 4457d93e..e5a7fb14 100644
--- a/data.py
+++ b/data.py
@@ -31,7 +31,7 @@ def tokenize_inputs(config, tokenizer, examples):
         # add target tokens, remove bos
         input_ids[i, newline_plus_inputs: newline_plus_inputs + len(target_tokens)] = target_tokens
 
-        # add eos token, enforce stopping if we don't truncate 
+        # add eos token, enforce stopping if we don't truncate
         # we don't want long code to stop generating if truncated during training
         if newline_plus_inputs + len(target_tokens) < max_length:
             input_ids[i, newline_plus_inputs + len(target_tokens)] = tokenizer.eos_token_id
@@ -67,7 +67,7 @@ def load_data(config, tokenizer):
 
         dataset = load_dataset("json", data_files=files, split="train")
     else:
-        dataset = load_dataset(dataset_path, split='train')
+        dataset = load_dataset(dataset_path, split="train")
 
     dataset = dataset.train_test_split(test_size=.05, seed=config["seed"])


From 43ddc3eefa761a542b965e3dff443b0de1a5de88 Mon Sep 17 00:00:00 2001
From: MalikMAlna
Date: Thu, 6 Apr 2023 20:20:18 -0400
Subject: [PATCH 4/4] Rephrasing comment for clarity

---
 data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/data.py b/data.py
index e5a7fb14..a83ed3d6 100644
--- a/data.py
+++ b/data.py
@@ -31,7 +31,7 @@ def tokenize_inputs(config, tokenizer, examples):
         # add target tokens, remove bos
         input_ids[i, newline_plus_inputs: newline_plus_inputs + len(target_tokens)] = target_tokens
 
-        # add eos token, enforce stopping if we don't truncate
+        # add eos token; ensure generation stops if inputs aren't truncated
        # we don't want long code to stop generating if truncated during training
         if newline_plus_inputs + len(target_tokens) < max_length:
             input_ids[i, newline_plus_inputs + len(target_tokens)] = tokenizer.eos_token_id
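
For context, the load_data hunks above all touch the same dataset-loading branch: a local path is globbed for *_clean.jsonl shards (or used directly as a single file), anything else is treated as a Hugging Face Hub dataset name, and 5% of the rows are then held out for validation. Below is a minimal sketch of that flow, assuming the Hugging Face "datasets" library and a plain config dict; the keys, glob pattern, and split size are taken from the diff, while the standalone helper name is illustrative and not part of data.py.

    import glob
    import os

    from datasets import load_dataset


    def load_split_dataset(config):
        # Sketch of the branching shown in the diff above.
        dataset_path = config["dataset_path"]

        if os.path.exists(dataset_path):
            # A local directory is expected to hold cleaned JSONL shards.
            if os.path.isdir(dataset_path):
                files = glob.glob(os.path.join(dataset_path, "*_clean.jsonl"))
            else:
                files = [dataset_path]
            dataset = load_dataset("json", data_files=files, split="train")
        else:
            # Otherwise treat the string as a dataset name on the Hugging Face Hub.
            dataset = load_dataset(dataset_path, split="train")

        # Hold out 5% of the data for validation, seeded for reproducibility.
        return dataset.train_test_split(test_size=0.05, seed=config["seed"])

The tokenize_inputs hunks only reword a comment: the EOS id is written immediately after the target tokens, and only when the packed sequence still fits within max_length, so long examples truncated during training are deliberately left without an EOS.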