Mirror of https://github.com/nomic-ai/gpt4all.git, synced 2024-10-01 01:06:10 -04:00
commit 8e28a33731

data.py (7 changes)
@@ -31,7 +31,7 @@ def tokenize_inputs(config, tokenizer, examples):
         # add target tokens, remove bos
         input_ids[i, newline_plus_inputs: newline_plus_inputs + len(target_tokens)] = target_tokens
-        # add eos token, enforce stopping if we don't truncate
+        # add eos token; ensure generation stops if inputs aren't truncated
         # we don't want long code to stop generating if truncated during training
         if newline_plus_inputs + len(target_tokens) < max_length:
             input_ids[i, newline_plus_inputs + len(target_tokens)] = tokenizer.eos_token_id
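The hunk above only rewords a comment; the EOS logic it documents is unchanged. A minimal runnable sketch of that logic, with a hypothetical helper name and toy values (only the indexing expressions come from the diff):

import numpy as np

def write_target_with_eos(input_ids, i, newline_plus_inputs, target_tokens,
                          max_length, eos_token_id):
    # Mirrors the diffed logic: copy the target tokens in after the prompt...
    n = len(target_tokens)
    input_ids[i, newline_plus_inputs:newline_plus_inputs + n] = target_tokens
    if newline_plus_inputs + n < max_length:
        # ...and append EOS only when nothing was truncated, so the model
        # never learns to stop part-way through a long (truncated) sample.
        input_ids[i, newline_plus_inputs + n] = eos_token_id

# Toy usage: a 1x10 buffer whose prompt occupies the first 3 positions.
ids = np.zeros((1, 10), dtype=np.int64)
write_target_with_eos(ids, 0, 3, np.array([5, 6, 7]), max_length=10, eos_token_id=2)
print(ids[0])  # [0 0 0 5 6 7 2 0 0 0]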
@@ -57,7 +57,6 @@ def load_data(config, tokenizer):
     dataset_path = config["dataset_path"]
-
     if os.path.exists(dataset_path):
         # check if path is a directory
         if os.path.isdir(dataset_path):
             files = glob.glob(os.path.join(dataset_path, "*_clean.jsonl"))
         else:
@@ -68,7 +67,7 @@ def load_data(config, tokenizer):
         dataset = load_dataset("json", data_files=files, split="train")

     else:
-        dataset = load_dataset(dataset_path,split='train')
+        dataset = load_dataset(dataset_path, split="train")

     dataset = dataset.train_test_split(test_size=.05, seed=config["seed"])

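Taken together, the two load_data hunks leave this flow: an existing local path is read as JSON shards, anything else is treated as a dataset name on the Hugging Face Hub, and 5% is split off for validation. A minimal sketch assuming the Hugging Face datasets library; the path and seed values are illustrative, and the single-file branch is an assumption since the diff truncates it:

import glob
import os
from datasets import load_dataset

dataset_path = "data/"  # illustrative stand-in for config["dataset_path"]

if os.path.exists(dataset_path):
    # Local path: gather cleaned jsonl shards from a directory...
    if os.path.isdir(dataset_path):
        files = glob.glob(os.path.join(dataset_path, "*_clean.jsonl"))
    else:
        files = [dataset_path]  # assumed single-file case (not shown in diff)
    dataset = load_dataset("json", data_files=files, split="train")
else:
    # ...otherwise treat the string as a Hub dataset name.
    dataset = load_dataset(dataset_path, split="train")

# Hold out 5% for validation, seeded for reproducibility.
dataset = dataset.train_test_split(test_size=0.05, seed=42)
train_dataset, val_dataset = dataset["train"], dataset["test"]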
@@ -87,7 +86,7 @@ def load_data(config, tokenizer):
         **kwargs
     )
     val_dataset = val_dataset.map(
-        lambda ele: tokenize_inputs(config, tokenizer, ele),
+        lambda ele: tokenize_inputs(config, tokenizer, ele),
         batched=True,
         remove_columns=["source", "prompt"],
         **kwargs
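For reference, a self-contained toy of the batched map(...) pattern used here, with a stand-in tokenize_inputs (the repo's real function builds token tensors from config and tokenizer; the column contents below are illustrative):

from datasets import Dataset

def tokenize_inputs(config, tokenizer, examples):
    # Stand-in: with batched=True, `examples` is a dict of column -> list.
    return {"length": [len(p) for p in examples["prompt"]]}

val_dataset = Dataset.from_dict({"source": ["a", "b"], "prompt": ["hi", "hello"]})
val_dataset = val_dataset.map(
    lambda ele: tokenize_inputs(None, None, ele),
    batched=True,                         # process examples a batch at a time
    remove_columns=["source", "prompt"],  # drop raw text once tokenized
)
print(val_dataset.column_names)  # ['length']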