mirror of
https://github.com/nomic-ai/gpt4all.git
synced 2024-10-01 01:06:10 -04:00
Update data.py
This commit is contained in:
parent
c5f5882d46
commit
7e468f2199
4
data.py
4
data.py
@@ -70,14 +70,10 @@ def load_data(config, tokenizer):
|
||||
else:
|
||||
dataset = load_dataset(dataset_path)
|
||||
|
||||
uuids = load_dataset("json", data_files="watermark.jsonl", split="train")
|
||||
dataset = dataset.train_test_split(test_size=.05, seed=config["seed"])
|
||||
|
||||
train_dataset, val_dataset = dataset["train"], dataset["test"]
|
||||
|
||||
train_dataset = concatenate_datasets([train_dataset, uuids])
|
||||
train_dataset = train_dataset.shuffle(seed=config["seed"])
|
||||
|
||||
if config["streaming"] is False:
|
||||
kwargs = {"num_proc": config["num_proc"]}
|
||||
else:
|
||||
|
Loading…
Reference in New Issue
Block a user