diff --git a/configs/train/finetune_pythia.yaml b/configs/train/finetune_pythia.yaml
new file mode 100644
index 00000000..14fc4825
--- /dev/null
+++ b/configs/train/finetune_pythia.yaml
@@ -0,0 +1,33 @@
+# model/tokenizer
+model_name: "EleutherAI/pythia-12b"
+tokenizer_name: "EleutherAI/pythia-12b"
+gradient_checkpointing: true
+save_name: "nomic-ai/gpt4all-delphi"
+
+# dataset
+streaming: false
+num_proc: 64
+dataset_path: "nomic-ai/gpt4all-j-prompt-generations"
+revision: "v1.3-groovy"
+max_length: 1024
+batch_size: 16
+
+# train dynamics
+lr: 2.0e-5
+min_lr: 0
+weight_decay: 0.0
+eval_every: 500
+save_every: 500
+log_grads_every: 100
+output_dir: ckpts/pythia/
+checkpoint: null
+lora: false
+warmup_steps: 500
+num_epochs: 3
+
+# logging
+wandb: true
+wandb_entity: gpt4all
+wandb_project_name: gpt4all
+seed: 42
+
diff --git a/data.py b/data.py
index 8227de00..27beb6fd 100644
--- a/data.py
+++ b/data.py
@@ -72,7 +72,9 @@ def load_data(config, tokenizer):
 
         dataset = load_dataset("json", data_files=files, split="train")
     else:
-        dataset = load_dataset(dataset_path, split="train")
+        dataset = load_dataset(dataset_path,
+                               split="train",
+                               revision=config.get("revision"))
 
     dataset = dataset.train_test_split(test_size=.05, seed=config["seed"])
 
@@ -87,13 +89,13 @@ def load_data(config, tokenizer):
     train_dataset = train_dataset.map(
         lambda ele: tokenize_inputs(config, tokenizer, ele),
         batched=True,
-        remove_columns=["source", "prompt"],
+        remove_columns=["source", "prompt", "id"],
         **kwargs
     )
     val_dataset = val_dataset.map(
         lambda ele: tokenize_inputs(config, tokenizer, ele),
         batched=True,
-        remove_columns=["source", "prompt"],
+        remove_columns=["source", "prompt", "id"],
         **kwargs
     )
 