diff --git a/configs/deepspeed/ds_config_gptj.json b/configs/deepspeed/ds_config_gptj.json
index 9e7a410a..3e933966 100644
--- a/configs/deepspeed/ds_config_gptj.json
+++ b/configs/deepspeed/ds_config_gptj.json
@@ -19,7 +19,7 @@
       "device": "none"
     },
     "offload_optimizer": {
-      "device": "cpu"
+      "device": "none"
     },
     "allgather_partitions": true,
     "allgather_bucket_size": 5e8,
diff --git a/configs/train/finetune_gptj.yaml b/configs/train/finetune_gptj.yaml
index 1b42d780..f37283b3 100644
--- a/configs/train/finetune_gptj.yaml
+++ b/configs/train/finetune_gptj.yaml
@@ -2,7 +2,7 @@
 model_name: "EleutherAI/gpt-j-6B"
 tokenizer_name: "EleutherAI/gpt-j-6B"
 gradient_checkpointing: true
-save_name: "nomic-ai/gpt4all-gptj-multinode"
+save_name: "nomic-ai/gpt4all-gptj-multinode-deepspeed"
 
 # dataset
 streaming: false
@@ -12,13 +12,13 @@
 max_length: 1024
 batch_size: 32
 # train dynamics
-lr: 4.0e-5
+lr: 2.0e-5
 min_lr: 0
 weight_decay: 0.0
-eval_every: 100
+eval_every: 500
 eval_steps: 105
-save_every: 100
-log_grads_every: 100
+save_every: 500
+log_grads_every: 500
 output_dir: "ckpts/gpt4all-gptj-multinode"
 checkpoint: null
 lora: false
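For reference, a minimal sketch of how the two edited configs could be wired together, assuming the training script builds on Hugging Face's `Trainer`/`TrainingArguments`. The YAML keys below are taken directly from `finetune_gptj.yaml`; everything else (file handling, argument mapping) is illustrative and may differ from the repo's actual `train.py`:

```python
# Minimal sketch, not the repo's actual training script: load the YAML
# hyperparameters and point TrainingArguments at the DeepSpeed JSON.
import yaml
from transformers import TrainingArguments

with open("configs/train/finetune_gptj.yaml") as f:
    cfg = yaml.safe_load(f)

args = TrainingArguments(
    output_dir=cfg["output_dir"],                    # "ckpts/gpt4all-gptj-multinode"
    per_device_train_batch_size=cfg["batch_size"],   # 32
    learning_rate=cfg["lr"],                         # 2.0e-5 after this change
    weight_decay=cfg["weight_decay"],                # 0.0
    evaluation_strategy="steps",
    eval_steps=cfg["eval_every"],                    # 500 after this change
    save_steps=cfg["save_every"],                    # 500 after this change
    gradient_checkpointing=cfg["gradient_checkpointing"],
    # With offload_optimizer.device now "none", optimizer states stay in
    # GPU memory rather than being paged out to CPU RAM.
    deepspeed="configs/deepspeed/ds_config_gptj.json",
)
```

Setting `offload_optimizer.device` to `"none"` trades CPU RAM savings for throughput: optimizer states remain on the GPU, avoiding per-step transfers over PCIe.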