diff --git a/.gitignore b/.gitignore
index ea0b9126..8addd972 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,6 @@
 *.jsonl
 *tar.gz
-ckpts/
+ckpts**
 wandb
 # Byte-compiled / optimized / DLL files
 __pycache__/
diff --git a/clean.py b/clean.py
index 9cf8bf57..4712820b 100644
--- a/clean.py
+++ b/clean.py
@@ -6,8 +6,10 @@
 import jsonlines
 import pandas as pd
 
-prompt_generation_dir = "prompts-reponses"
+prompt_generation_dir = "raw_data_sanity_cleaned_without_p3/"
 for file in glob.glob(os.path.join(prompt_generation_dir, "*.jsonl")):
+    if "clean.jsonl" in file:
+        continue
     data = []
     print(file)
     with open(file) as f:
@@ -67,5 +69,5 @@ for file in glob.glob(os.path.join(prompt_generation_dir, "*.jsonl")):
     print(f"Removed {prev_len - curr_len} rows")
 
     clean_name = file.split(".jsonl")[0] + "_clean.jsonl"
-    print(f"writing to {clean_name}")
+    print(f"writing {len(df)} rows to {clean_name}")
     df.to_json(clean_name, orient="records", lines=True)
\ No newline at end of file
diff --git a/configs/train/finetune.yaml b/configs/train/finetune.yaml
index 9724bdbd..47b46f8e 100644
--- a/configs/train/finetune.yaml
+++ b/configs/train/finetune.yaml
@@ -2,27 +2,29 @@
 model_name: "zpn/llama-7b"
 tokenizer_name: "zpn/llama-7b"
 gradient_checkpointing: true
+save_name: "nomic-ai/vicuna-full-multi-turn"
 
 # dataset
 streaming: false
 num_proc: 64
-dataset_path: "data.jsonl"
-max_length: 512
+dataset_path: "data_multiturn"
+max_length: 1024
 batch_size: 32
 
 # train dynamics
 lr: 5.0e-5
-eval_every: 2000
+eval_every: 800
 eval_steps: 100
-save_every: 2000
-output_dir: "ckpts/llama-7b"
+save_every: 800
+output_dir: "ckpts/llama-7b-full-multi"
 checkpoint: null
 lora: false
 warmup_steps: 100
+num_epochs: 2
 
 # logging
-wandb: false
-wandb_entity: zanussbaum
-wandb_project: llama
+wandb: true
+wandb_entity: vicuna
+wandb_project_name: vicuna
 
 seed: 42
diff --git a/configs/train/finetune_lora.yaml b/configs/train/finetune_lora.yaml
index 51e8809e..47b1901e 100644
--- a/configs/train/finetune_lora.yaml
+++ b/configs/train/finetune_lora.yaml
@@ -2,12 +2,12 @@
 model_name: "zpn/llama-7b"
 tokenizer_name: "zpn/llama-7b"
 gradient_checkpointing: false
-save_name: "zpn/vicuna-lora"
+save_name: "nomic-ai/vicuna-lora-multi-turn"
 
 # dataset
 streaming: false
 num_proc: 64
-dataset_path: "data"
+dataset_path: "data_multiturn"
 max_length: 1024
 batch_size: 4
 
@@ -16,10 +16,11 @@ lr: 5.0e-5
 eval_every: 2000
 eval_steps: 100
 save_every: 2000
-output_dir: "ckpts/llama-7b"
+output_dir: "ckpts/llama-7b-lora-multi"
 checkpoint: null
 lora: true
 warmup_steps: 100
+num_epochs: 2
 
 # logging
 wandb: true
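Note on the config changes above: the new num_epochs and save_name keys and the renamed wandb_project_name key are plain YAML scalars, so they become ordinary dict entries once the file is parsed. A minimal sketch of loading such a config (assuming pyyaml; the load_config helper and the argparse wiring here are illustrative, not the repo's actual entry point):

    import argparse
    import yaml

    def load_config(path: str) -> dict:
        # Parse the YAML training config into a plain dict,
        # e.g. config["num_epochs"] -> 2, config["save_every"] -> 800.
        with open(path) as f:
            return yaml.safe_load(f)

    if __name__ == "__main__":
        parser = argparse.ArgumentParser()
        parser.add_argument("--config", default="configs/train/finetune.yaml")
        args = parser.parse_args()
        config = load_config(args.config)
        print(config["num_epochs"], config["output_dir"], config["wandb_project_name"])
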
diff --git a/data.py b/data.py
index ef84cc2d..db322793 100644
--- a/data.py
+++ b/data.py
@@ -1,6 +1,6 @@
 import glob
 import torch
-from datasets import load_dataset
+from datasets import load_dataset, concatenate_datasets
 import os
 from torch.utils.data import DataLoader
 from transformers import DefaultDataCollator
@@ -20,7 +20,7 @@ def tokenize_inputs(config, tokenizer, examples):
 
         # plus one since we remove bos from response
         # but we subtract one since we want to add eos token
-        remaining_tokens = max_length - input_len - len(newline_tokens)
+        remaining_tokens = max_length - input_len - len(newline_tokens) + 1
 
         # remove bos
         target_tokens = tokenizer(response, truncation=True, max_length=remaining_tokens, return_tensors="pt")["input_ids"].squeeze()[1:]
@@ -31,8 +31,10 @@ def tokenize_inputs(config, tokenizer, examples):
         # add target tokens, remove bos
         input_ids[i, newline_plus_inputs: newline_plus_inputs + len(target_tokens)] = target_tokens
 
-        # add eos token, enforce stopping
-        input_ids[i, newline_plus_inputs + len(target_tokens)] = tokenizer.eos_token_id
+        # add eos token, enforce stopping if we don't truncate
+        # we don't want long code to stop generating if truncated during training
+        if newline_plus_inputs + len(target_tokens) < max_length:
+            input_ids[i, newline_plus_inputs + len(target_tokens)] = tokenizer.eos_token_id
 
         labels = input_ids[i].clone()
         labels[: newline_plus_inputs] = -100
@@ -51,7 +53,6 @@ def tokenize_inputs(config, tokenizer, examples):
 
     return out
 
-
 def load_data(config, tokenizer):
     dataset_path = config["dataset_path"]
 
@@ -62,16 +63,21 @@ def load_data(config, tokenizer):
         else:
             files = [dataset_path]
 
+        print(f"Reading files {files}")
+
         dataset = load_dataset("json", data_files=files, split="train")
 
     else:
         dataset = load_dataset(dataset_path)
-
+        uuids = load_dataset("json", data_files="watermark.jsonl", split="train")
 
     dataset = dataset.train_test_split(test_size=.05, seed=config["seed"])
 
     train_dataset, val_dataset = dataset["train"], dataset["test"]
 
+    train_dataset = concatenate_datasets([train_dataset, uuids])
+    train_dataset = train_dataset.shuffle(seed=config["seed"])
+
     if config["streaming"] is False:
         kwargs = {"num_proc": config["num_proc"]}
     else:
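The conditional EOS write in tokenize_inputs above only fires when the packed prompt plus response still fits in max_length; truncated examples are left without an end-of-sequence token so the model is not trained to stop in the middle of a long answer. A standalone sketch of that logic with toy tensors (the variable names mirror tokenize_inputs, but the values are made up for illustration):

    import torch

    max_length = 8
    eos_token_id = 2
    pad_token_id = 0

    input_ids = torch.full((1, max_length), pad_token_id)   # one padded row
    newline_plus_inputs = 3                                  # prompt + newline length
    target_tokens = torch.tensor([11, 12, 13])               # response tokens, BOS removed

    # copy the response in after the prompt
    input_ids[0, newline_plus_inputs: newline_plus_inputs + len(target_tokens)] = target_tokens

    # only append EOS when there is room left, i.e. the response was not truncated
    if newline_plus_inputs + len(target_tokens) < max_length:
        input_ids[0, newline_plus_inputs + len(target_tokens)] = eos_token_id

    # mask the prompt positions so the loss is computed only on the response
    labels = input_ids[0].clone()
    labels[:newline_plus_inputs] = -100
    print(input_ids)
    print(labels)
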
diff --git a/train.py b/train.py
index d8ea6161..4eddccb5 100644
--- a/train.py
+++ b/train.py
@@ -55,8 +55,8 @@ def train(accelerator, config):
 
     with accelerator.main_process_first():
         train_dataloader, val_dataloader = load_data(config, tokenizer)
-
+    
 
     checkpoint = config["gradient_checkpointing"]
     model = AutoModelForCausalLM.from_pretrained(config["model_name"],
                                                  use_cache=False if checkpoint else True,
@@ -115,48 +115,56 @@ def train(accelerator, config):
         "gradient_accumulation_steps"
     ]
 
-    for step, batch in enumerate(tqdm(train_dataloader)):
-        model.train()
-        outputs = model(**batch)
-        loss = outputs.loss
-        loss = loss / gradient_accumulation_steps
+    for epoch in range(config["num_epochs"]):
+        for step, batch in enumerate(tqdm(train_dataloader)):
+            model.train()
+            outputs = model(**batch)
+            loss = outputs.loss
+            loss = loss / gradient_accumulation_steps
 
-        accelerator.backward(loss)
+            accelerator.backward(loss)
 
-        # log LR in case something weird happens
-        if step > 0 and step % (config["eval_every"] // 10) == 0:
-            if config["wandb"]:
-                accelerator.log({"lr": scheduler.get_last_lr()[0]}, step=step)
+            # log LR in case something weird happens
+            if step > 0 and step % (config["eval_every"] // 10) == 0:
+                if config["wandb"]:
+                    accelerator.log({"lr": scheduler.get_last_lr()[0]}, step=step)
 
-        if (step + 1) % gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
-            optimizer.step()
-            scheduler.step()
-            optimizer.zero_grad()
+            if (step + 1) % gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
+                optimizer.step()
+                scheduler.step()
+                optimizer.zero_grad()
 
-        loss_values = accelerator.gather_for_metrics({"loss": loss.detach()})
-        train_loss.update(loss_values["loss"])
+            loss_values = accelerator.gather_for_metrics({"loss": loss.detach()})
+            train_loss.update(loss_values["loss"])
 
-        if step > 0 and step % config["save_every"] == 0:
-            accelerator.save_state(f"{config['output_dir']}/step_{step}")
+            if step > 0 and step % config["save_every"] == 0:
+                accelerator.save_state(f"{config['output_dir']}/step_{step}")
 
-        if step > 0 and step % config["eval_every"] == 0:
-            val_loss = evaluate(config, model, val_dataloader)
+            if step > 0 and step % config["eval_every"] == 0:
+                val_loss = evaluate(config, model, val_dataloader)
 
-            log_train = {
-                "train_loss": train_loss.compute()
+                log_train = {
+                    "train_loss": train_loss.compute()
+                }
+                log_val = {
+                    "val_loss": val_loss.compute()
                 }
-            log_val = {
-                "val_loss": val_loss.compute()
-            }
 
-            if config["wandb"]:
-                accelerator.log({**log_train, **log_val}, step=step)
+                if config["wandb"]:
+                    accelerator.log({**log_train, **log_val}, step=step)
 
-            accelerator.print(f"Current LR: {scheduler.get_last_lr()[0]}")
-            accelerator.print(format_metrics(log_train, "train", f" step {step} "))
-            accelerator.print(format_metrics(log_val, "val", f" step {step} "))
+                accelerator.print(f"Current LR: {scheduler.get_last_lr()[0]}")
+                accelerator.print(format_metrics(log_train, "train", f" step {step} "))
+                accelerator.print(format_metrics(log_val, "val", f" step {step} "))
 
-            train_loss.reset()
+                train_loss.reset()
+
+        accelerator.print(f"Epoch {epoch} finished")
+        accelerator.print(f"Pushing to HF hub")
+        accelerator.wait_for_everyone()
+        unwrapped_model = accelerator.unwrap_model(model)
+        if accelerator.is_main_process:
+            unwrapped_model.push_to_hub(config["save_name"] + "_first_epoch", private=True)
 
 
     accelerator.wait_for_everyone()
@@ -168,7 +176,8 @@
         state_dict=accelerator.get_state_dict(model),
     )
 
-    unwrapped_model.push_to_hub(config["save_name"], private=True)
+    if accelerator.is_main_process:
+        unwrapped_model.push_to_hub(config["save_name"], private=True)
 
     accelerator.end_training()
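The per-epoch Hub push added above, and the now-guarded final push, both follow the usual Accelerate pattern: synchronize every rank, unwrap the prepared model, and let only the main process save or upload. A condensed, hedged sketch of that pattern using a tiny public model so it can be tried locally (sshleifer/tiny-gpt2 is just a small stand-in, not the model trained here; the actual push requires huggingface-cli login and a repo you own, so that call is left commented out):

    from accelerate import Accelerator
    from transformers import AutoModelForCausalLM

    accelerator = Accelerator()
    model = AutoModelForCausalLM.from_pretrained("sshleifer/tiny-gpt2")
    model = accelerator.prepare(model)

    # sync all ranks before touching the weights, then let only rank 0 save/upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    if accelerator.is_main_process:
        unwrapped_model.save_pretrained("ckpts/tiny-demo")  # local stand-in for push_to_hub
        # unwrapped_model.push_to_hub("your-namespace/tiny-demo", private=True)
    accelerator.wait_for_everyone()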