diff --git a/GPTJ.md b/GPTJ.md
new file mode 100644
index 00000000..670869f5
--- /dev/null
+++ b/GPTJ.md
@@ -0,0 +1,17 @@
+# Inference on Training Data
+
+
+## Run Inference
+
+```bash
+torchrun --master_port=29085 --nproc-per-node 8 inference.py --config=configs/inference/gptj.yaml
+```
+
+
+## Visualizations
+
+```bash
+python build_map.py
+```
+
+will build two maps in `Atlas`: one using the internal clustering algorithm provided by Nomic and one using the embeddings generated by the finetuned model.
\ No newline at end of file
diff --git a/configs/eval/generate.yaml b/configs/eval/generate.yaml
index a29f2b2a..b06137d6 100644
--- a/configs/eval/generate.yaml
+++ b/configs/eval/generate.yaml
@@ -1,15 +1,5 @@
 # model/tokenizer
-model_name: # update with llama 7b
-tokenizer_name: # update with llama 7b
+model_name: # update with llama model name
+tokenizer_name: # update with llama model name
 lora: true
 lora_path: "nomic-ai/gpt4all-lora"
-
-max_new_tokens: 512
-temperature: 0.001
-prompt: |
-  #this code prints a string reversed
-  my_string = "hello how are you"
-  print(len(my_string))
-
-
-  My code above does not work. Can you help me?
diff --git a/configs/eval/generate_baseline.yaml b/configs/eval/generate_baseline.yaml
index 7e8aa9c0..d409d3ab 100644
--- a/configs/eval/generate_baseline.yaml
+++ b/configs/eval/generate_baseline.yaml
@@ -2,16 +2,4 @@
 model_name: # update with llama model name
 tokenizer_name: # update with llama model name
 lora: true
-lora_path: "tloen/alpaca-lora-7b"
-
-
-
-max_new_tokens: 512
-temperature: 0.001
-prompt: |
-  #this code prints a string reversed
-  my_string = "hello how are you"
-  print(len(my_string))
-
-
-  My code above does not work. Can you help me?
+lora_path: "tloen/alpaca-lora-7b"
\ No newline at end of file
diff --git a/configs/eval/generate_full.yaml b/configs/eval/generate_full.yaml
deleted file mode 100644
index 972286ae..00000000
--- a/configs/eval/generate_full.yaml
+++ /dev/null
@@ -1,14 +0,0 @@
-# model/tokenizer
-model_name: # update
-tokenizer_name: # update
-lora_path: "no-lora"
-
-max_new_tokens: 512
-temperature: 0.001
-prompt: |
-  #this code prints a string reversed
-  my_string = "hello how are you"
-  print(len(my_string))
-
-
-  My code above does not work. Can you help me?
diff --git a/configs/eval/generate_gpt4all_gptj.yaml b/configs/eval/generate_gpt4all_gptj.yaml
new file mode 100644
index 00000000..496ce25f
--- /dev/null
+++ b/configs/eval/generate_gpt4all_gptj.yaml
@@ -0,0 +1,4 @@
+# model/tokenizer
+model_name: "nomic-ai/gpt4all-warmup-lr-epoch_1"
+tokenizer_name: "EleutherAI/gpt-j-6b"
+lora: false
diff --git a/configs/eval/generate_gpt4all_gptj_lora.yaml b/configs/eval/generate_gpt4all_gptj_lora.yaml
new file mode 100644
index 00000000..f27feb09
--- /dev/null
+++ b/configs/eval/generate_gpt4all_gptj_lora.yaml
@@ -0,0 +1,5 @@
+# model/tokenizer
+model_name: "EleutherAI/gpt-j-6b"
+tokenizer_name: "EleutherAI/gpt-j-6B"
+lora: true
+lora_path: "nomic-ai/gpt4all-gptj-lora-epoch_1"
diff --git a/configs/eval/generate_large_2.yaml b/configs/eval/generate_large_2.yaml
deleted file mode 100644
index 5b909905..00000000
--- a/configs/eval/generate_large_2.yaml
+++ /dev/null
@@ -1,15 +0,0 @@
-# model/tokenizer
-model_name: # update
-tokenizer_name: # update
-lora: true
-lora_path: # update
-
-max_new_tokens: 512
-temperature: 0.001
-prompt: |
-  #this code prints a string reversed
-  my_string = "hello how are you"
-  print(len(my_string))
-
-
-  My code above does not work. Can you help me?
diff --git a/configs/eval/generate_large_3.yaml b/configs/eval/generate_large_3.yaml
deleted file mode 100644
index 5b909905..00000000
--- a/configs/eval/generate_large_3.yaml
+++ /dev/null
@@ -1,15 +0,0 @@
-# model/tokenizer
-model_name: # update
-tokenizer_name: # update
-lora: true
-lora_path: # update
-
-max_new_tokens: 512
-temperature: 0.001
-prompt: |
-  #this code prints a string reversed
-  my_string = "hello how are you"
-  print(len(my_string))
-
-
-  My code above does not work. Can you help me?
diff --git a/configs/inference/gptj.yaml b/configs/inference/gptj.yaml
index 8c20efac..8b744fdb 100644
--- a/configs/inference/gptj.yaml
+++ b/configs/inference/gptj.yaml
@@ -1,11 +1,11 @@
 # model/tokenizer
-model_name: "nomic-ai/gpt4all-gptj-multinode-deepspeed-finetuned-epoch_0"
+model_name: "nomic-ai/gpt4all-warmup-lr-epoch_1"
 tokenizer_name: "EleutherAI/gpt-j-6B"
 
 # dataset
 streaming: false
 num_proc: 64
-dataset_path: "data_multiplus"
+dataset_path: "nomic-ai/turbo-500k-multi"
 max_length: 1024
 batch_size: 32
 
diff --git a/configs/train/finetune_gptj.yaml b/configs/train/finetune_gptj.yaml
index ce6feef7..1ff6a7bb 100644
--- a/configs/train/finetune_gptj.yaml
+++ b/configs/train/finetune_gptj.yaml
@@ -2,14 +2,14 @@
 model_name: "EleutherAI/gpt-j-6B"
 tokenizer_name: "EleutherAI/gpt-j-6B"
 gradient_checkpointing: true
-save_name: "nomic-ai/gpt4all-mosaic"
+save_name: "nomic-ai/gpt4all-warmup-lr"
 
 # dataset
 streaming: false
 num_proc: 64
 dataset_path: "nomic-ai/turbo-500k-multi"
 max_length: 1024
-batch_size: 8
+batch_size: 32
 
 # train dynamics
 lr: 2.0e-5
diff --git a/eval_figures.py b/eval_figures.py
index e1b50bbe..f7fca1c6 100644
--- a/eval_figures.py
+++ b/eval_figures.py
@@ -6,18 +6,20 @@ from matplotlib import pyplot as plt
 plt.figure()
 for fpath in glob.glob('./eval_data/*.pkl'):
     parts = fpath.split('__')
-    model_name = parts[1].replace('model-', '').replace('.pkl', '')
-    lora_name = parts[2].replace('lora-', '').replace('.pkl', '')
+    model_name = "-".join(fpath.replace(".pkl", "").split("_")[2:])
     with open(fpath, 'rb') as f:
         data = pickle.load(f)
 
     perplexities = data['perplexities']
     perplexities = np.nan_to_num(perplexities, 100)
     perplexities = np.clip(perplexities, 0, 100)
-    if 'nomic' in fpath:
-        label = 'GPT4all-lora'
+    if 'alpaca' not in fpath:
+        identifier = model_name = "-".join(fpath.replace(".pkl", "").split("eval__model-")[1:])
+        label = 'GPT4all-'
+        label += identifier
+
     else:
         label = 'alpaca-lora'
-    plt.hist(perplexities, label=label, alpha=.5)
+    plt.hist(perplexities, label=label, alpha=.5, bins=50)
 plt.xlabel('Perplexity')
 plt.ylabel('Frequency')
diff --git a/eval_self_instruct.py b/eval_self_instruct.py
index e0dbded1..e05a68e4 100644
--- a/eval_self_instruct.py
+++ b/eval_self_instruct.py
@@ -49,28 +49,6 @@ def eval_example(model, tokenizer, example, config):
     input = tokenizer(prompt, return_tensors="pt")
     input = {k: v.to(model.device) for k, v in input.items()}
 
-    continuations = []
-    tokenized_continuations = []
-    trajectories = []
-    for i in range(1):
-        with torch.no_grad():
-            outputs = model.generate(input_ids=input['input_ids'],
-                                     max_new_tokens=config["max_new_tokens"],
-                                     min_new_tokens=5,
-                                     temperature=config["temperature"],
-                                     repetition_penalty=1.0,
-                                     do_sample=True)
-            decoded = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
-
-            y = model(input_ids=outputs)
-            trajectory = y.hidden_states[0].detach().cpu().numpy()[0]
-            trajectory = trajectory / np.linalg.norm(trajectory, axis=1, keepdims=True)
-            trajectory = np.cumsum(trajectory, axis=0) / np.arange(1, trajectory.shape[0]+1).reshape(-1, 1)
-
-            trajectories.append(trajectory)
-            continuations.append(decoded)
-            tokenized_continuations.append(tokenizer.tokenize(decoded))
-
     #compute the ground truth perplexity
     gt_input = tokenizer(gt, return_tensors="pt")
     gt_input = {k: v.to(model.device) for k, v in gt_input.items()}
@@ -101,30 +79,23 @@ def eval_example(model, tokenizer, example, config):
     print(prompt)
     print(80*'-')
 
-    for continuation in continuations:
-        print(continuation)
-        print(80*'-')
+
-    return ppl, trajectories, continuations, tokenized_continuations
+    return ppl
 
 
 def do_eval(config):
     eval_data = read_jsonl_file('eval_data/user_oriented_instructions.jsonl')
     model, tokenizer = setup_model(config)
-    all_trajectories = []
     all_perplexities = []
-    all_continuations = []
-    all_tokenized_continuations = []
 
     for example in tqdm(eval_data):
-        gt_perplexity, trajectories, continuations, tokenized_continuations = eval_example(model, tokenizer, example, config)
-        all_trajectories.append(trajectories)
+        gt_perplexity = eval_example(model, tokenizer, example, config)
         all_perplexities.append(gt_perplexity)
-        all_continuations.append(continuations)
 
-    with open('eval_data/eval__model-{}__lora-{}.pkl'.format(config['model_name'].replace('/', '_'), config['lora_path'].replace('/', '_')), 'wb') as f:
-        r = {'trajectories': all_trajectories,
-             'perplexities': all_perplexities,
-             'continuations': all_continuations,
-             'tokenized_continuations': all_tokenized_continuations}
+
+    name = f"eval_data/eval__model-{config['model_name'].replace('/', '_')}{'__lora-' + config['lora_path'].replace('/', '_') if config['lora'] else ''}.pkl"
+
+    with open(name, 'wb') as f:
+        r = {'perplexities': all_perplexities}
         pickle.dump(r, f)
 
diff --git a/figs/perplexity_hist.png b/figs/perplexity_hist.png
index be3780b0..08fc18e2 100644
Binary files a/figs/perplexity_hist.png and b/figs/perplexity_hist.png differ
diff --git a/requirements.txt b/requirements.txt
index 656dd476..f43f8b7e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,4 +11,5 @@ deepspeed
 sentencepiece
 jsonlines
 nomic
-scikit-learn
\ No newline at end of file
+scikit-learn
+matplotlib
\ No newline at end of file