Merge branch 'gptj' of github.com:nomic-ai/gpt4all into gptj

2024-10-01 01:06:10 -04:00 · 2023-04-10 02:15:47 +00:00 · 2023-04-10 02:15:47 +00:00 · bbbf007ed9
commit bbbf007ed9
parent 9dfd8e1a7c 311c818934
14 changed files with 50 additions and 116 deletions
--- a/GPTJ.md
+++ b/GPTJ.md
@ -0,0 +1,17 @@
+# Inference on Training Data
+
+
+## Run Inference
+
+```bash
+torchrun --master_port=29085 --nproc-per-node 8 inference.py --config=configs/inference/gptj.yaml
+```
+
+
+## Visualizations
+
+```bash
+python build_map.py
+```
+ 
+This builds two maps in `Atlas`: one using the internal clustering algorithm provided by Nomic, and one using the embeddings generated by the finetuned model.
--- a/configs/eval/generate.yaml
+++ b/configs/eval/generate.yaml
@ -1,15 +1,5 @@
 # model/tokenizer
-model_name: # update with llama 7b 
-tokenizer_name: # update with llama 7b
+model_name: # update with llama model name
+tokenizer_name: # update with llama model name
 lora: true
 lora_path: "nomic-ai/gpt4all-lora"
-
-max_new_tokens: 512
-temperature: 0.001
-prompt: | 
-  #this code prints a string reversed
-  my_string = "hello how are you"
-  print(len(my_string))
-
-
-  My code above does not work. Can you help me?
--- a/configs/eval/generate_baseline.yaml
+++ b/configs/eval/generate_baseline.yaml
@ -2,16 +2,4 @@
 model_name: # update with llama model name
 tokenizer_name: # update with llama model name
 lora: true
-lora_path: "tloen/alpaca-lora-7b"
-
-
-
-max_new_tokens: 512
-temperature: 0.001
-prompt: | 
-  #this code prints a string reversed
-  my_string = "hello how are you"
-  print(len(my_string))
-
-
-  My code above does not work. Can you help me?
+lora_path: "tloen/alpaca-lora-7b"
--- a/configs/eval/generate_full.yaml
+++ b/configs/eval/generate_full.yaml
@ -1,14 +0,0 @@
-# model/tokenizer
-model_name: # update
-tokenizer_name: # update
-lora_path: "no-lora"
-
-max_new_tokens: 512
-temperature: 0.001
-prompt: | 
-  #this code prints a string reversed
-  my_string = "hello how are you"
-  print(len(my_string))
-
-
-  My code above does not work. Can you help me?
--- a/configs/eval/generate_gpt4all_gptj.yaml
+++ b/configs/eval/generate_gpt4all_gptj.yaml
@ -0,0 +1,4 @@
+# model/tokenizer
+model_name: "nomic-ai/gpt4all-warmup-lr-epoch_1"
+tokenizer_name: "EleutherAI/gpt-j-6b"
+lora: false
--- a/configs/eval/generate_gpt4all_gptj_lora.yaml
+++ b/configs/eval/generate_gpt4all_gptj_lora.yaml
@ -0,0 +1,5 @@
+# model/tokenizer
+model_name: "EleutherAI/gpt-j-6b"
+tokenizer_name: "EleutherAI/gpt-j-6B"
+lora: true
+lora_path: "nomic-ai/gpt4all-gptj-lora-epoch_1"
--- a/configs/eval/generate_large_2.yaml
+++ b/configs/eval/generate_large_2.yaml
@ -1,15 +0,0 @@
-# model/tokenizer
-model_name: # update
-tokenizer_name: # update
-lora: true
-lora_path: # update
-
-max_new_tokens: 512
-temperature: 0.001
-prompt: | 
-  #this code prints a string reversed
-  my_string = "hello how are you"
-  print(len(my_string))
-
-
-  My code above does not work. Can you help me?
--- a/configs/eval/generate_large_3.yaml
+++ b/configs/eval/generate_large_3.yaml
@ -1,15 +0,0 @@
-# model/tokenizer
-model_name: # update
-tokenizer_name: # update
-lora: true
-lora_path: # update
-
-max_new_tokens: 512
-temperature: 0.001
-prompt: | 
-  #this code prints a string reversed
-  my_string = "hello how are you"
-  print(len(my_string))
-
-
-  My code above does not work. Can you help me?
--- a/configs/inference/gptj.yaml
+++ b/configs/inference/gptj.yaml
@ -1,11 +1,11 @@
 # model/tokenizer
-model_name: "nomic-ai/gpt4all-gptj-multinode-deepspeed-finetuned-epoch_0"
+model_name: "nomic-ai/gpt4all-warmup-lr-epoch_1"
 tokenizer_name: "EleutherAI/gpt-j-6B"

 # dataset
 streaming: false
 num_proc: 64
-dataset_path: "data_multiplus" 
+dataset_path: "nomic-ai/turbo-500k-multi" 
 max_length: 1024
 batch_size: 32 

--- a/configs/train/finetune_gptj.yaml
+++ b/configs/train/finetune_gptj.yaml
@ -2,14 +2,14 @@
 model_name: "EleutherAI/gpt-j-6B"
 tokenizer_name: "EleutherAI/gpt-j-6B"
 gradient_checkpointing: true
-save_name: "nomic-ai/gpt4all-mosaic"
+save_name: "nomic-ai/gpt4all-warmup-lr"

 # dataset
 streaming: false
 num_proc: 64
 dataset_path: "nomic-ai/turbo-500k-multi"
 max_length: 1024
-batch_size: 8
+batch_size: 32

 # train dynamics
 lr: 2.0e-5
--- a/eval_figures.py
+++ b/eval_figures.py
@ -6,18 +6,20 @@ from matplotlib import pyplot as plt
 plt.figure()
 for fpath in glob.glob('./eval_data/*.pkl'):
    parts = fpath.split('__')
-    model_name = parts[1].replace('model-', '').replace('.pkl', '')
-    lora_name = parts[2].replace('lora-', '').replace('.pkl', '')
+    model_name = "-".join(fpath.replace(".pkl", "").split("_")[2:])
    with open(fpath, 'rb') as f:
        data = pickle.load(f)
        perplexities = data['perplexities']
        perplexities = np.nan_to_num(perplexities, 100)
        perplexities = np.clip(perplexities, 0, 100)
-        if 'nomic' in fpath:
-            label = 'GPT4all-lora'
+        if 'alpaca' not in fpath:
+            identifier = model_name = "-".join(fpath.replace(".pkl", "").split("eval__model-")[1:]) 
+            label = 'GPT4all-'
+            label += identifier
+            
        else:
            label = 'alpaca-lora'
-        plt.hist(perplexities, label=label, alpha=.5)
+        plt.hist(perplexities, label=label, alpha=.5, bins=50)

 plt.xlabel('Perplexity')
 plt.ylabel('Frequency')
--- a/eval_self_instruct.py
+++ b/eval_self_instruct.py
@ -49,28 +49,6 @@ def eval_example(model, tokenizer, example, config):
    input = tokenizer(prompt, return_tensors="pt")
    input = {k: v.to(model.device) for k, v in input.items()}

-    continuations = []
-    tokenized_continuations = []
-    trajectories = []
-    for i in range(1):
-        with torch.no_grad():
-            outputs = model.generate(input_ids=input['input_ids'],
-                                     max_new_tokens=config["max_new_tokens"],
-                                     min_new_tokens=5,
-                                     temperature=config["temperature"],
-                                     repetition_penalty=1.0,
-                                     do_sample=True)
-            decoded = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
-
-            y = model(input_ids=outputs)
-        trajectory = y.hidden_states[0].detach().cpu().numpy()[0]
-        trajectory = trajectory / np.linalg.norm(trajectory, axis=1, keepdims=True)
-        trajectory = np.cumsum(trajectory, axis=0) / np.arange(1, trajectory.shape[0]+1).reshape(-1, 1)
-
-        trajectories.append(trajectory)
-        continuations.append(decoded)
-        tokenized_continuations.append(tokenizer.tokenize(decoded))
-
    #compute the ground truth perplexity
    gt_input = tokenizer(gt, return_tensors="pt")
    gt_input = {k: v.to(model.device) for k, v in gt_input.items()}
@ -101,30 +79,23 @@ def eval_example(model, tokenizer, example, config):

    print(prompt)
    print(80*'-')
-    for continuation in continuations:
-        print(continuation)
-        print(80*'-')
+   

-    return ppl, trajectories, continuations, tokenized_continuations
+    return ppl

 def do_eval(config):
    eval_data = read_jsonl_file('eval_data/user_oriented_instructions.jsonl')
    model, tokenizer = setup_model(config)
-    all_trajectories = []
    all_perplexities = []
-    all_continuations = []
-    all_tokenized_continuations = []
    for example in tqdm(eval_data):
-        gt_perplexity, trajectories, continuations, tokenized_continuations = eval_example(model, tokenizer, example, config)
-        all_trajectories.append(trajectories)
+        gt_perplexity = eval_example(model, tokenizer, example, config)
        all_perplexities.append(gt_perplexity)
-        all_continuations.append(continuations)

-    with open('eval_data/eval__model-{}__lora-{}.pkl'.format(config['model_name'].replace('/', '_'), config['lora_path'].replace('/', '_')), 'wb') as f:
-        r = {'trajectories': all_trajectories,
-             'perplexities': all_perplexities,
-             'continuations': all_continuations,
-             'tokenized_continuations': all_tokenized_continuations}
+        
+    name = f"eval_data/eval__model-{config['model_name'].replace('/', '_')}{'__lora-' + config['lora_path'].replace('/', '_') if config['lora'] else ''}.pkl"
+
+    with open(name, 'wb') as f:
+        r = {'perplexities': all_perplexities}
        pickle.dump(r, f)


--- a/figs/perplexity_hist.png
+++ b/figs/perplexity_hist.png
--- a/requirements.txt
+++ b/requirements.txt
@ -11,4 +11,5 @@ deepspeed
 sentencepiece
 jsonlines
 nomic
-scikit-learn
+scikit-learn
+matplotlib