Merge branch 'gptj' of github.com:nomic-ai/gpt4all into gptj

This commit is contained in:
Zach Nussbaum 2023-04-10 02:15:47 +00:00
commit bbbf007ed9
14 changed files with 50 additions and 116 deletions

17
GPTJ.md Normal file
View File

@ -0,0 +1,17 @@
# Inference on Training Data
## Run Inference
```bash
torchrun --master_port=29085 --nproc-per-node 8 inference.py --config=configs/inference/gptj.yaml
```
## Visualizations
```bash
python build_map.py
```
This will build two maps in `Atlas`: one using the internal clustering algorithm provided by Nomic, and one using the embeddings generated by the finetuned model.

View File

@ -1,15 +1,5 @@
# model/tokenizer # model/tokenizer
model_name: # update with llama 7b model_name: # update with llama model name
tokenizer_name: # update with llama 7b tokenizer_name: # update with llama model name
lora: true lora: true
lora_path: "nomic-ai/gpt4all-lora" lora_path: "nomic-ai/gpt4all-lora"
max_new_tokens: 512
temperature: 0.001
prompt: |
#this code prints a string reversed
my_string = "hello how are you"
print(len(my_string))
My code above does not work. Can you help me?

View File

@ -3,15 +3,3 @@ model_name: # update with llama model name
tokenizer_name: # update with llama model name tokenizer_name: # update with llama model name
lora: true lora: true
lora_path: "tloen/alpaca-lora-7b" lora_path: "tloen/alpaca-lora-7b"
max_new_tokens: 512
temperature: 0.001
prompt: |
#this code prints a string reversed
my_string = "hello how are you"
print(len(my_string))
My code above does not work. Can you help me?

View File

@ -1,14 +0,0 @@
# model/tokenizer
model_name: # update
tokenizer_name: # update
lora_path: "no-lora"
max_new_tokens: 512
temperature: 0.001
prompt: |
#this code prints a string reversed
my_string = "hello how are you"
print(len(my_string))
My code above does not work. Can you help me?

View File

@ -0,0 +1,4 @@
# model/tokenizer
model_name: "nomic-ai/gpt4all-warmup-lr-epoch_1"
tokenizer_name: "EleutherAI/gpt-j-6b"
lora: false

View File

@ -0,0 +1,5 @@
# model/tokenizer
model_name: "EleutherAI/gpt-j-6b"
tokenizer_name: "EleutherAI/gpt-j-6B"
lora: true
lora_path: "nomic-ai/gpt4all-gptj-lora-epoch_1"

View File

@ -1,15 +0,0 @@
# model/tokenizer
model_name: # update
tokenizer_name: # update
lora: true
lora_path: # update
max_new_tokens: 512
temperature: 0.001
prompt: |
#this code prints a string reversed
my_string = "hello how are you"
print(len(my_string))
My code above does not work. Can you help me?

View File

@ -1,15 +0,0 @@
# model/tokenizer
model_name: # update
tokenizer_name: # update
lora: true
lora_path: # update
max_new_tokens: 512
temperature: 0.001
prompt: |
#this code prints a string reversed
my_string = "hello how are you"
print(len(my_string))
My code above does not work. Can you help me?

View File

@ -1,11 +1,11 @@
# model/tokenizer # model/tokenizer
model_name: "nomic-ai/gpt4all-gptj-multinode-deepspeed-finetuned-epoch_0" model_name: "nomic-ai/gpt4all-warmup-lr-epoch_1"
tokenizer_name: "EleutherAI/gpt-j-6B" tokenizer_name: "EleutherAI/gpt-j-6B"
# dataset # dataset
streaming: false streaming: false
num_proc: 64 num_proc: 64
dataset_path: "data_multiplus" dataset_path: "nomic-ai/turbo-500k-multi"
max_length: 1024 max_length: 1024
batch_size: 32 batch_size: 32

View File

@ -2,14 +2,14 @@
model_name: "EleutherAI/gpt-j-6B" model_name: "EleutherAI/gpt-j-6B"
tokenizer_name: "EleutherAI/gpt-j-6B" tokenizer_name: "EleutherAI/gpt-j-6B"
gradient_checkpointing: true gradient_checkpointing: true
save_name: "nomic-ai/gpt4all-mosaic" save_name: "nomic-ai/gpt4all-warmup-lr"
# dataset # dataset
streaming: false streaming: false
num_proc: 64 num_proc: 64
dataset_path: "nomic-ai/turbo-500k-multi" dataset_path: "nomic-ai/turbo-500k-multi"
max_length: 1024 max_length: 1024
batch_size: 8 batch_size: 32
# train dynamics # train dynamics
lr: 2.0e-5 lr: 2.0e-5

View File

@ -6,18 +6,20 @@ from matplotlib import pyplot as plt
plt.figure() plt.figure()
for fpath in glob.glob('./eval_data/*.pkl'): for fpath in glob.glob('./eval_data/*.pkl'):
parts = fpath.split('__') parts = fpath.split('__')
model_name = parts[1].replace('model-', '').replace('.pkl', '') model_name = "-".join(fpath.replace(".pkl", "").split("_")[2:])
lora_name = parts[2].replace('lora-', '').replace('.pkl', '')
with open(fpath, 'rb') as f: with open(fpath, 'rb') as f:
data = pickle.load(f) data = pickle.load(f)
perplexities = data['perplexities'] perplexities = data['perplexities']
perplexities = np.nan_to_num(perplexities, 100) perplexities = np.nan_to_num(perplexities, 100)
perplexities = np.clip(perplexities, 0, 100) perplexities = np.clip(perplexities, 0, 100)
if 'nomic' in fpath: if 'alpaca' not in fpath:
label = 'GPT4all-lora' identifier = model_name = "-".join(fpath.replace(".pkl", "").split("eval__model-")[1:])
label = 'GPT4all-'
label += identifier
else: else:
label = 'alpaca-lora' label = 'alpaca-lora'
plt.hist(perplexities, label=label, alpha=.5) plt.hist(perplexities, label=label, alpha=.5, bins=50)
plt.xlabel('Perplexity') plt.xlabel('Perplexity')
plt.ylabel('Frequency') plt.ylabel('Frequency')

View File

@ -49,28 +49,6 @@ def eval_example(model, tokenizer, example, config):
input = tokenizer(prompt, return_tensors="pt") input = tokenizer(prompt, return_tensors="pt")
input = {k: v.to(model.device) for k, v in input.items()} input = {k: v.to(model.device) for k, v in input.items()}
continuations = []
tokenized_continuations = []
trajectories = []
for i in range(1):
with torch.no_grad():
outputs = model.generate(input_ids=input['input_ids'],
max_new_tokens=config["max_new_tokens"],
min_new_tokens=5,
temperature=config["temperature"],
repetition_penalty=1.0,
do_sample=True)
decoded = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
y = model(input_ids=outputs)
trajectory = y.hidden_states[0].detach().cpu().numpy()[0]
trajectory = trajectory / np.linalg.norm(trajectory, axis=1, keepdims=True)
trajectory = np.cumsum(trajectory, axis=0) / np.arange(1, trajectory.shape[0]+1).reshape(-1, 1)
trajectories.append(trajectory)
continuations.append(decoded)
tokenized_continuations.append(tokenizer.tokenize(decoded))
#compute the ground truth perplexity #compute the ground truth perplexity
gt_input = tokenizer(gt, return_tensors="pt") gt_input = tokenizer(gt, return_tensors="pt")
gt_input = {k: v.to(model.device) for k, v in gt_input.items()} gt_input = {k: v.to(model.device) for k, v in gt_input.items()}
@ -101,30 +79,23 @@ def eval_example(model, tokenizer, example, config):
print(prompt) print(prompt)
print(80*'-') print(80*'-')
for continuation in continuations:
print(continuation)
print(80*'-')
return ppl, trajectories, continuations, tokenized_continuations
return ppl
def do_eval(config): def do_eval(config):
eval_data = read_jsonl_file('eval_data/user_oriented_instructions.jsonl') eval_data = read_jsonl_file('eval_data/user_oriented_instructions.jsonl')
model, tokenizer = setup_model(config) model, tokenizer = setup_model(config)
all_trajectories = []
all_perplexities = [] all_perplexities = []
all_continuations = []
all_tokenized_continuations = []
for example in tqdm(eval_data): for example in tqdm(eval_data):
gt_perplexity, trajectories, continuations, tokenized_continuations = eval_example(model, tokenizer, example, config) gt_perplexity = eval_example(model, tokenizer, example, config)
all_trajectories.append(trajectories)
all_perplexities.append(gt_perplexity) all_perplexities.append(gt_perplexity)
all_continuations.append(continuations)
with open('eval_data/eval__model-{}__lora-{}.pkl'.format(config['model_name'].replace('/', '_'), config['lora_path'].replace('/', '_')), 'wb') as f:
r = {'trajectories': all_trajectories, name = f"eval_data/eval__model-{config['model_name'].replace('/', '_')}{'__lora-' + config['lora_path'].replace('/', '_') if config['lora'] else ''}.pkl"
'perplexities': all_perplexities,
'continuations': all_continuations, with open(name, 'wb') as f:
'tokenized_continuations': all_tokenized_continuations} r = {'perplexities': all_perplexities}
pickle.dump(r, f) pickle.dump(r, f)

Binary file not shown.

Before

Width:  |  Height:  |  Size: 15 KiB

After

Width:  |  Height:  |  Size: 26 KiB

View File

@ -12,3 +12,4 @@ sentencepiece
jsonlines jsonlines
nomic nomic
scikit-learn scikit-learn
matplotlib