import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import torch
import torch.nn as nn
import bitsandbytes as bnb
from datasets import load_dataset
import transformers
from transformers import AutoTokenizer, AutoConfig, LLaMAForCausalLM, LLaMATokenizer
from peft import prepare_model_for_int8_training, LoraConfig, get_peft_model
# optimized for RTX 4090. for larger GPUs, increase some of these?
MICRO_BATCH_SIZE = 4 # this could actually be 5 but i like powers of 2
BATCH_SIZE = 128
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
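# 128 // 4 = 32 micro-batches are accumulated per optimizer step, so the
# effective batch size stays at BATCH_SIZE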
EPOCHS = 3 # we don't need 3 tbh
LEARNING_RATE = 3e-4 # the Karpathy constant
CUTOFF_LEN = 256 # 256 accounts for about 96% of the data
LORA_R = 8
LORA_ALPHA = 16
LORA_DROPOUT = 0.05
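# load the frozen base model in 8 bit so the 7B weights fit comfortably on a 24 GB card;
# load_in_8bit goes through bitsandbytes, and device_map="auto" lets accelerate place the layers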
model = LLaMAForCausalLM.from_pretrained(
    "decapoda-research/llama-7b-hf",
    load_in_8bit=True,
    device_map="auto",
)
tokenizer = LLaMATokenizer.from_pretrained(
    "decapoda-research/llama-7b-hf", add_eos_token=True
)
model = prepare_model_for_int8_training(model)

config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)
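# optional sanity check: PEFT models expose print_trainable_parameters(), which
# reports how few parameters the LoRA adapters leave trainable
# model.print_trainable_parameters()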
tokenizer.pad_token_id = 0  # unk. we want this to be different from the eos token

data = load_dataset("json", data_files="alpaca_data.json")
def generate_prompt(data_point):
    # sorry about the formatting disaster gotta move fast
    if data_point["input"]:
        return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{data_point["instruction"]}

### Input:
{data_point["input"]}

### Response:
{data_point["output"]}"""
    else:
        return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{data_point["instruction"]}

### Response:
{data_point["output"]}"""
def tokenize(prompt):
    # there's probably a way to do this with the tokenizer settings
    # but again, gotta move fast
    result = tokenizer(
        prompt,
        truncation=True,
        # tokenize/pad to CUTOFF_LEN + 1 and drop the last token below, so every
        # example comes out exactly CUTOFF_LEN tokens long
        max_length=CUTOFF_LEN + 1,
        padding="max_length",
    )
    return {
        "input_ids": result["input_ids"][:-1],
        "attention_mask": result["attention_mask"][:-1],
    }
data = data.shuffle().map(lambda x: tokenize(generate_prompt(x)))
trainer = transformers.Trainer(
    model=model,
    train_dataset=data["train"],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=MICRO_BATCH_SIZE,
        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
        warmup_steps=100,
        num_train_epochs=EPOCHS,
        learning_rate=LEARNING_RATE,
        fp16=True,
        logging_steps=20,
        output_dir="lora-alpaca",
        save_total_limit=3,
    ),
    # mlm=False makes the collator copy input_ids into labels (padding masked to -100),
    # so the loss is computed over the whole prompt + response
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
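# the KV cache is useless during training and triggers warnings when gradient
# checkpointing is on (which prepare_model_for_int8_training typically enables);
# re-enable use_cache for generation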
model.config.use_cache = False
trainer.train(resume_from_checkpoint=False)
model.save_pretrained("lora-alpaca")
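
# a minimal sketch (not part of this training run) of how the saved adapter could be
# reloaded for inference later, assuming the same base checkpoint is available:
#
# from peft import PeftModel
#
# base = LLaMAForCausalLM.from_pretrained(
#     "decapoda-research/llama-7b-hf", load_in_8bit=True, device_map="auto"
# )
# model = PeftModel.from_pretrained(base, "lora-alpaca")
# model.config.use_cache = True  # re-enable the cache for generation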