# alpaca-lora/finetune.py

import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import torch
import torch.nn as nn
import bitsandbytes as bnb
from datasets import load_dataset
import transformers
from transformers import AutoTokenizer, AutoConfig, LLaMAForCausalLM, LLaMATokenizer
from peft import prepare_model_for_int8_training, LoraConfig, get_peft_model
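
# Load the base LLaMA-7B checkpoint with its weights quantized to 8-bit via
# bitsandbytes; device_map="auto" lets Accelerate place layers on the available
# GPU(s)/CPU automatically.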
model = LLaMAForCausalLM.from_pretrained(
    "decapoda-research/llama-7b-hf",
    load_in_8bit=True,
    device_map="auto",
)
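
# Matching tokenizer; add_eos_token=True appends the </s> token to every
# encoded example so the model learns where a response should stop.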
tokenizer = LLaMATokenizer.from_pretrained(
    "decapoda-research/llama-7b-hf", add_eos_token=True
)
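
# peft helper for int8 fine-tuning: it freezes the quantized base weights and
# (roughly speaking) casts layer norms and the output head to fp32 for stability.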
model = prepare_model_for_int8_training(model)
config = LoraConfig(
    r=4,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)
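# Only the rank-4 LoRA adapters injected into the q_proj/v_proj attention
# matrices are trainable from here on; the 7B base weights stay frozen.
# Optional sanity check:
# model.print_trainable_parameters()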
tokenizer.pad_token_id = 0 # unk. we want this to be different from the eos token
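
# alpaca_data.json is expected to hold Alpaca-style records with "instruction",
# "input" and "output" fields, which generate_prompt() below consumes.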
data = load_dataset("json", data_files="alpaca_data.json")


def generate_prompt(data_point):
    # sorry about the formatting disaster gotta move fast
    if data_point["input"]:
        return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{data_point["instruction"]}

### Input:
{data_point["input"]}

### Response:
{data_point["output"]}"""
    else:
        return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{data_point["instruction"]}

### Response:
{data_point["output"]}"""
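

# For illustration, a hypothetical no-input record such as
#   {"instruction": "Name three primary colors.", "input": "", "output": "Red, yellow, blue."}
# takes the else-branch above and becomes the task preamble, "### Instruction:"
# with the instruction text, then "### Response:" followed by the target output.
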
# optimized for RTX 4090. for larger GPUs, increase some of these?
MICRO_BATCH_SIZE = 4 # this could actually be 5 but i like powers of 2
BATCH_SIZE = 128
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
EPOCHS = 3 # we don't need 3 tbh
LEARNING_RATE = 2e-5 # from the original paper
CUTOFF_LEN = 256 # 256 accounts for about 96% of the data
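
# Effective batch size is MICRO_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS
# = 4 * 32 = 128 examples per optimizer step.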
data = data.shuffle().map(
    lambda data_point: tokenizer(
        generate_prompt(data_point),
        truncation=True,
        max_length=CUTOFF_LEN,
        padding="max_length",
    )
)
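
# Prompts are tokenized once up front, truncated/padded to CUTOFF_LEN, so every
# training example is a fixed-length 256-token sequence.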
trainer = transformers.Trainer(
    model=model,
    train_dataset=data["train"],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=MICRO_BATCH_SIZE,
        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
        warmup_steps=100,
        num_train_epochs=EPOCHS,
        learning_rate=LEARNING_RATE,
        fp16=True,
        logging_steps=1,
        output_dir="lora-alpaca",
        save_total_limit=3,
    ),
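    # mlm=False gives plain causal-LM batches: the collator copies input_ids into
    # labels (masking pad positions) and the model applies the shift internally.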
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
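
# The generation-time key/value cache isn't needed while training and can
# trigger warnings (e.g. if gradient checkpointing is enabled), so turn it off.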
model.config.use_cache = False
trainer.train(resume_from_checkpoint=False)
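
# For a PEFT model, save_pretrained() writes only the small LoRA adapter
# weights and config to lora-alpaca/, not the full 7B base model.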
model.save_pretrained("lora-alpaca")