initial commit

Eric Wang 2023-03-13 14:34:26 -07:00
commit 26f64780ad
6 changed files with 260780 additions and 0 deletions

.gitignore vendored Normal file (6 lines)

@@ -0,0 +1,6 @@
out/
7B/
13B/
__pycache__/
checkpoint**
minimal-llama**

alpaca_data.json Normal file (260012 lines)

File diff suppressed because it is too large.
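
The data file itself is suppressed above, but finetune.py below loads it with load_dataset("json", data_files="alpaca_data.json") and builds prompts from each record's "instruction" and "input" fields. A minimal sketch for inspecting the file locally; any fields beyond the two referenced in generate_prompt are assumptions about the schema:

```
# Hedged sketch: peek at alpaca_data.json before fine-tuning.
# Only "instruction" and "input" are referenced by generate_prompt()
# in finetune.py; no other keys in the records are assumed here.
import json

with open("alpaca_data.json") as f:
    records = json.load(f)

print(len(records))                      # number of training examples
print(json.dumps(records[0], indent=2))  # inspect the available fields
```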

conversion.py Normal file (276 lines)

@@ -0,0 +1,276 @@
# Copyright 2022 EleutherAI and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import json
import os
import shutil
import torch
"""
Sample usage:
```
python src/transformers/models/llama/convert_llama_weights_to_hf.py \
--input_dir /path/to/downloaded/llama/weights --model_size 7B --output_dir /output/path
```
Thereafter, models can be loaded via:
```
tokenizer = transformers.LLaMATokenizer.from_pretrained("/output/path/tokenizer/")
model = transformers.LLaMAForCausalLM.from_pretrained("/output/path/llama-7b/")
```
"""
INTERMEDIATE_SIZE_MAP = {
"7B": 11008,
"13B": 13824,
"30B": 17920,
"65B": 22016,
}
NUM_SHARDS = {
"7B": 1,
"13B": 2,
"30B": 4,
"65B": 8,
}
def read_json(path):
with open(path, "r") as f:
return json.load(f)
def write_json(text, path):
with open(path, "w") as f:
json.dump(text, f)
def write_model(model_path, input_base_path, model_size):
assert model_size in INTERMEDIATE_SIZE_MAP
os.makedirs(model_path, exist_ok=True)
params = read_json(os.path.join(input_base_path, "params.json"))
num_shards = NUM_SHARDS[model_size]
n_layers = params["n_layers"]
n_heads = params["n_heads"]
n_heads_per_shard = n_heads // num_shards
dim = params["dim"]
dims_per_head = dim // n_heads
base = 10000.0
inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))
# permute for sliced rotary
def permute(w):
return w.view(n_heads, dim // n_heads // 2, 2, dim).transpose(1, 2).reshape(dim, dim)
# Load weights
if model_size == "7B":
# Not sharded
# (The sharded implementation would also work, but this is simpler.)
loaded = torch.load(os.path.join(input_base_path, "consolidated.00.pth"), map_location="cpu")
else:
# Sharded
loaded = [
torch.load(os.path.join(input_base_path, f"consolidated.{i:02d}.pth"), map_location="cpu")
for i in range(num_shards)
]
param_count = 0
index_dict = {"weight_map": {}}
for layer_i in range(n_layers):
filename = "pytorch_model-{:05d}-of-{:05d}.bin".format(
layer_i + 1,
n_layers + 1,
)
if model_size == "7B":
# Unsharded
state_dict = {
f"model.layers.{layer_i}.self_attn.q_proj.weight": permute(
loaded[f"layers.{layer_i}.attention.wq.weight"]
),
f"model.layers.{layer_i}.self_attn.k_proj.weight": permute(
loaded[f"layers.{layer_i}.attention.wk.weight"]
),
f"model.layers.{layer_i}.self_attn.v_proj.weight": loaded[f"layers.{layer_i}.attention.wv.weight"],
f"model.layers.{layer_i}.self_attn.o_proj.weight": loaded[f"layers.{layer_i}.attention.wo.weight"],
f"model.layers.{layer_i}.mlp.gate_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w1.weight"],
f"model.layers.{layer_i}.mlp.down_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w2.weight"],
f"model.layers.{layer_i}.mlp.up_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w3.weight"],
f"model.layers.{layer_i}.input_layernorm.weight": loaded[f"layers.{layer_i}.attention_norm.weight"],
f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[f"layers.{layer_i}.ffn_norm.weight"],
}
else:
# Sharded
state_dict = {
f"model.layers.{layer_i}.input_layernorm.weight": loaded[0][f"layers.{layer_i}.attention_norm.weight"],
f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[0][
f"layers.{layer_i}.ffn_norm.weight"
],
}
state_dict[f"model.layers.{layer_i}.self_attn.q_proj.weight"] = permute(
torch.cat(
[
loaded[i][f"layers.{layer_i}.attention.wq.weight"].view(n_heads_per_shard, dims_per_head, dim)
for i in range(num_shards)
],
dim=0,
).reshape(dim, dim)
)
state_dict[f"model.layers.{layer_i}.self_attn.k_proj.weight"] = permute(
torch.cat(
[
loaded[i][f"layers.{layer_i}.attention.wk.weight"].view(n_heads_per_shard, dims_per_head, dim)
for i in range(num_shards)
],
dim=0,
).reshape(dim, dim)
)
state_dict[f"model.layers.{layer_i}.self_attn.v_proj.weight"] = torch.cat(
[
loaded[i][f"layers.{layer_i}.attention.wv.weight"].view(n_heads_per_shard, dims_per_head, dim)
for i in range(num_shards)
],
dim=0,
).reshape(dim, dim)
state_dict[f"model.layers.{layer_i}.self_attn.o_proj.weight"] = torch.cat(
[loaded[i][f"layers.{layer_i}.attention.wo.weight"] for i in range(num_shards)], dim=1
)
state_dict[f"model.layers.{layer_i}.mlp.gate_proj.weight"] = torch.cat(
[loaded[i][f"layers.{layer_i}.feed_forward.w1.weight"] for i in range(num_shards)], dim=0
)
state_dict[f"model.layers.{layer_i}.mlp.down_proj.weight"] = torch.cat(
[loaded[i][f"layers.{layer_i}.feed_forward.w2.weight"] for i in range(num_shards)], dim=1
)
state_dict[f"model.layers.{layer_i}.mlp.up_proj.weight"] = torch.cat(
[loaded[i][f"layers.{layer_i}.feed_forward.w3.weight"] for i in range(num_shards)], dim=0
)
state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq
for k, v in state_dict.items():
index_dict["weight_map"][k] = filename
param_count += v.numel()
torch.save(state_dict, os.path.join(model_path, filename))
filename = "pytorch_model-{:05d}-of-{:05d}.bin".format(
n_layers + 1,
n_layers + 1,
)
if model_size == "7B":
# Unsharded
state_dict = {
"model.embed_tokens.weight": loaded["tok_embeddings.weight"],
"model.norm.weight": loaded["norm.weight"],
"lm_head.weight": loaded["output.weight"],
}
else:
state_dict = {
"model.norm.weight": loaded[0]["norm.weight"],
"model.embed_tokens.weight": torch.cat(
[loaded[i]["tok_embeddings.weight"] for i in range(num_shards)], dim=1
),
"lm_head.weight": torch.cat([loaded[i]["output.weight"] for i in range(num_shards)], dim=0),
}
for k, v in state_dict.items():
index_dict["weight_map"][k] = filename
param_count += v.numel()
torch.save(state_dict, os.path.join(model_path, filename))
# Write configs
index_dict["metadata"] = {"total_size": param_count * 2}
write_json(index_dict, os.path.join(model_path, "pytorch_model.bin.index.json"))
config_out = {
"architectures": ["LLaMAForCausalLM"],
"bos_token_id": 0,
"eos_token_id": 1,
"hidden_act": "silu",
"hidden_size": params["dim"],
"intermediate_size": INTERMEDIATE_SIZE_MAP[model_size],
"initializer_range": 0.02,
"max_sequence_length": 2048,
"model_type": "llama",
"num_attention_heads": params["n_heads"],
"num_hidden_layers": params["n_layers"],
"pad_token_id": -1,
"rms_norm_eps": params["norm_eps"],
"torch_dtype": "float16",
"transformers_version": "4.27.0.dev0",
"use_cache": True,
"vocab_size": 32000,
}
write_json(
config_out,
os.path.join(model_path, "config.json"),
)
generation_config = {
"_from_model_config": True,
"bos_token_id": 0,
"eos_token_id": 1,
"pad_token_id": 0,
"transformers_version": "4.27.0.dev0",
}
write_json(
generation_config,
os.path.join(model_path, "generation_config.json"),
)
def write_tokenizer(tokenizer_path, input_tokenizer_path):
os.makedirs(tokenizer_path, exist_ok=True)
write_json({}, os.path.join(tokenizer_path, "special_tokens_map.json"))
write_json(
{
"bos_token": "",
"eos_token": "",
"model_max_length": int(1e30),
"tokenizer_class": "LLaMATokenizer",
"unk_token": "",
},
os.path.join(tokenizer_path, "tokenizer_config.json"),
)
shutil.copyfile(input_tokenizer_path, os.path.join(tokenizer_path, "tokenizer.model"))
def main():
parser = argparse.ArgumentParser()
parser.add_argument(
"--input_dir",
help="Location of LLaMA weights, which contains tokenizer.model and model folders",
)
parser.add_argument(
"--model_size",
choices=["7B", "13B", "30B", "65B"],
)
parser.add_argument(
"--output_dir",
help="Location to write HF model and tokenizer",
)
args = parser.parse_args()
write_model(
model_path=os.path.join(args.output_dir, "llama-{}".format(args.model_size).lower()),
input_base_path=os.path.join(args.input_dir, args.model_size),
model_size=args.model_size,
)
write_tokenizer(
tokenizer_path=os.path.join(args.output_dir, "tokenizer"),
input_tokenizer_path=os.path.join(args.input_dir, "tokenizer.model"),
)
if __name__ == "__main__":
main()
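
The sample usage in the docstring above still points at the upstream transformers path; in this repo the same conversion can be driven through conversion.py's own write_model and write_tokenizer so the output lands where finetune.py expects it (./7B/llama-7b and ./7B/tokenizer). A minimal sketch, assuming the raw LLaMA weights were downloaded to ./llama-raw (that path is an assumption):

```
# Hedged sketch: run the conversion defined in conversion.py directly,
# writing the HF-format model and tokenizer where finetune.py looks for them.
# "./llama-raw" is an assumed location; it must contain tokenizer.model and
# a 7B/ folder holding params.json and consolidated.00.pth.
import os
from conversion import write_model, write_tokenizer

INPUT_DIR = "./llama-raw"   # assumption: raw LLaMA weights downloaded here
OUTPUT_DIR = "./7B"         # matches the paths used in finetune.py

write_model(
    model_path=os.path.join(OUTPUT_DIR, "llama-7b"),
    input_base_path=os.path.join(INPUT_DIR, "7B"),
    model_size="7B",
)
write_tokenizer(
    tokenizer_path=os.path.join(OUTPUT_DIR, "tokenizer"),
    input_tokenizer_path=os.path.join(INPUT_DIR, "tokenizer.model"),
)
```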

finetune.py Normal file (116 lines)

@@ -0,0 +1,116 @@
import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import torch
import torch.nn as nn
import bitsandbytes as bnb
from datasets import load_dataset
import transformers
from transformers import AutoTokenizer, AutoConfig, LLaMAForCausalLM, LLaMATokenizer
from peft import prepare_model_for_int8_training, LoraConfig, get_peft_model
model = LLaMAForCausalLM.from_pretrained(
"./7B/llama-7b",
load_in_8bit=True,
max_sequence_length=128, # data length
device_map="auto",
)
tokenizer = LLaMATokenizer.from_pretrained("./7B/tokenizer")
def print_trainable_parameters(model):
"""
Prints the number of trainable parameters in the model.
"""
trainable_params = 0
all_param = 0
for _, param in model.named_parameters():
all_param += param.numel()
if param.requires_grad:
trainable_params += param.numel()
print(
f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
)
print_trainable_parameters(model)
model = prepare_model_for_int8_training(model)
config = LoraConfig(
r=4,
lora_alpha=16,
target_modules=["q_proj", "v_proj"],
lora_dropout=0.05,
bias="none",
task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)
print_trainable_parameters(model)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
data = load_dataset("json", data_files="alpaca_data.json")
def generate_prompt(data_point):
# sorry about the formatting disaster gotta move fast
if data_point["instruction"]:
return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{data_point["instruction"]}
### Input:
{data_point["input"]}
### Response:"""
else:
return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
{data_point["instruction"]}
### Response:"""
data = data.map(
lambda data_point: tokenizer(
generate_prompt(data_point),
truncation=True,
max_length=128,
padding="max_length",
)
)
DATA_SIZE = 51368
MICRO_BATCH_SIZE = 12
BATCH_SIZE = 36
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
EPOCHS = 3
LEARNING_RATE = 2e-5
trainer = transformers.Trainer(
model=model,
train_dataset=data["train"],
args=transformers.TrainingArguments(
per_device_train_batch_size=MICRO_BATCH_SIZE,
gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
warmup_steps=100,
num_train_epochs=EPOCHS,
learning_rate=LEARNING_RATE,
fp16=True,
logging_steps=1,
output_dir="lora-alpaca",
save_total_limit=3,
),
data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False
trainer.train(resume_from_checkpoint=False)
model.save_pretrained("lora-alpaca")
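
Once training finishes, the LoRA adapter saved to lora-alpaca can be reattached to the 8-bit base model for generation. A minimal sketch using peft's PeftModel; the prompt text and generation settings are assumptions, and use_cache is switched back on since finetune.py disables it only for training:

```
# Hedged sketch: load the LoRA adapter saved by finetune.py and generate.
# Class names match the transformers dev version used above (LLaMA*).
import torch
from peft import PeftModel
from transformers import LLaMAForCausalLM, LLaMATokenizer

tokenizer = LLaMATokenizer.from_pretrained("./7B/tokenizer")
model = LLaMAForCausalLM.from_pretrained(
    "./7B/llama-7b",
    load_in_8bit=True,
    device_map="auto",
)
model = PeftModel.from_pretrained(model, "lora-alpaca")
model.config.use_cache = True  # re-enable the KV cache for inference

# Assumed prompt, mirroring the instruction-only template in generate_prompt().
prompt = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n"
    "### Instruction:\nTell me about alpacas.\n### Response:"
)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    output = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```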

iteration.ipynb Normal file (215 lines)

@@ -0,0 +1,215 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"===================================BUG REPORT===================================\n",
"Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues\n",
"================================================================================\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/eric/miniconda3/envs/dl3/lib/python3.10/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"Overriding torch_dtype=None with `torch_dtype=torch.float16` due to requirements of `bitsandbytes` to enable model loading in mixed int8. Either pass torch_dtype=torch.float16 or don't pass this argument at all to remove this warning.\n",
"Loading checkpoint shards: 100%|██████████| 33/33 [00:08<00:00, 4.08it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"trainable params: 262410240 || all params: 6738415616 || trainable%: 3.894242429584207\n",
"trainable params: 8388608 || all params: 6746804224 || trainable%: 0.12433454005023165\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"import os\n",
"\n",
"# os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n",
"import torch\n",
"import torch.nn as nn\n",
"import bitsandbytes as bnb\n",
"from dataset import load_dataset\n",
"import transformers\n",
"from transformers import AutoTokenizer, AutoConfig, LLaMAForCausalLM, LLaMATokenizer\n",
"from peft import prepare_model_for_int8_training, LoraConfig, get_peft_model\n",
"\n",
"model = LLaMAForCausalLM.from_pretrained(\n",
" \"./7B/llama-7b\",\n",
" load_in_8bit=True,\n",
" device_map=\"auto\",\n",
")\n",
"\n",
"tokenizer = LLaMATokenizer.from_pretrained(\"./7B/tokenizer\")\n",
"\n",
"\n",
"def print_trainable_parameters(model):\n",
" \"\"\"\n",
" Prints the number of trainable parameters in the model.\n",
" \"\"\"\n",
" trainable_params = 0\n",
" all_param = 0\n",
" for _, param in model.named_parameters():\n",
" all_param += param.numel()\n",
" if param.requires_grad:\n",
" trainable_params += param.numel()\n",
" print(\n",
" f\"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}\"\n",
" )\n",
"\n",
"\n",
"print_trainable_parameters(model)\n",
"model = prepare_model_for_int8_training(model)\n",
"\n",
"config = LoraConfig(\n",
" r=16,\n",
" lora_alpha=32,\n",
" target_modules=[\"q_proj\", \"v_proj\"],\n",
" lora_dropout=0.05,\n",
" bias=\"none\",\n",
" task_type=\"CAUSAL_LM\",\n",
")\n",
"model = get_peft_model(model, config)\n",
"\n",
"print_trainable_parameters(model)\n",
"\n",
"tokenizer.pad_token = tokenizer.eos_token\n",
"tokenizer.pad_token_id = tokenizer.eos_token_id"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"\n",
"\u001b[A\u001b[A"
]
},
{
"ename": "OutOfMemoryError",
"evalue": "CUDA out of memory. Tried to allocate 22.00 MiB (GPU 0; 23.65 GiB total capacity; 19.92 GiB already allocated; 41.31 MiB free; 20.40 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mOutOfMemoryError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[6], line 26\u001b[0m\n\u001b[1;32m 9\u001b[0m trainer \u001b[39m=\u001b[39m transformers\u001b[39m.\u001b[39mTrainer(\n\u001b[1;32m 10\u001b[0m model\u001b[39m=\u001b[39mmodel,\n\u001b[1;32m 11\u001b[0m train_dataset\u001b[39m=\u001b[39mdata[\u001b[39m\"\u001b[39m\u001b[39mtrain\u001b[39m\u001b[39m\"\u001b[39m],\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 23\u001b[0m data_collator\u001b[39m=\u001b[39mtransformers\u001b[39m.\u001b[39mDataCollatorForLanguageModeling(tokenizer, mlm\u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m),\n\u001b[1;32m 24\u001b[0m )\n\u001b[1;32m 25\u001b[0m model\u001b[39m.\u001b[39mconfig\u001b[39m.\u001b[39muse_cache \u001b[39m=\u001b[39m \u001b[39mFalse\u001b[39;00m \u001b[39m# silence the warnings. Please re-enable for inference!\u001b[39;00m\n\u001b[0;32m---> 26\u001b[0m trainer\u001b[39m.\u001b[39;49mtrain()\n\u001b[1;32m 28\u001b[0m model\u001b[39m.\u001b[39msave_pretrained(\u001b[39m\"\u001b[39m\u001b[39moutputs\u001b[39m\u001b[39m\"\u001b[39m)\n",
"File \u001b[0;32m~/miniconda3/envs/dl3/lib/python3.10/site-packages/transformers/trainer.py:1628\u001b[0m, in \u001b[0;36mTrainer.train\u001b[0;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001b[0m\n\u001b[1;32m 1623\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mmodel_wrapped \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mmodel\n\u001b[1;32m 1625\u001b[0m inner_training_loop \u001b[39m=\u001b[39m find_executable_batch_size(\n\u001b[1;32m 1626\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_inner_training_loop, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_train_batch_size, args\u001b[39m.\u001b[39mauto_find_batch_size\n\u001b[1;32m 1627\u001b[0m )\n\u001b[0;32m-> 1628\u001b[0m \u001b[39mreturn\u001b[39;00m inner_training_loop(\n\u001b[1;32m 1629\u001b[0m args\u001b[39m=\u001b[39;49margs,\n\u001b[1;32m 1630\u001b[0m resume_from_checkpoint\u001b[39m=\u001b[39;49mresume_from_checkpoint,\n\u001b[1;32m 1631\u001b[0m trial\u001b[39m=\u001b[39;49mtrial,\n\u001b[1;32m 1632\u001b[0m ignore_keys_for_eval\u001b[39m=\u001b[39;49mignore_keys_for_eval,\n\u001b[1;32m 1633\u001b[0m )\n",
"File \u001b[0;32m~/miniconda3/envs/dl3/lib/python3.10/site-packages/transformers/trainer.py:1895\u001b[0m, in \u001b[0;36mTrainer._inner_training_loop\u001b[0;34m(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)\u001b[0m\n\u001b[1;32m 1893\u001b[0m tr_loss_step \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mtraining_step(model, inputs)\n\u001b[1;32m 1894\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m-> 1895\u001b[0m tr_loss_step \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mtraining_step(model, inputs)\n\u001b[1;32m 1897\u001b[0m \u001b[39mif\u001b[39;00m (\n\u001b[1;32m 1898\u001b[0m args\u001b[39m.\u001b[39mlogging_nan_inf_filter\n\u001b[1;32m 1899\u001b[0m \u001b[39mand\u001b[39;00m \u001b[39mnot\u001b[39;00m is_torch_tpu_available()\n\u001b[1;32m 1900\u001b[0m \u001b[39mand\u001b[39;00m (torch\u001b[39m.\u001b[39misnan(tr_loss_step) \u001b[39mor\u001b[39;00m torch\u001b[39m.\u001b[39misinf(tr_loss_step))\n\u001b[1;32m 1901\u001b[0m ):\n\u001b[1;32m 1902\u001b[0m \u001b[39m# if loss is nan or inf simply add the average of previous logged losses\u001b[39;00m\n\u001b[1;32m 1903\u001b[0m tr_loss \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m tr_loss \u001b[39m/\u001b[39m (\u001b[39m1\u001b[39m \u001b[39m+\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mstate\u001b[39m.\u001b[39mglobal_step \u001b[39m-\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_globalstep_last_logged)\n",
"File \u001b[0;32m~/miniconda3/envs/dl3/lib/python3.10/site-packages/transformers/trainer.py:2637\u001b[0m, in \u001b[0;36mTrainer.training_step\u001b[0;34m(self, model, inputs)\u001b[0m\n\u001b[1;32m 2634\u001b[0m \u001b[39mreturn\u001b[39;00m loss_mb\u001b[39m.\u001b[39mreduce_mean()\u001b[39m.\u001b[39mdetach()\u001b[39m.\u001b[39mto(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39margs\u001b[39m.\u001b[39mdevice)\n\u001b[1;32m 2636\u001b[0m \u001b[39mwith\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mcompute_loss_context_manager():\n\u001b[0;32m-> 2637\u001b[0m loss \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mcompute_loss(model, inputs)\n\u001b[1;32m 2639\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39margs\u001b[39m.\u001b[39mn_gpu \u001b[39m>\u001b[39m \u001b[39m1\u001b[39m:\n\u001b[1;32m 2640\u001b[0m loss \u001b[39m=\u001b[39m loss\u001b[39m.\u001b[39mmean() \u001b[39m# mean() to average on multi-gpu parallel training\u001b[39;00m\n",
"File \u001b[0;32m~/miniconda3/envs/dl3/lib/python3.10/site-packages/transformers/trainer.py:2669\u001b[0m, in \u001b[0;36mTrainer.compute_loss\u001b[0;34m(self, model, inputs, return_outputs)\u001b[0m\n\u001b[1;32m 2667\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 2668\u001b[0m labels \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m\n\u001b[0;32m-> 2669\u001b[0m outputs \u001b[39m=\u001b[39m model(\u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49minputs)\n\u001b[1;32m 2670\u001b[0m \u001b[39m# Save past state if it exists\u001b[39;00m\n\u001b[1;32m 2671\u001b[0m \u001b[39m# TODO: this needs to be fixed and made cleaner later.\u001b[39;00m\n\u001b[1;32m 2672\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39margs\u001b[39m.\u001b[39mpast_index \u001b[39m>\u001b[39m\u001b[39m=\u001b[39m \u001b[39m0\u001b[39m:\n",
"File \u001b[0;32m~/miniconda3/envs/dl3/lib/python3.10/site-packages/torch/nn/modules/module.py:1488\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1483\u001b[0m \u001b[39m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1484\u001b[0m \u001b[39m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1485\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m (\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_pre_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1486\u001b[0m \u001b[39mor\u001b[39;00m _global_backward_pre_hooks \u001b[39mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1487\u001b[0m \u001b[39mor\u001b[39;00m _global_forward_hooks \u001b[39mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1488\u001b[0m \u001b[39mreturn\u001b[39;00m forward_call(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 1489\u001b[0m \u001b[39m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m 1490\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[39m=\u001b[39m [], []\n",
"File \u001b[0;32m~/miniconda3/envs/dl3/lib/python3.10/site-packages/peft-0.3.0.dev0-py3.10.egg/peft/peft_model.py:530\u001b[0m, in \u001b[0;36mPeftModelForCausalLM.forward\u001b[0;34m(self, input_ids, attention_mask, inputs_embeds, labels, output_attentions, output_hidden_states, return_dict, **kwargs)\u001b[0m\n\u001b[1;32m 518\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mforward\u001b[39m(\n\u001b[1;32m 519\u001b[0m \u001b[39mself\u001b[39m,\n\u001b[1;32m 520\u001b[0m input_ids\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 527\u001b[0m \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs,\n\u001b[1;32m 528\u001b[0m ):\n\u001b[1;32m 529\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39misinstance\u001b[39m(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mpeft_config, PromptLearningConfig):\n\u001b[0;32m--> 530\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mbase_model(\n\u001b[1;32m 531\u001b[0m input_ids\u001b[39m=\u001b[39;49minput_ids,\n\u001b[1;32m 532\u001b[0m attention_mask\u001b[39m=\u001b[39;49mattention_mask,\n\u001b[1;32m 533\u001b[0m inputs_embeds\u001b[39m=\u001b[39;49minputs_embeds,\n\u001b[1;32m 534\u001b[0m labels\u001b[39m=\u001b[39;49mlabels,\n\u001b[1;32m 535\u001b[0m output_attentions\u001b[39m=\u001b[39;49moutput_attentions,\n\u001b[1;32m 536\u001b[0m output_hidden_states\u001b[39m=\u001b[39;49moutput_hidden_states,\n\u001b[1;32m 537\u001b[0m return_dict\u001b[39m=\u001b[39;49mreturn_dict,\n\u001b[1;32m 538\u001b[0m \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs,\n\u001b[1;32m 539\u001b[0m )\n\u001b[1;32m 541\u001b[0m batch_size \u001b[39m=\u001b[39m input_ids\u001b[39m.\u001b[39mshape[\u001b[39m0\u001b[39m]\n\u001b[1;32m 542\u001b[0m \u001b[39mif\u001b[39;00m attention_mask \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m 543\u001b[0m \u001b[39m# concat prompt attention mask\u001b[39;00m\n",
"File \u001b[0;32m~/miniconda3/envs/dl3/lib/python3.10/site-packages/torch/nn/modules/module.py:1488\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1483\u001b[0m \u001b[39m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1484\u001b[0m \u001b[39m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1485\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m (\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_pre_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1486\u001b[0m \u001b[39mor\u001b[39;00m _global_backward_pre_hooks \u001b[39mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1487\u001b[0m \u001b[39mor\u001b[39;00m _global_forward_hooks \u001b[39mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1488\u001b[0m \u001b[39mreturn\u001b[39;00m forward_call(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 1489\u001b[0m \u001b[39m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m 1490\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[39m=\u001b[39m [], []\n",
"File \u001b[0;32m~/miniconda3/envs/dl3/lib/python3.10/site-packages/accelerate/hooks.py:158\u001b[0m, in \u001b[0;36madd_hook_to_module.<locals>.new_forward\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 156\u001b[0m output \u001b[39m=\u001b[39m old_forward(\u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n\u001b[1;32m 157\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m--> 158\u001b[0m output \u001b[39m=\u001b[39m old_forward(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 159\u001b[0m \u001b[39mreturn\u001b[39;00m module\u001b[39m.\u001b[39m_hf_hook\u001b[39m.\u001b[39mpost_forward(module, output)\n",
"File \u001b[0;32m~/miniconda3/envs/dl3/lib/python3.10/site-packages/transformers/models/llama/modeling_llama.py:772\u001b[0m, in \u001b[0;36mLLaMAForCausalLM.forward\u001b[0;34m(self, input_ids, attention_mask, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[1;32m 769\u001b[0m return_dict \u001b[39m=\u001b[39m return_dict \u001b[39mif\u001b[39;00m return_dict \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39melse\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mconfig\u001b[39m.\u001b[39muse_return_dict\n\u001b[1;32m 771\u001b[0m \u001b[39m# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)\u001b[39;00m\n\u001b[0;32m--> 772\u001b[0m outputs \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mmodel(\n\u001b[1;32m 773\u001b[0m input_ids\u001b[39m=\u001b[39;49minput_ids,\n\u001b[1;32m 774\u001b[0m attention_mask\u001b[39m=\u001b[39;49mattention_mask,\n\u001b[1;32m 775\u001b[0m past_key_values\u001b[39m=\u001b[39;49mpast_key_values,\n\u001b[1;32m 776\u001b[0m inputs_embeds\u001b[39m=\u001b[39;49minputs_embeds,\n\u001b[1;32m 777\u001b[0m use_cache\u001b[39m=\u001b[39;49muse_cache,\n\u001b[1;32m 778\u001b[0m output_attentions\u001b[39m=\u001b[39;49moutput_attentions,\n\u001b[1;32m 779\u001b[0m output_hidden_states\u001b[39m=\u001b[39;49moutput_hidden_states,\n\u001b[1;32m 780\u001b[0m return_dict\u001b[39m=\u001b[39;49mreturn_dict,\n\u001b[1;32m 781\u001b[0m )\n\u001b[1;32m 783\u001b[0m hidden_states \u001b[39m=\u001b[39m outputs[\u001b[39m0\u001b[39m]\n\u001b[1;32m 784\u001b[0m logits \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mlm_head(hidden_states)\n",
"File \u001b[0;32m~/miniconda3/envs/dl3/lib/python3.10/site-packages/torch/nn/modules/module.py:1488\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1483\u001b[0m \u001b[39m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1484\u001b[0m \u001b[39m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1485\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m (\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_pre_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1486\u001b[0m \u001b[39mor\u001b[39;00m _global_backward_pre_hooks \u001b[39mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1487\u001b[0m \u001b[39mor\u001b[39;00m _global_forward_hooks \u001b[39mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1488\u001b[0m \u001b[39mreturn\u001b[39;00m forward_call(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 1489\u001b[0m \u001b[39m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m 1490\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[39m=\u001b[39m [], []\n",
"File \u001b[0;32m~/miniconda3/envs/dl3/lib/python3.10/site-packages/accelerate/hooks.py:158\u001b[0m, in \u001b[0;36madd_hook_to_module.<locals>.new_forward\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 156\u001b[0m output \u001b[39m=\u001b[39m old_forward(\u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n\u001b[1;32m 157\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m--> 158\u001b[0m output \u001b[39m=\u001b[39m old_forward(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 159\u001b[0m \u001b[39mreturn\u001b[39;00m module\u001b[39m.\u001b[39m_hf_hook\u001b[39m.\u001b[39mpost_forward(module, output)\n",
"File \u001b[0;32m~/miniconda3/envs/dl3/lib/python3.10/site-packages/transformers/models/llama/modeling_llama.py:621\u001b[0m, in \u001b[0;36mLLaMAModel.forward\u001b[0;34m(self, input_ids, attention_mask, past_key_values, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[1;32m 614\u001b[0m layer_outputs \u001b[39m=\u001b[39m torch\u001b[39m.\u001b[39mutils\u001b[39m.\u001b[39mcheckpoint\u001b[39m.\u001b[39mcheckpoint(\n\u001b[1;32m 615\u001b[0m create_custom_forward(decoder_layer),\n\u001b[1;32m 616\u001b[0m hidden_states,\n\u001b[1;32m 617\u001b[0m attention_mask,\n\u001b[1;32m 618\u001b[0m \u001b[39mNone\u001b[39;00m,\n\u001b[1;32m 619\u001b[0m )\n\u001b[1;32m 620\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m--> 621\u001b[0m layer_outputs \u001b[39m=\u001b[39m decoder_layer(\n\u001b[1;32m 622\u001b[0m hidden_states,\n\u001b[1;32m 623\u001b[0m attention_mask\u001b[39m=\u001b[39;49mattention_mask,\n\u001b[1;32m 624\u001b[0m past_key_value\u001b[39m=\u001b[39;49mpast_key_value,\n\u001b[1;32m 625\u001b[0m output_attentions\u001b[39m=\u001b[39;49moutput_attentions,\n\u001b[1;32m 626\u001b[0m use_cache\u001b[39m=\u001b[39;49muse_cache,\n\u001b[1;32m 627\u001b[0m )\n\u001b[1;32m 629\u001b[0m hidden_states \u001b[39m=\u001b[39m layer_outputs[\u001b[39m0\u001b[39m]\n\u001b[1;32m 631\u001b[0m \u001b[39mif\u001b[39;00m use_cache:\n",
"File \u001b[0;32m~/miniconda3/envs/dl3/lib/python3.10/site-packages/torch/nn/modules/module.py:1488\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1483\u001b[0m \u001b[39m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1484\u001b[0m \u001b[39m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1485\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m (\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_pre_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1486\u001b[0m \u001b[39mor\u001b[39;00m _global_backward_pre_hooks \u001b[39mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1487\u001b[0m \u001b[39mor\u001b[39;00m _global_forward_hooks \u001b[39mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1488\u001b[0m \u001b[39mreturn\u001b[39;00m forward_call(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 1489\u001b[0m \u001b[39m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m 1490\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[39m=\u001b[39m [], []\n",
"File \u001b[0;32m~/miniconda3/envs/dl3/lib/python3.10/site-packages/accelerate/hooks.py:158\u001b[0m, in \u001b[0;36madd_hook_to_module.<locals>.new_forward\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 156\u001b[0m output \u001b[39m=\u001b[39m old_forward(\u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n\u001b[1;32m 157\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m--> 158\u001b[0m output \u001b[39m=\u001b[39m old_forward(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 159\u001b[0m \u001b[39mreturn\u001b[39;00m module\u001b[39m.\u001b[39m_hf_hook\u001b[39m.\u001b[39mpost_forward(module, output)\n",
"File \u001b[0;32m~/miniconda3/envs/dl3/lib/python3.10/site-packages/transformers/models/llama/modeling_llama.py:329\u001b[0m, in \u001b[0;36mLLaMADecoderLayer.forward\u001b[0;34m(self, hidden_states, attention_mask, output_attentions, use_cache, past_key_value)\u001b[0m\n\u001b[1;32m 327\u001b[0m residual \u001b[39m=\u001b[39m hidden_states\n\u001b[1;32m 328\u001b[0m hidden_states \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mpost_attention_layernorm(hidden_states)\n\u001b[0;32m--> 329\u001b[0m hidden_states \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mmlp(hidden_states)\n\u001b[1;32m 330\u001b[0m hidden_states \u001b[39m=\u001b[39m residual \u001b[39m+\u001b[39m hidden_states\n\u001b[1;32m 332\u001b[0m outputs \u001b[39m=\u001b[39m (hidden_states,)\n",
"File \u001b[0;32m~/miniconda3/envs/dl3/lib/python3.10/site-packages/torch/nn/modules/module.py:1488\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1483\u001b[0m \u001b[39m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1484\u001b[0m \u001b[39m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1485\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m (\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_pre_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1486\u001b[0m \u001b[39mor\u001b[39;00m _global_backward_pre_hooks \u001b[39mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1487\u001b[0m \u001b[39mor\u001b[39;00m _global_forward_hooks \u001b[39mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1488\u001b[0m \u001b[39mreturn\u001b[39;00m forward_call(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 1489\u001b[0m \u001b[39m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m 1490\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[39m=\u001b[39m [], []\n",
"File \u001b[0;32m~/miniconda3/envs/dl3/lib/python3.10/site-packages/accelerate/hooks.py:158\u001b[0m, in \u001b[0;36madd_hook_to_module.<locals>.new_forward\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 156\u001b[0m output \u001b[39m=\u001b[39m old_forward(\u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n\u001b[1;32m 157\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m--> 158\u001b[0m output \u001b[39m=\u001b[39m old_forward(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 159\u001b[0m \u001b[39mreturn\u001b[39;00m module\u001b[39m.\u001b[39m_hf_hook\u001b[39m.\u001b[39mpost_forward(module, output)\n",
"File \u001b[0;32m~/miniconda3/envs/dl3/lib/python3.10/site-packages/transformers/models/llama/modeling_llama.py:161\u001b[0m, in \u001b[0;36mLLaMAMLP.forward\u001b[0;34m(self, x)\u001b[0m\n\u001b[1;32m 160\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mforward\u001b[39m(\u001b[39mself\u001b[39m, x):\n\u001b[0;32m--> 161\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mdown_proj(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mact_fn(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mgate_proj(x)) \u001b[39m*\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mup_proj(x))\n",
"File \u001b[0;32m~/miniconda3/envs/dl3/lib/python3.10/site-packages/torch/nn/modules/module.py:1488\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1483\u001b[0m \u001b[39m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1484\u001b[0m \u001b[39m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1485\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m (\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_pre_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1486\u001b[0m \u001b[39mor\u001b[39;00m _global_backward_pre_hooks \u001b[39mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1487\u001b[0m \u001b[39mor\u001b[39;00m _global_forward_hooks \u001b[39mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1488\u001b[0m \u001b[39mreturn\u001b[39;00m forward_call(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 1489\u001b[0m \u001b[39m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m 1490\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[39m=\u001b[39m [], []\n",
"File \u001b[0;32m~/miniconda3/envs/dl3/lib/python3.10/site-packages/accelerate/hooks.py:158\u001b[0m, in \u001b[0;36madd_hook_to_module.<locals>.new_forward\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 156\u001b[0m output \u001b[39m=\u001b[39m old_forward(\u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n\u001b[1;32m 157\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m--> 158\u001b[0m output \u001b[39m=\u001b[39m old_forward(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 159\u001b[0m \u001b[39mreturn\u001b[39;00m module\u001b[39m.\u001b[39m_hf_hook\u001b[39m.\u001b[39mpost_forward(module, output)\n",
"File \u001b[0;32m~/miniconda3/envs/dl3/lib/python3.10/site-packages/bitsandbytes/nn/modules.py:242\u001b[0m, in \u001b[0;36mLinear8bitLt.forward\u001b[0;34m(self, x)\u001b[0m\n\u001b[1;32m 239\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mbias \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39mand\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mbias\u001b[39m.\u001b[39mdtype \u001b[39m!=\u001b[39m x\u001b[39m.\u001b[39mdtype:\n\u001b[1;32m 240\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mbias\u001b[39m.\u001b[39mdata \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mbias\u001b[39m.\u001b[39mdata\u001b[39m.\u001b[39mto(x\u001b[39m.\u001b[39mdtype)\n\u001b[0;32m--> 242\u001b[0m out \u001b[39m=\u001b[39m bnb\u001b[39m.\u001b[39;49mmatmul(x, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mweight, bias\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mbias, state\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mstate)\n\u001b[1;32m 243\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mstate\u001b[39m.\u001b[39mhas_fp16_weights:\n\u001b[1;32m 244\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mstate\u001b[39m.\u001b[39mCB \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39mand\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mstate\u001b[39m.\u001b[39mCxB \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m 245\u001b[0m \u001b[39m# we converted 8-bit row major to turing/ampere format in the first inference pass\u001b[39;00m\n\u001b[1;32m 246\u001b[0m \u001b[39m# we no longer need the row-major weight\u001b[39;00m\n",
"File \u001b[0;32m~/miniconda3/envs/dl3/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:488\u001b[0m, in \u001b[0;36mmatmul\u001b[0;34m(A, B, out, state, threshold, bias)\u001b[0m\n\u001b[1;32m 486\u001b[0m \u001b[39mif\u001b[39;00m threshold \u001b[39m>\u001b[39m \u001b[39m0.0\u001b[39m:\n\u001b[1;32m 487\u001b[0m state\u001b[39m.\u001b[39mthreshold \u001b[39m=\u001b[39m threshold\n\u001b[0;32m--> 488\u001b[0m \u001b[39mreturn\u001b[39;00m MatMul8bitLt\u001b[39m.\u001b[39;49mapply(A, B, out, bias, state)\n",
"File \u001b[0;32m~/miniconda3/envs/dl3/lib/python3.10/site-packages/torch/autograd/function.py:508\u001b[0m, in \u001b[0;36mFunction.apply\u001b[0;34m(cls, *args, **kwargs)\u001b[0m\n\u001b[1;32m 505\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m torch\u001b[39m.\u001b[39m_C\u001b[39m.\u001b[39m_are_functorch_transforms_active():\n\u001b[1;32m 506\u001b[0m \u001b[39m# See NOTE: [functorch vjp and autograd interaction]\u001b[39;00m\n\u001b[1;32m 507\u001b[0m args \u001b[39m=\u001b[39m _functorch\u001b[39m.\u001b[39mutils\u001b[39m.\u001b[39munwrap_dead_wrappers(args)\n\u001b[0;32m--> 508\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39msuper\u001b[39;49m()\u001b[39m.\u001b[39;49mapply(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 510\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mcls\u001b[39m\u001b[39m.\u001b[39msetup_context \u001b[39m==\u001b[39m _SingleLevelFunction\u001b[39m.\u001b[39msetup_context:\n\u001b[1;32m 511\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mRuntimeError\u001b[39;00m(\n\u001b[1;32m 512\u001b[0m \u001b[39m'\u001b[39m\u001b[39mIn order to use an autograd.Function with functorch transforms \u001b[39m\u001b[39m'\u001b[39m\n\u001b[1;32m 513\u001b[0m \u001b[39m'\u001b[39m\u001b[39m(vmap, grad, jvp, jacrev, ...), it must override the setup_context \u001b[39m\u001b[39m'\u001b[39m\n\u001b[1;32m 514\u001b[0m \u001b[39m'\u001b[39m\u001b[39mstaticmethod. For more details, please see \u001b[39m\u001b[39m'\u001b[39m\n\u001b[1;32m 515\u001b[0m \u001b[39m'\u001b[39m\u001b[39mhttps://pytorch.org/docs/master/notes/extending.func.html\u001b[39m\u001b[39m'\u001b[39m)\n",
"File \u001b[0;32m~/miniconda3/envs/dl3/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:397\u001b[0m, in \u001b[0;36mMatMul8bitLt.forward\u001b[0;34m(ctx, A, B, out, bias, state)\u001b[0m\n\u001b[1;32m 395\u001b[0m \u001b[39m# 4. Mixed-precision decomposition matmul\u001b[39;00m\n\u001b[1;32m 396\u001b[0m \u001b[39mif\u001b[39;00m coo_tensorA \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39mand\u001b[39;00m subA \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m--> 397\u001b[0m output \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m torch\u001b[39m.\u001b[39;49mmatmul(subA, state\u001b[39m.\u001b[39;49msubB)\n\u001b[1;32m 399\u001b[0m \u001b[39m# 5. Save state\u001b[39;00m\n\u001b[1;32m 400\u001b[0m ctx\u001b[39m.\u001b[39mstate \u001b[39m=\u001b[39m state\n",
"\u001b[0;31mOutOfMemoryError\u001b[0m: CUDA out of memory. Tried to allocate 22.00 MiB (GPU 0; 23.65 GiB total capacity; 19.92 GiB already allocated; 41.31 MiB free; 20.40 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF"
]
}
],
"source": [
"data = load_dataset(\"laion/OIG\", streaming=True)\n",
"data = data.shuffle(seed=42).map(\n",
" lambda x: tokenizer(\n",
" x[\"text\"], truncation=True, max_length=256, padding=\"max_length\"\n",
" ),\n",
" batched=True,\n",
")\n",
"\n",
"trainer = transformers.Trainer(\n",
" model=model,\n",
" train_dataset=data[\"train\"],\n",
" args=transformers.TrainingArguments(\n",
" per_device_train_batch_size=4,\n",
" gradient_accumulation_steps=4,\n",
" warmup_steps=1000,\n",
" max_steps=6000,\n",
" learning_rate=2e-4,\n",
" fp16=True,\n",
" logging_steps=1,\n",
" output_dir=\"checkpoints\",\n",
" save_total_limit=3,\n",
" ),\n",
" data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),\n",
")\n",
"model.config.use_cache = False # silence the warnings. Please re-enable for inference!\n",
"trainer.train()\n",
"\n",
"model.save_pretrained(\"outputs\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "dl3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.8"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "90bfda469df5ac7fed8d7e225d563f60a7a7aa420ccfadb091c914debf775e49"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
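
The second cell above dies with a CUDA out-of-memory error inside the 8-bit matmul, and the error text itself points at PYTORCH_CUDA_ALLOC_CONF. A minimal sketch of the usual mitigations, assuming the same single 24 GiB GPU; the specific numbers are illustrative, not values taken from this repo:

```
# Hedged sketch: mitigations for the OOM hit in the notebook's training cell.
# Set the allocator option before importing torch, as the error message
# suggests, and shrink the per-device batch while keeping the effective
# batch size via gradient accumulation. The concrete values are assumptions.
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

import transformers

training_args = transformers.TrainingArguments(
    per_device_train_batch_size=1,   # was 4 in the notebook
    gradient_accumulation_steps=16,  # keeps the effective batch at 16
    warmup_steps=1000,
    max_steps=6000,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=1,
    output_dir="checkpoints",
    save_total_limit=3,
)
```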

lengths.ipynb Normal file (155 lines)

File diff suppressed because one or more lines are too long.