initial commit

Eric Wang 2023-03-13 14:34:26 -07:00
commit 26f64780ad
6 changed files with 260780 additions and 0 deletions

.gitignore vendored Normal file (6 lines)

@@ -0,0 +1,6 @@
out/
7B/
13B/
__pycache__/
checkpoint**
minimal-llama**

alpaca_data.json Normal file (260012 lines)

File diff suppressed because it is too large.
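
The data file itself is suppressed above, but finetune.py below loads it with load_dataset("json", data_files="alpaca_data.json") and builds prompts from each record's "instruction" and "input" fields. A minimal sketch for inspecting the file locally; any fields beyond the two referenced in generate_prompt are assumptions about the schema:

```
# Hedged sketch: peek at alpaca_data.json before fine-tuning.
# Only "instruction" and "input" are referenced by generate_prompt()
# in finetune.py; no other keys in the records are assumed here.
import json

with open("alpaca_data.json") as f:
    records = json.load(f)

print(len(records))                      # number of training examples
print(json.dumps(records[0], indent=2))  # inspect the available fields
```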

conversion.py Normal file (276 lines)

@@ -0,0 +1,276 @@
# Copyright 2022 EleutherAI and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import json
import os
import shutil
import torch
"""
Sample usage:
```
python src/transformers/models/llama/convert_llama_weights_to_hf.py \
--input_dir /path/to/downloaded/llama/weights --model_size 7B --output_dir /output/path
```
Thereafter, models can be loaded via:
```
tokenizer = transformers.LLaMATokenizer.from_pretrained("/output/path/tokenizer/")
model = transformers.LLaMAForCausalLM.from_pretrained("/output/path/llama-7b/")
```
"""
INTERMEDIATE_SIZE_MAP = {
"7B": 11008,
"13B": 13824,
"30B": 17920,
"65B": 22016,
}
NUM_SHARDS = {
"7B": 1,
"13B": 2,
"30B": 4,
"65B": 8,
}
def read_json(path):
with open(path, "r") as f:
return json.load(f)
def write_json(text, path):
with open(path, "w") as f:
json.dump(text, f)
def write_model(model_path, input_base_path, model_size):
assert model_size in INTERMEDIATE_SIZE_MAP
os.makedirs(model_path, exist_ok=True)
params = read_json(os.path.join(input_base_path, "params.json"))
num_shards = NUM_SHARDS[model_size]
n_layers = params["n_layers"]
n_heads = params["n_heads"]
n_heads_per_shard = n_heads // num_shards
dim = params["dim"]
dims_per_head = dim // n_heads
base = 10000.0
inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))
# permute for sliced rotary
def permute(w):
return w.view(n_heads, dim // n_heads // 2, 2, dim).transpose(1, 2).reshape(dim, dim)
# Load weights
if model_size == "7B":
# Not sharded
# (The sharded implementation would also work, but this is simpler.)
loaded = torch.load(os.path.join(input_base_path, "consolidated.00.pth"), map_location="cpu")
else:
# Sharded
loaded = [
torch.load(os.path.join(input_base_path, f"consolidated.{i:02d}.pth"), map_location="cpu")
for i in range(num_shards)
]
param_count = 0
index_dict = {"weight_map": {}}
for layer_i in range(n_layers):
filename = "pytorch_model-{:05d}-of-{:05d}.bin".format(
layer_i + 1,
n_layers + 1,
)
if model_size == "7B":
# Unsharded
state_dict = {
f"model.layers.{layer_i}.self_attn.q_proj.weight": permute(
loaded[f"layers.{layer_i}.attention.wq.weight"]
),
f"model.layers.{layer_i}.self_attn.k_proj.weight": permute(
loaded[f"layers.{layer_i}.attention.wk.weight"]
),
f"model.layers.{layer_i}.self_attn.v_proj.weight": loaded[f"layers.{layer_i}.attention.wv.weight"],
f"model.layers.{layer_i}.self_attn.o_proj.weight": loaded[f"layers.{layer_i}.attention.wo.weight"],
f"model.layers.{layer_i}.mlp.gate_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w1.weight"],
f"model.layers.{layer_i}.mlp.down_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w2.weight"],
f"model.layers.{layer_i}.mlp.up_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w3.weight"],
f"model.layers.{layer_i}.input_layernorm.weight": loaded[f"layers.{layer_i}.attention_norm.weight"],
f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[f"layers.{layer_i}.ffn_norm.weight"],
}
else:
# Sharded
state_dict = {
f"model.layers.{layer_i}.input_layernorm.weight": loaded[0][f"layers.{layer_i}.attention_norm.weight"],
f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[0][
f"layers.{layer_i}.ffn_norm.weight"
],
}
state_dict[f"model.layers.{layer_i}.self_attn.q_proj.weight"] = permute(
torch.cat(
[
loaded[i][f"layers.{layer_i}.attention.wq.weight"].view(n_heads_per_shard, dims_per_head, dim)
for i in range(num_shards)
],
dim=0,
).reshape(dim, dim)
)
state_dict[f"model.layers.{layer_i}.self_attn.k_proj.weight"] = permute(
torch.cat(
[
loaded[i][f"layers.{layer_i}.attention.wk.weight"].view(n_heads_per_shard, dims_per_head, dim)
for i in range(num_shards)
],
dim=0,
).reshape(dim, dim)
)
state_dict[f"model.layers.{layer_i}.self_attn.v_proj.weight"] = torch.cat(
[
loaded[i][f"layers.{layer_i}.attention.wv.weight"].view(n_heads_per_shard, dims_per_head, dim)
for i in range(num_shards)
],
dim=0,
).reshape(dim, dim)
state_dict[f"model.layers.{layer_i}.self_attn.o_proj.weight"] = torch.cat(
[loaded[i][f"layers.{layer_i}.attention.wo.weight"] for i in range(num_shards)], dim=1
)
state_dict[f"model.layers.{layer_i}.mlp.gate_proj.weight"] = torch.cat(
[loaded[i][f"layers.{layer_i}.feed_forward.w1.weight"] for i in range(num_shards)], dim=0
)
state_dict[f"model.layers.{layer_i}.mlp.down_proj.weight"] = torch.cat(
[loaded[i][f"layers.{layer_i}.feed_forward.w2.weight"] for i in range(num_shards)], dim=1
)
state_dict[f"model.layers.{layer_i}.mlp.up_proj.weight"] = torch.cat(
[loaded[i][f"layers.{layer_i}.feed_forward.w3.weight"] for i in range(num_shards)], dim=0
)
state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq
for k, v in state_dict.items():
index_dict["weight_map"][k] = filename
param_count += v.numel()
torch.save(state_dict, os.path.join(model_path, filename))
filename = "pytorch_model-{:05d}-of-{:05d}.bin".format(
n_layers + 1,
n_layers + 1,
)
if model_size == "7B":
# Unsharded
state_dict = {
"model.embed_tokens.weight": loaded["tok_embeddings.weight"],
"model.norm.weight": loaded["norm.weight"],
"lm_head.weight": loaded["output.weight"],
}
else:
state_dict = {
"model.norm.weight": loaded[0]["norm.weight"],
"model.embed_tokens.weight": torch.cat(
[loaded[i]["tok_embeddings.weight"] for i in range(num_shards)], dim=1
),
"lm_head.weight": torch.cat([loaded[i]["output.weight"] for i in range(num_shards)], dim=0),
}
for k, v in state_dict.items():
index_dict["weight_map"][k] = filename
param_count += v.numel()
torch.save(state_dict, os.path.join(model_path, filename))
# Write configs
index_dict["metadata"] = {"total_size": param_count * 2}
write_json(index_dict, os.path.join(model_path, "pytorch_model.bin.index.json"))
config_out = {
"architectures": ["LLaMAForCausalLM"],
"bos_token_id": 0,
"eos_token_id": 1,
"hidden_act": "silu",
"hidden_size": params["dim"],
"intermediate_size": INTERMEDIATE_SIZE_MAP[model_size],
"initializer_range": 0.02,
"max_sequence_length": 2048,
"model_type": "llama",
"num_attention_heads": params["n_heads"],
"num_hidden_layers": params["n_layers"],
"pad_token_id": -1,
"rms_norm_eps": params["norm_eps"],
"torch_dtype": "float16",
"transformers_version": "4.27.0.dev0",
"use_cache": True,
"vocab_size": 32000,
}
write_json(
config_out,
os.path.join(model_path, "config.json"),
)
generation_config = {
"_from_model_config": True,
"bos_token_id": 0,
"eos_token_id": 1,
"pad_token_id": 0,
"transformers_version": "4.27.0.dev0",
}
write_json(
generation_config,
os.path.join(model_path, "generation_config.json"),
)
def write_tokenizer(tokenizer_path, input_tokenizer_path):
os.makedirs(tokenizer_path, exist_ok=True)
write_json({}, os.path.join(tokenizer_path, "special_tokens_map.json"))
write_json(
{
"bos_token": "",
"eos_token": "",
"model_max_length": int(1e30),
"tokenizer_class": "LLaMATokenizer",
"unk_token": "",
},
os.path.join(tokenizer_path, "tokenizer_config.json"),
)
shutil.copyfile(input_tokenizer_path, os.path.join(tokenizer_path, "tokenizer.model"))
def main():
parser = argparse.ArgumentParser()
parser.add_argument(
"--input_dir",
help="Location of LLaMA weights, which contains tokenizer.model and model folders",
)
parser.add_argument(
"--model_size",
choices=["7B", "13B", "30B", "65B"],
)
parser.add_argument(
"--output_dir",
help="Location to write HF model and tokenizer",
)
args = parser.parse_args()
write_model(
model_path=os.path.join(args.output_dir, "llama-{}".format(args.model_size).lower()),
input_base_path=os.path.join(args.input_dir, args.model_size),
model_size=args.model_size,
)
write_tokenizer(
tokenizer_path=os.path.join(args.output_dir, "tokenizer"),
input_tokenizer_path=os.path.join(args.input_dir, "tokenizer.model"),
)
if __name__ == "__main__":
main()
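
The sample usage in the docstring above still points at the upstream transformers path; in this repo the same conversion can be driven through conversion.py's own write_model and write_tokenizer so the output lands where finetune.py expects it (./7B/llama-7b and ./7B/tokenizer). A minimal sketch, assuming the raw LLaMA weights were downloaded to ./llama-raw (that path is an assumption):

```
# Hedged sketch: run the conversion defined in conversion.py directly,
# writing the HF-format model and tokenizer where finetune.py looks for them.
# "./llama-raw" is an assumed location; it must contain tokenizer.model and
# a 7B/ folder holding params.json and consolidated.00.pth.
import os
from conversion import write_model, write_tokenizer

INPUT_DIR = "./llama-raw"   # assumption: raw LLaMA weights downloaded here
OUTPUT_DIR = "./7B"         # matches the paths used in finetune.py

write_model(
    model_path=os.path.join(OUTPUT_DIR, "llama-7b"),
    input_base_path=os.path.join(INPUT_DIR, "7B"),
    model_size="7B",
)
write_tokenizer(
    tokenizer_path=os.path.join(OUTPUT_DIR, "tokenizer"),
    input_tokenizer_path=os.path.join(INPUT_DIR, "tokenizer.model"),
)
```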

finetune.py Normal file (116 lines)

@@ -0,0 +1,116 @@
import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import torch
import torch.nn as nn
import bitsandbytes as bnb
from datasets import load_dataset
import transformers
from transformers import AutoTokenizer, AutoConfig, LLaMAForCausalLM, LLaMATokenizer
from peft import prepare_model_for_int8_training, LoraConfig, get_peft_model
model = LLaMAForCausalLM.from_pretrained(
"./7B/llama-7b",
load_in_8bit=True,
max_sequence_length=128, # data length
device_map="auto",
)
tokenizer = LLaMATokenizer.from_pretrained("./7B/tokenizer")
def print_trainable_parameters(model):
"""
Prints the number of trainable parameters in the model.
"""
trainable_params = 0
all_param = 0
for _, param in model.named_parameters():
all_param += param.numel()
if param.requires_grad:
trainable_params += param.numel()
print(
f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
)
print_trainable_parameters(model)
model = prepare_model_for_int8_training(model)
config = LoraConfig(
r=4,
lora_alpha=16,
target_modules=["q_proj", "v_proj"],
lora_dropout=0.05,
bias="none",
task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)
print_trainable_parameters(model)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
data = load_dataset("json", data_files="alpaca_data.json")
def generate_prompt(data_point):
# sorry about the formatting disaster gotta move fast
if data_point["instruction"]:
return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{data_point["instruction"]}
### Input:
{data_point["input"]}
### Response:"""
else:
return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
{data_point["instruction"]}
### Response:"""
data = data.map(
lambda data_point: tokenizer(
generate_prompt(data_point),
truncation=True,
max_length=128,
padding="max_length",
)
)
DATA_SIZE = 51368
MICRO_BATCH_SIZE = 12
BATCH_SIZE = 36
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
EPOCHS = 3
LEARNING_RATE = 2e-5
trainer = transformers.Trainer(
model=model,
train_dataset=data["train"],
args=transformers.TrainingArguments(
per_device_train_batch_size=MICRO_BATCH_SIZE,
gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
warmup_steps=100,
num_train_epochs=EPOCHS,
learning_rate=LEARNING_RATE,
fp16=True,
logging_steps=1,
output_dir="lora-alpaca",
save_total_limit=3,
),
data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False
trainer.train(resume_from_checkpoint=False)
model.save_pretrained("lora-alpaca")
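
Once training finishes, the LoRA adapter saved to lora-alpaca can be reattached to the 8-bit base model for generation. A minimal sketch using peft's PeftModel; the prompt text and generation settings are assumptions, and use_cache is switched back on since finetune.py disables it only for training:

```
# Hedged sketch: load the LoRA adapter saved by finetune.py and generate.
# Class names match the transformers dev version used above (LLaMA*).
import torch
from peft import PeftModel
from transformers import LLaMAForCausalLM, LLaMATokenizer

tokenizer = LLaMATokenizer.from_pretrained("./7B/tokenizer")
model = LLaMAForCausalLM.from_pretrained(
    "./7B/llama-7b",
    load_in_8bit=True,
    device_map="auto",
)
model = PeftModel.from_pretrained(model, "lora-alpaca")
model.config.use_cache = True  # re-enable the KV cache for inference

# Assumed prompt, mirroring the instruction-only template in generate_prompt().
prompt = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n"
    "### Instruction:\nTell me about alpacas.\n### Response:"
)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    output = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```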

iteration.ipynb Normal file (215 lines)

@@ -0,0 +1,215 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"===================================BUG REPORT===================================\n",
"Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues\n",
"================================================================================\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/eric/miniconda3/envs/dl3/lib/python3.10/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"Overriding torch_dtype=None with `torch_dtype=torch.float16` due to requirements of `bitsandbytes` to enable model loading in mixed int8. Either pass torch_dtype=torch.float16 or don't pass this argument at all to remove this warning.\n",
"Loading checkpoint shards: 100%|██████████| 33/33 [00:08<00:00, 4.08it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"trainable params: 262410240 || all params: 6738415616 || trainable%: 3.894242429584207\n",
"trainable params: 8388608 || all params: 6746804224 || trainable%: 0.12433454005023165\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"import os\n",
"\n",
"# os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n",
"import torch\n",
"import torch.nn as nn\n",
"import bitsandbytes as bnb\n",
"from dataset import load_dataset\n",
"import transformers\n",
"from transformers import AutoTokenizer, AutoConfig, LLaMAForCausalLM, LLaMATokenizer\n",
"from peft import prepare_model_for_int8_training, LoraConfig, get_peft_model\n",
"\n",
"model = LLaMAForCausalLM.from_pretrained(\n",
" \"./7B/llama-7b\",\n",
" load_in_8bit=True,\n",
" device_map=\"auto\",\n",
")\n",
"\n",
"tokenizer = LLaMATokenizer.from_pretrained(\"./7B/tokenizer\")\n",
"\n",
"\n",
"def print_trainable_parameters(model):\n",
" \"\"\"\n",
" Prints the number of trainable parameters in the model.\n",
" \"\"\"\n",
" trainable_params = 0\n",
" all_param = 0\n",
" for _, param in model.named_parameters():\n",
" all_param += param.numel()\n",
" if param.requires_grad:\n",
" trainable_params += param.numel()\n",
" print(\n",
" f\"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}\"\n",
" )\n",
"\n",
"\n",
"print_trainable_parameters(model)\n",
"model = prepare_model_for_int8_training(model)\n",
"\n",
"config = LoraConfig(\n",
" r=16,\n",
" lora_alpha=32,\n",
" target_modules=[\"q_proj\", \"v_proj\"],\n",
" lora_dropout=0.05,\n",
" bias=\"none\",\n",
" task_type=\"CAUSAL_LM\",\n",
")\n",
"model = get_peft_model(model, config)\n",
"\n",
"print_trainable_parameters(model)\n",
"\n",
"tokenizer.pad_token = tokenizer.eos_token\n",
"tokenizer.pad_token_id = tokenizer.eos_token_id"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"\n",
"\u001b[A\u001b[A"
]
},
{
"ename": "OutOfMemoryError",
"evalue": "CUDA out of memory. Tried to allocate 22.00 MiB (GPU 0; 23.65 GiB total capacity; 19.92 GiB already allocated; 41.31 MiB free; 20.40 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mOutOfMemoryError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[6], line 26\u001b[0m\n\u001b[1;32m 9\u001b[0m trainer \u001b[39m=\u001b[39m transformers\u001b[39m.\u001b[39mTrainer(\n\u001b[1;32m 10\u001b[0m model\u001b[39m=\u001b[39mmodel,\n\u001b[1;32m 11\u001b[0m train_dataset\u001b[39m=\u001b[39mdata[\u001b[39m\"\u001b[39m\u001b[39mtrain\u001b[39m\u001b[39m\"\u001b[39m],\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 23\u001b[0m data_collator\u001b[39m=\u001b[39mtransformers\u001b[39m.\u001b[39mDataCollatorForLanguageModeling(tokenizer, mlm\u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m),\n\u001b[1;32m 24\u001b[0m )\n\u001b[1;32m 25\u001b[0m model\u001b[39m.\u001b[39mconfig\u001b[39m.\u001b[39muse_cache \u001b[39m=\u001b[39m \u001b[39mFalse\u001b[39;00m \u001b[39m# silence the warnings. Please re-enable for inference!\u001b[39;00m\n\u001b[0;32m---> 26\u001b[0m trainer\u001b[39m.\u001b[39;49mtrain()\n\u001b[1;32m 28\u001b[0m model\u001b[39m.\u001b[39msave_pretrained(\u001b[39m\"\u001b[39m\u001b[39moutputs\u001b[39m\u001b[39m\"\u001b[39m)\n",
"File \u001b[0;32m~/miniconda3/envs/dl3/lib/python3.10/site-packages/transformers/trainer.py:1628\u001b[0m, in \u001b[0;36mTrainer.train\u001b[0;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001b[0m\n\u001b[1;32m 1623\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mmodel_wrapped \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mmodel\n\u001b[1;32m 1625\u001b[0m inner_training_loop \u001b[39m=\u001b[39m find_executable_batch_size(\n\u001b[1;32m 1626\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_inner_training_loop, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_train_batch_size, args\u001b[39m.\u001b[39mauto_find_batch_size\n\u001b[1;32m 1627\u001b[0m )\n\u001b[0;32m-> 1628\u001b[0m \u001b[39mreturn\u001b[39;00m inner_training_loop(\n\u001b[1;32m 1629\u001b[0m args\u001b[39m=\u001b[39;49margs,\n\u001b[1;32m 1630\u001b[0m resume_from_checkpoint\u001b[39m=\u001b[39;49mresume_from_checkpoint,\n\u001b[1;32m 1631\u001b[0m trial\u001b[39m=\u001b[39;49mtrial,\n\u001b[1;32m 1632\u001b[0m ignore_keys_for_eval\u001b[39m=\u001b[39;49mignore_keys_for_eval,\n\u001b[1;32m 1633\u001b[0m )\n",
"File \u001b[0;32m~/miniconda3/envs/dl3/lib/python3.10/site-packages/transformers/trainer.py:1895\u001b[0m, in \u001b[0;36mTrainer._inner_training_loop\u001b[0;34m(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)\u001b[0m\n\u001b[1;32m 1893\u001b[0m tr_loss_step \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mtraining_step(model, inputs)\n\u001b[1;32m 1894\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m-> 1895\u001b[0m tr_loss_step \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mtraining_step(model, inputs)\n\u001b[1;32m 1897\u001b[0m \u001b[39mif\u001b[39;00m (\n\u001b[1;32m 1898\u001b[0m args\u001b[39m.\u001b[39mlogging_nan_inf_filter\n\u001b[1;32m 1899\u001b[0m \u001b[39mand\u001b[39;00m \u001b[39mnot\u001b[39;00m is_torch_tpu_available()\n\u001b[1;32m 1900\u001b[0m \u001b[39mand\u001b[39;00m (torch\u001b[39m.\u001b[39misnan(tr_loss_step) \u001b[39mor\u001b[39;00m torch\u001b[39m.\u001b[39misinf(tr_loss_step))\n\u001b[1;32m 1901\u001b[0m ):\n\u001b[1;32m 1902\u001b[0m \u001b[39m# if loss is nan or inf simply add the average of previous logged losses\u001b[39;00m\n\u001b[1;32m 1903\u001b[0m tr_loss \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m tr_loss \u001b[39m/\u001b[39m (\u001b[39m1\u001b[39m \u001b[39m+\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mstate\u001b[39m.\u001b[39mglobal_step \u001b[39m-\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_globalstep_last_logged)\n",
"File \u001b[0;32m~/miniconda3/envs/dl3/lib/python3.10/site-packages/transformers/trainer.py:2637\u001b[0m, in \u001b[0;36mTrainer.training_step\u001b[0;34m(self, model, inputs)\u001b[0m\n\u001b[1;32m 2634\u001b[0m \u001b[39mreturn\u001b[39;00m loss_mb\u001b[39m.\u001b[39mreduce_mean()\u001b[39m.\u001b[39mdetach()\u001b[39m.\u001b[39mto(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39margs\u001b[39m.\u001b[39mdevice)\n\u001b[1;32m 2636\u001b[0m \u001b[39mwith\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mcompute_loss_context_manager():\n\u001b[0;32m-> 2637\u001b[0m loss \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mcompute_loss(model, inputs)\n\u001b[1;32m 2639\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39margs\u001b[39m.\u001b[39mn_gpu \u001b[39m>\u001b[39m \u001b[39m1\u001b[39m:\n\u001b[1;32m 2640\u001b[0m loss \u001b[39m=\u001b[39m loss\u001b[39m.\u001b[39mmean() \u001b[39m# mean() to average on multi-gpu parallel training\u001b[39;00m\n",
"File \u001b[0;32m~/miniconda3/envs/dl3/lib/python3.10/site-packages/transformers/trainer.py:2669\u001b[0m, in \u001b[0;36mTrainer.compute_loss\u001b[0;34m(self, model, inputs, return_outputs)\u001b[0m\n\u001b[1;32m 2667\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 2668\u001b[0m labels \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m\n\u001b[0;32m-> 2669\u001b[0m outputs \u001b[39m=\u001b[39m model(\u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49minputs)\n\u001b[1;32m 2670\u001b[0m \u001b[39m# Save past state if it exists\u001b[39;00m\n\u001b[1;32m 2671\u001b[0m \u001b[39m# TODO: this needs to be fixed and made cleaner later.\u001b[39;00m\n\u001b[1;32m 2672\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39margs\u001b[39m.\u001b[39mpast_index \u001b[39m>\u001b[39m\u001b[39m=\u001b[39m \u001b[39m0\u001b[39m:\n",
"File \u001b[0;32m~/miniconda3/envs/dl3/lib/python3.10/site-packages/torch/nn/modules/module.py:1488\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1483\u001b[0m \u001b[39m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1484\u001b[0m \u001b[39m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1485\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m (\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_pre_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1486\u001b[0m \u001b[39mor\u001b[39;00m _global_backward_pre_hooks \u001b[39mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1487\u001b[0m \u001b[39mor\u001b[39;00m _global_forward_hooks \u001b[39mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1488\u001b[0m \u001b[39mreturn\u001b[39;00m forward_call(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 1489\u001b[0m \u001b[39m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m 1490\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[39m=\u001b[39m [], []\n",
"File \u001b[0;32m~/miniconda3/envs/dl3/lib/python3.10/site-packages/peft-0.3.0.dev0-py3.10.egg/peft/peft_model.py:530\u001b[0m, in \u001b[0;36mPeftModelForCausalLM.forward\u001b[0;34m(self, input_ids, attention_mask, inputs_embeds, labels, output_attentions, output_hidden_states, return_dict, **kwargs)\u001b[0m\n\u001b[1;32m 518\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mforward\u001b[39m(\n\u001b[1;32m 519\u001b[0m \u001b[39mself\u001b[39m,\n\u001b[1;32m 520\u001b[0m input_ids\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 527\u001b[0m \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs,\n\u001b[1;32m 528\u001b[0m ):\n\u001b[1;32m 529\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39misinstance\u001b[39m(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mpeft_config, PromptLearningConfig):\n\u001b[0;32m--> 530\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mbase_model(\n\u001b[1;32m 531\u001b[0m input_ids\u001b[39m=\u001b[39;49minput_ids,\n\u001b[1;32m 532\u001b[0m attention_mask\u001b[39m=\u001b[39;49mattention_mask,\n\u001b[1;32m 533\u001b[0m inputs_embeds\u001b[39m=\u001b[39;49minputs_embeds,\n\u001b[1;32m 534\u001b[0m labels\u001b[39m=\u001b[39;49mlabels,\n\u001b[1;32m 535\u001b[0m output_attentions\u001b[39m=\u001b[39;49moutput_attentions,\n\u001b[1;32m 536\u001b[0m output_hidden_states\u001b[39m=\u001b[39;49moutput_hidden_states,\n\u001b[1;32m 537\u001b[0m return_dict\u001b[39m=\u001b[39;49mreturn_dict,\n\u001b[1;32m 538\u001b[0m \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs,\n\u001b[1;32m 539\u001b[0m )\n\u001b[1;32m 541\u001b[0m batch_size \u001b[39m=\u001b[39m input_ids\u001b[39m.\u001b[39mshape[\u001b[39m0\u001b[39m]\n\u001b[1;32m 542\u001b[0m \u001b[39mif\u001b[39;00m attention_mask \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m 543\u001b[0m \u001b[39m# concat prompt attention mask\u001b[39;00m\n",
"File \u001b[0;32m~/miniconda3/envs/dl3/lib/python3.10/site-packages/torch/nn/modules/module.py:1488\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1483\u001b[0m \u001b[39m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1484\u001b[0m \u001b[39m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1485\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m (\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_pre_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1486\u001b[0m \u001b[39mor\u001b[39;00m _global_backward_pre_hooks \u001b[39mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1487\u001b[0m \u001b[39mor\u001b[39;00m _global_forward_hooks \u001b[39mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1488\u001b[0m \u001b[39mreturn\u001b[39;00m forward_call(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 1489\u001b[0m \u001b[39m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m 1490\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[39m=\u001b[39m [], []\n",
"File \u001b[0;32m~/miniconda3/envs/dl3/lib/python3.10/site-packages/accelerate/hooks.py:158\u001b[0m, in \u001b[0;36madd_hook_to_module.<locals>.new_forward\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 156\u001b[0m output \u001b[39m=\u001b[39m old_forward(\u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n\u001b[1;32m 157\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m--> 158\u001b[0m output \u001b[39m=\u001b[39m old_forward(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 159\u001b[0m \u001b[39mreturn\u001b[39;00m module\u001b[39m.\u001b[39m_hf_hook\u001b[39m.\u001b[39mpost_forward(module, output)\n",
"File \u001b[0;32m~/miniconda3/envs/dl3/lib/python3.10/site-packages/transformers/models/llama/modeling_llama.py:772\u001b[0m, in \u001b[0;36mLLaMAForCausalLM.forward\u001b[0;34m(self, input_ids, attention_mask, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[1;32m 769\u001b[0m return_dict \u001b[39m=\u001b[39m return_dict \u001b[39mif\u001b[39;00m return_dict \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39melse\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mconfig\u001b[39m.\u001b[39muse_return_dict\n\u001b[1;32m 771\u001b[0m \u001b[39m# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)\u001b[39;00m\n\u001b[0;32m--> 772\u001b[0m outputs \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mmodel(\n\u001b[1;32m 773\u001b[0m input_ids\u001b[39m=\u001b[39;49minput_ids,\n\u001b[1;32m 774\u001b[0m attention_mask\u001b[39m=\u001b[39;49mattention_mask,\n\u001b[1;32m 775\u001b[0m past_key_values\u001b[39m=\u001b[39;49mpast_key_values,\n\u001b[1;32m 776\u001b[0m inputs_embeds\u001b[39m=\u001b[39;49minputs_embeds,\n\u001b[1;32m 777\u001b[0m use_cache\u001b[39m=\u001b[39;49muse_cache,\n\u001b[1;32m 778\u001b[0m output_attentions\u001b[39m=\u001b[39;49moutput_attentions,\n\u001b[1;32m 779\u001b[0m output_hidden_states\u001b[39m=\u001b[39;49moutput_hidden_states,\n\u001b[1;32m 780\u001b[0m return_dict\u001b[39m=\u001b[39;49mreturn_dict,\n\u001b[1;32m 781\u001b[0m )\n\u001b[1;32m 783\u001b[0m hidden_states \u001b[39m=\u001b[39m outputs[\u001b[39m0\u001b[39m]\n\u001b[1;32m 784\u001b[0m logits \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mlm_head(hidden_states)\n",
"File \u001b[0;32m~/miniconda3/envs/dl3/lib/python3.10/site-packages/torch/nn/modules/module.py:1488\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1483\u001b[0m \u001b[39m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1484\u001b[0m \u001b[39m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1485\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m (\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_pre_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1486\u001b[0m \u001b[39mor\u001b[39;00m _global_backward_pre_hooks \u001b[39mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1487\u001b[0m \u001b[39mor\u001b[39;00m _global_forward_hooks \u001b[39mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1488\u001b[0m \u001b[39mreturn\u001b[39;00m forward_call(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 1489\u001b[0m \u001b[39m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m 1490\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[39m=\u001b[39m [], []\n",
"File \u001b[0;32m~/miniconda3/envs/dl3/lib/python3.10/site-packages/accelerate/hooks.py:158\u001b[0m, in \u001b[0;36madd_hook_to_module.<locals>.new_forward\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 156\u001b[0m output \u001b[39m=\u001b[39m old_forward(\u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n\u001b[1;32m 157\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m--> 158\u001b[0m output \u001b[39m=\u001b[39m old_forward(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 159\u001b[0m \u001b[39mreturn\u001b[39;00m module\u001b[39m.\u001b[39m_hf_hook\u001b[39m.\u001b[39mpost_forward(module, output)\n",
"File \u001b[0;32m~/miniconda3/envs/dl3/lib/python3.10/site-packages/transformers/models/llama/modeling_llama.py:621\u001b[0m, in \u001b[0;36mLLaMAModel.forward\u001b[0;34m(self, input_ids, attention_mask, past_key_values, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[1;32m 614\u001b[0m layer_outputs \u001b[39m=\u001b[39m torch\u001b[39m.\u001b[39mutils\u001b[39m.\u001b[39mcheckpoint\u001b[39m.\u001b[39mcheckpoint(\n\u001b[1;32m 615\u001b[0m create_custom_forward(decoder_layer),\n\u001b[1;32m 616\u001b[0m hidden_states,\n\u001b[1;32m 617\u001b[0m attention_mask,\n\u001b[1;32m 618\u001b[0m \u001b[39mNone\u001b[39;00m,\n\u001b[1;32m 619\u001b[0m )\n\u001b[1;32m 620\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m--> 621\u001b[0m layer_outputs \u001b[39m=\u001b[39m decoder_layer(\n\u001b[1;32m 622\u001b[0m hidden_states,\n\u001b[1;32m 623\u001b[0m attention_mask\u001b[39m=\u001b[39;49mattention_mask,\n\u001b[1;32m 624\u001b[0m past_key_value\u001b[39m=\u001b[39;49mpast_key_value,\n\u001b[1;32m 625\u001b[0m output_attentions\u001b[39m=\u001b[39;49moutput_attentions,\n\u001b[1;32m 626\u001b[0m use_cache\u001b[39m=\u001b[39;49muse_cache,\n\u001b[1;32m 627\u001b[0m )\n\u001b[1;32m 629\u001b[0m hidden_states \u001b[39m=\u001b[39m layer_outputs[\u001b[39m0\u001b[39m]\n\u001b[1;32m 631\u001b[0m \u001b[39mif\u001b[39;00m use_cache:\n",
"File \u001b[0;32m~/miniconda3/envs/dl3/lib/python3.10/site-packages/torch/nn/modules/module.py:1488\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1483\u001b[0m \u001b[39m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1484\u001b[0m \u001b[39m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1485\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m (\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_pre_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1486\u001b[0m \u001b[39mor\u001b[39;00m _global_backward_pre_hooks \u001b[39mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1487\u001b[0m \u001b[39mor\u001b[39;00m _global_forward_hooks \u001b[39mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1488\u001b[0m \u001b[39mreturn\u001b[39;00m forward_call(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 1489\u001b[0m \u001b[39m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m 1490\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[39m=\u001b[39m [], []\n",
"File \u001b[0;32m~/miniconda3/envs/dl3/lib/python3.10/site-packages/accelerate/hooks.py:158\u001b[0m, in \u001b[0;36madd_hook_to_module.<locals>.new_forward\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 156\u001b[0m output \u001b[39m=\u001b[39m old_forward(\u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n\u001b[1;32m 157\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m--> 158\u001b[0m output \u001b[39m=\u001b[39m old_forward(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 159\u001b[0m \u001b[39mreturn\u001b[39;00m module\u001b[39m.\u001b[39m_hf_hook\u001b[39m.\u001b[39mpost_forward(module, output)\n",
"File \u001b[0;32m~/miniconda3/envs/dl3/lib/python3.10/site-packages/transformers/models/llama/modeling_llama.py:329\u001b[0m, in \u001b[0;36mLLaMADecoderLayer.forward\u001b[0;34m(self, hidden_states, attention_mask, output_attentions, use_cache, past_key_value)\u001b[0m\n\u001b[1;32m 327\u001b[0m residual \u001b[39m=\u001b[39m hidden_states\n\u001b[1;32m 328\u001b[0m hidden_states \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mpost_attention_layernorm(hidden_states)\n\u001b[0;32m--> 329\u001b[0m hidden_states \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mmlp(hidden_states)\n\u001b[1;32m 330\u001b[0m hidden_states \u001b[39m=\u001b[39m residual \u001b[39m+\u001b[39m hidden_states\n\u001b[1;32m 332\u001b[0m outputs \u001b[39m=\u001b[39m (hidden_states,)\n",
"File \u001b[0;32m~/miniconda3/envs/dl3/lib/python3.10/site-packages/torch/nn/modules/module.py:1488\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1483\u001b[0m \u001b[39m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1484\u001b[0m \u001b[39m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1485\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m (\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_pre_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1486\u001b[0m \u001b[39mor\u001b[39;00m _global_backward_pre_hooks \u001b[39mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1487\u001b[0m \u001b[39mor\u001b[39;00m _global_forward_hooks \u001b[39mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1488\u001b[0m \u001b[39mreturn\u001b[39;00m forward_call(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 1489\u001b[0m \u001b[39m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m 1490\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[39m=\u001b[39m [], []\n",
"File \u001b[0;32m~/miniconda3/envs/dl3/lib/python3.10/site-packages/accelerate/hooks.py:158\u001b[0m, in \u001b[0;36madd_hook_to_module.<locals>.new_forward\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 156\u001b[0m output \u001b[39m=\u001b[39m old_forward(\u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n\u001b[1;32m 157\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m--> 158\u001b[0m output \u001b[39m=\u001b[39m old_forward(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 159\u001b[0m \u001b[39mreturn\u001b[39;00m module\u001b[39m.\u001b[39m_hf_hook\u001b[39m.\u001b[39mpost_forward(module, output)\n",
"File \u001b[0;32m~/miniconda3/envs/dl3/lib/python3.10/site-packages/transformers/models/llama/modeling_llama.py:161\u001b[0m, in \u001b[0;36mLLaMAMLP.forward\u001b[0;34m(self, x)\u001b[0m\n\u001b[1;32m 160\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mforward\u001b[39m(\u001b[39mself\u001b[39m, x):\n\u001b[0;32m--> 161\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mdown_proj(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mact_fn(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mgate_proj(x)) \u001b[39m*\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mup_proj(x))\n",
"File \u001b[0;32m~/miniconda3/envs/dl3/lib/python3.10/site-packages/torch/nn/modules/module.py:1488\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1483\u001b[0m \u001b[39m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1484\u001b[0m \u001b[39m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1485\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m (\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_pre_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1486\u001b[0m \u001b[39mor\u001b[39;00m _global_backward_pre_hooks \u001b[39mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1487\u001b[0m \u001b[39mor\u001b[39;00m _global_forward_hooks \u001b[39mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1488\u001b[0m \u001b[39mreturn\u001b[39;00m forward_call(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 1489\u001b[0m \u001b[39m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m 1490\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[39m=\u001b[39m [], []\n",
"File \u001b[0;32m~/miniconda3/envs/dl3/lib/python3.10/site-packages/accelerate/hooks.py:158\u001b[0m, in \u001b[0;36madd_hook_to_module.<locals>.new_forward\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 156\u001b[0m output \u001b[39m=\u001b[39m old_forward(\u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n\u001b[1;32m 157\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m--> 158\u001b[0m output \u001b[39m=\u001b[39m old_forward(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 159\u001b[0m \u001b[39mreturn\u001b[39;00m module\u001b[39m.\u001b[39m_hf_hook\u001b[39m.\u001b[39mpost_forward(module, output)\n",
"File \u001b[0;32m~/miniconda3/envs/dl3/lib/python3.10/site-packages/bitsandbytes/nn/modules.py:242\u001b[0m, in \u001b[0;36mLinear8bitLt.forward\u001b[0;34m(self, x)\u001b[0m\n\u001b[1;32m 239\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mbias \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39mand\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mbias\u001b[39m.\u001b[39mdtype \u001b[39m!=\u001b[39m x\u001b[39m.\u001b[39mdtype:\n\u001b[1;32m 240\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mbias\u001b[39m.\u001b[39mdata \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mbias\u001b[39m.\u001b[39mdata\u001b[39m.\u001b[39mto(x\u001b[39m.\u001b[39mdtype)\n\u001b[0;32m--> 242\u001b[0m out \u001b[39m=\u001b[39m bnb\u001b[39m.\u001b[39;49mmatmul(x, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mweight, bias\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mbias, state\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mstate)\n\u001b[1;32m 243\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mstate\u001b[39m.\u001b[39mhas_fp16_weights:\n\u001b[1;32m 244\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mstate\u001b[39m.\u001b[39mCB \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39mand\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mstate\u001b[39m.\u001b[39mCxB \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m 245\u001b[0m \u001b[39m# we converted 8-bit row major to turing/ampere format in the first inference pass\u001b[39;00m\n\u001b[1;32m 246\u001b[0m \u001b[39m# we no longer need the row-major weight\u001b[39;00m\n",
"File \u001b[0;32m~/miniconda3/envs/dl3/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:488\u001b[0m, in \u001b[0;36mmatmul\u001b[0;34m(A, B, out, state, threshold, bias)\u001b[0m\n\u001b[1;32m 486\u001b[0m \u001b[39mif\u001b[39;00m threshold \u001b[39m>\u001b[39m \u001b[39m0.0\u001b[39m:\n\u001b[1;32m 487\u001b[0m state\u001b[39m.\u001b[39mthreshold \u001b[39m=\u001b[39m threshold\n\u001b[0;32m--> 488\u001b[0m \u001b[39mreturn\u001b[39;00m MatMul8bitLt\u001b[39m.\u001b[39;49mapply(A, B, out, bias, state)\n",
"File \u001b[0;32m~/miniconda3/envs/dl3/lib/python3.10/site-packages/torch/autograd/function.py:508\u001b[0m, in \u001b[0;36mFunction.apply\u001b[0;34m(cls, *args, **kwargs)\u001b[0m\n\u001b[1;32m 505\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m torch\u001b[39m.\u001b[39m_C\u001b[39m.\u001b[39m_are_functorch_transforms_active():\n\u001b[1;32m 506\u001b[0m \u001b[39m# See NOTE: [functorch vjp and autograd interaction]\u001b[39;00m\n\u001b[1;32m 507\u001b[0m args \u001b[39m=\u001b[39m _functorch\u001b[39m.\u001b[39mutils\u001b[39m.\u001b[39munwrap_dead_wrappers(args)\n\u001b[0;32m--> 508\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39msuper\u001b[39;49m()\u001b[39m.\u001b[39;49mapply(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 510\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mcls\u001b[39m\u001b[39m.\u001b[39msetup_context \u001b[39m==\u001b[39m _SingleLevelFunction\u001b[39m.\u001b[39msetup_context:\n\u001b[1;32m 511\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mRuntimeError\u001b[39;00m(\n\u001b[1;32m 512\u001b[0m \u001b[39m'\u001b[39m\u001b[39mIn order to use an autograd.Function with functorch transforms \u001b[39m\u001b[39m'\u001b[39m\n\u001b[1;32m 513\u001b[0m \u001b[39m'\u001b[39m\u001b[39m(vmap, grad, jvp, jacrev, ...), it must override the setup_context \u001b[39m\u001b[39m'\u001b[39m\n\u001b[1;32m 514\u001b[0m \u001b[39m'\u001b[39m\u001b[39mstaticmethod. For more details, please see \u001b[39m\u001b[39m'\u001b[39m\n\u001b[1;32m 515\u001b[0m \u001b[39m'\u001b[39m\u001b[39mhttps://pytorch.org/docs/master/notes/extending.func.html\u001b[39m\u001b[39m'\u001b[39m)\n",
"File \u001b[0;32m~/miniconda3/envs/dl3/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:397\u001b[0m, in \u001b[0;36mMatMul8bitLt.forward\u001b[0;34m(ctx, A, B, out, bias, state)\u001b[0m\n\u001b[1;32m 395\u001b[0m \u001b[39m# 4. Mixed-precision decomposition matmul\u001b[39;00m\n\u001b[1;32m 396\u001b[0m \u001b[39mif\u001b[39;00m coo_tensorA \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39mand\u001b[39;00m subA \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m--> 397\u001b[0m output \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m torch\u001b[39m.\u001b[39;49mmatmul(subA, state\u001b[39m.\u001b[39;49msubB)\n\u001b[1;32m 399\u001b[0m \u001b[39m# 5. Save state\u001b[39;00m\n\u001b[1;32m 400\u001b[0m ctx\u001b[39m.\u001b[39mstate \u001b[39m=\u001b[39m state\n",
"\u001b[0;31mOutOfMemoryError\u001b[0m: CUDA out of memory. Tried to allocate 22.00 MiB (GPU 0; 23.65 GiB total capacity; 19.92 GiB already allocated; 41.31 MiB free; 20.40 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF"
]
}
],
"source": [
"data = load_dataset(\"laion/OIG\", streaming=True)\n",
"data = data.shuffle(seed=42).map(\n",
" lambda x: tokenizer(\n",
" x[\"text\"], truncation=True, max_length=256, padding=\"max_length\"\n",
" ),\n",
" batched=True,\n",
")\n",
"\n",
"trainer = transformers.Trainer(\n",
" model=model,\n",
" train_dataset=data[\"train\"],\n",
" args=transformers.TrainingArguments(\n",
" per_device_train_batch_size=4,\n",
" gradient_accumulation_steps=4,\n",
" warmup_steps=1000,\n",
" max_steps=6000,\n",
" learning_rate=2e-4,\n",
" fp16=True,\n",
" logging_steps=1,\n",
" output_dir=\"checkpoints\",\n",
" save_total_limit=3,\n",
" ),\n",
" data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),\n",
")\n",
"model.config.use_cache = False # silence the warnings. Please re-enable for inference!\n",
"trainer.train()\n",
"\n",
"model.save_pretrained(\"outputs\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "dl3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.8"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "90bfda469df5ac7fed8d7e225d563f60a7a7aa420ccfadb091c914debf775e49"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
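
The second cell above dies with a CUDA out-of-memory error inside the 8-bit matmul, and the error text itself points at PYTORCH_CUDA_ALLOC_CONF. A minimal sketch of the usual mitigations, assuming the same single 24 GiB GPU; the specific numbers are illustrative, not values taken from this repo:

```
# Hedged sketch: mitigations for the OOM hit in the notebook's training cell.
# Set the allocator option before importing torch, as the error message
# suggests, and shrink the per-device batch while keeping the effective
# batch size via gradient accumulation. The concrete values are assumptions.
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

import transformers

training_args = transformers.TrainingArguments(
    per_device_train_batch_size=1,   # was 4 in the notebook
    gradient_accumulation_steps=16,  # keeps the effective batch at 16
    warmup_steps=1000,
    max_steps=6000,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=1,
    output_dir="checkpoints",
    save_total_limit=3,
)
```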

lengths.ipynb Normal file (155 lines)

File diff suppressed because one or more lines are too long.