alpaca-lora/lengths.ipynb
2023-03-13 21:53:19 -07:00

158 lines
18 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/eric/miniconda3/envs/dl3/lib/python3.10/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"Found cached dataset json (/home/eric/.cache/huggingface/datasets/json/default-789f51900889f651/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n",
"100%|██████████| 1/1 [00:00<00:00, 742.35it/s]\n",
" \r"
]
}
],
"source": [
"from datasets import load_dataset\n",
"from transformers import LLaMATokenizer\n",
"\n",
"\n",
"tokenizer = LLaMATokenizer.from_pretrained(\"./7B/tokenizer\")\n",
"tokenizer.pad_token = tokenizer.eos_token\n",
"tokenizer.pad_token_id = tokenizer.eos_token_id\n",
"\n",
"\"\"\"\n",
"data = load_dataset(\"laion/OIG\", streaming=True)\n",
"data = data.map(\n",
" lambda x: tokenizer(\n",
" x[\"text\"],\n",
" ),\n",
")\n",
"\n",
"import itertools\n",
"\n",
"lens = [len(x[\"input_ids\"]) for x in itertools.islice(data[\"train\"], 1000)]\n",
"\"\"\"\n",
"\n",
"data = load_dataset(\"json\", data_files=\"alpaca_data.json\")\n",
"\n",
"\n",
"def generate_prompt(data_point):\n",
" # sorry about the formatting disaster gotta move fast\n",
" if data_point[\"instruction\"]:\n",
" return f\"\"\"Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n",
"\n",
"### Instruction:\n",
"{data_point[\"instruction\"]}\n",
"\n",
"### Input:\n",
"{data_point[\"input\"]}\n",
"\n",
"### Response:\n",
"{data_point[\"output\"]}\"\"\"\n",
" else:\n",
" return f\"\"\"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n",
"\n",
"### Instruction:\n",
"{data_point[\"instruction\"]}\n",
"\n",
"### Response:\n",
"{data_point[\"output\"]}\"\"\"\n",
"\n",
"\n",
"data = data.map(lambda data_point: {\"prompt\": tokenizer(generate_prompt(data_point))})"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(array([5.0967e+04, 8.9800e+02, 1.1400e+02, 8.0000e+00, 9.0000e+00,\n",
" 5.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00]),\n",
" array([ 56., 116., 176., 236., 296., 356., 416., 476., 536., 596., 656.]),\n",
" <BarContainer object of 10 artists>)"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjkAAAGdCAYAAADwjmIIAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAn+klEQVR4nO3df1DU953H8Rc/3AV/7BJ/AHLij45plPorgoFtkt55cm5T0qsN3qjnpdSYdvTQi5D6q/XQZHqHY+YazfirrdPgzMXzx81po0QsgxEvdeMPlAZspKYhhy1ZILWwShWU/d4fGb5xBRPxF/Lx+ZjZmbjf93757GdSfM5mv9+GWZZlCQAAwDDh3b0AAACAu4HIAQAARiJyAACAkYgcAABgJCIHAAAYicgBAABGInIAAICRiBwAAGCkyO5eQHcKBoOqra1Vv379FBYW1t3LAQAAN8GyLF24cEEJCQkKD7/x5zUPdOTU1tYqMTGxu5cBAABuwblz5zRkyJAbHn+gI6dfv36SPt0kl8vVzasBAAA3IxAIKDEx0f57/EYe6Mhp/09ULpeLyAEAoIf5oq+a8MVjAABgJCIHAAAYicgBAABGInIAAICRiBwAAGAkIgcAABiJyAEAAEYicgAAgJGIHAAAYCQiBwAAGInIAQAARiJyAACAkYgcAABgJCIHAAAYKbK7F2Cq4csKu3sJXfbR6ozuXgIAAHcMn+QAAAAjETkAAMBIXYqcVatWKSwsLOQxatQo+/jly5eVnZ2tAQMGqG/fvsrMzFRdXV3IOWpqapSRkaHevXsrNjZWixcv1tWrV0NmDh06pIkTJ8rpdGrkyJEqKCjosJYNGzZo+PDhioqKUmpqqo4dO9aVtwIAAAzX5U9yvvKVr+jjjz+2H++88459LCcnR3v37tWuXbtUWlqq2tpaPfPMM/bxtrY2ZWRkqLW1VUeOHNHWrVtVUFCgvLw8e6a6uloZGRmaPHmyysvLtWjRIj3//PM6cOCAPbNjxw7l5uZq5cqVOnnypMaPHy+v16v6+vpb3QcAAGCYMMuyrJsdXrVqlfbs2aPy8vIOx5qamjRo0CBt27ZN06dPlySdOXNGo0ePls/nU1pamvbv36+nn35atbW1iouLkyRt3rxZS5cuVUNDgxwOh5YuXarCwkJVVlba5545c6YaGxtVVFQkSUpNTdWkSZO0fv16SVIwGFRiYqIWLlyoZcuW3fSbDwQCcrvdampqksvluunX3Qy+eAwAwN1xs39/d/mTnLNnzyohIUFf+tKXNHv2bNXU1EiSysrKdOXKFaWnp9uzo0aN0tChQ+Xz+SRJPp9PY8eOtQNHkrxerwKBgE6fPm3PXHuO9pn2c7S2tqqsrCxkJjw8XOnp6fbMjbS0tCgQCIQ8AACAmboUOampqSooKFBRUZE2bdqk6upqPfnkk7pw4YL8fr8cDodiYmJCXhMXFye/3y9J8vv9IYHTfrz92OfNBAIBXbp0SZ988ona2to6nWk/x43k5+fL7Xbbj8TExK68fQAA0IN06T45Tz31lP3P48aNU2pqqoYNG6adO3cqOjr6ji/uTlu+fLlyc3PtPwcCAUIHAABD3dYl5DExMfryl7+sDz74QPHx8WptbVVjY2PITF1dneLj4yVJ8fHxHa62av/zF824XC5FR0dr4MCBioiI6HSm/Rw34nQ65XK5Qh4AAMBMtxU5Fy9e1O9//3sNHjxYycnJ6tWrl0pKSuzjVVVVqqmpkcfjkSR5PB5VVFSEXAVVXFwsl8ulpKQke+bac7TPtJ/D4XAoOTk5ZCYYDKqkpMSeAQAA6FLk/OAHP1Bpaak++ugjHTlyRN/+9rcVERGhWbNmye12a+7cucrNzdXbb7+tsrIyzZkzRx6PR2lpaZKkqVOnKikpSc8++6x+85vf6MCBA1qxYoWys7PldDolSfPmzdOHH36oJUuW6MyZM9q4caN27typnJwcex25ubn6+c9/rq1bt+r999/X/Pnz1dzcrDlz5tzBrQEAAD1Zl76T84c//EGzZs3Sn/70Jw0aNEhPPPGE3n33XQ0aNEiS9Oqrryo8PFyZmZlqaWmR1+vVxo0b7ddHRERo3759mj9/vjwej/r06aOsrCy9/PLL9syIESNUWFionJwcrVu3TkOGDNGWLVvk9XrtmRkzZqihoUF5eXny+/2aMGGCioqKOnwZGQAAPLi6dJ8c03CfnFDcJwcA0BPctfvkAAAA9AREDgAAMBKRAwAAjETkAAAAIxE5AADASEQOAAAwEpEDAACMROQAAAAjETkAAMBIRA4AADASkQMAAIxE5AAAACMROQAAwEhEDgAAMBKRAwAAjETkAAAAIxE5AADASEQOAAAwEpEDAACMROQAAAAjETkAAMBIRA4AADASkQMAAIxE5AAAACMROQAAwEhEDgAAMBKRAwAAjETkAAAAIxE5AADASEQOAAAwEpEDAACMROQAAAAjETkAAMBIRA4AADASkQMAAIxE5AAAACMROQAAwEhEDgAAMBKRAwAAjETkAAAAIxE5AADASEQOAAAwEpEDAACMROQAAAAjETkAAMBIRA4AADASkQMAAIxE5AAAACMROQAAwEhEDgAAMBKRAwAAjETkAAAAIxE5AADASEQOAAAwEpEDAACMROQAAAAjETkAAMBIRA4AADDSbUXO6tWrFRYWpkWLFtnPXb58WdnZ2RowYID69u2rzMxM1dXVhbyupqZGGRkZ6t27t2JjY7V48WJdvXo1ZObQoUOaOHGinE6nRo4cqYKCgg4/f8OGDRo+fLiioqKUmpqqY8eO3c7bAQAABrnlyDl+/Lh++tOfaty4cSHP5+TkaO/evdq1a5dKS0tVW1urZ555xj7e1tamjIwMtba26siRI9q6dasKCgqUl5dnz1RXVysjI0OTJ09WeXm5Fi1apOeff14HDhywZ3bs2KHc3FytXLlSJ0+e1Pjx4+X1elVfX3+rbwkAABgkzLIsq6svunjxoiZOnKiNGzfqxz/+sSZMmKC1a9eqqalJgwYN0rZt2zR9+nRJ0pkzZzR69Gj5fD6lpaVp//79evrpp1VbW6u4uDhJ0ubNm7V06VI1NDTI4XBo6dKlKiwsVGVlpf0zZ86cqcbGRhUVFUmSUlNTNWnSJK1fv16SFAwGlZiYqIULF2rZsmU39T4CgYDcbreamprkcrm6ug2fa/iywjt6vnvho9UZ3b0EAAC+0M3+/X1Ln+RkZ2crIyND6enpIc+XlZXpypUrIc+PGjVKQ4cOlc/nkyT5fD6NHTvWDhxJ8nq9CgQCOn36tD1z/bm9Xq99jtbWVpWVlYXMhIeHKz093Z7pTEtLiwKBQMgDAACYKbKrL9i+fbtOnjyp48ePdzjm9/vlcDgUExMT8nxcXJz8fr89c23gtB9vP/Z5M4FAQJcuXdKf//xntbW1dTpz5syZG649Pz9fL7300s29UQAA0KN16ZOcc+fO6YUXXtAbb7yhqKiou7Wmu2b58uVqamqyH+fOnevuJQEAgLukS5FTVlam+vp6TZw4UZGRkYqMjFRpaalee+01RUZGKi4uTq2trWpsbAx5XV1dneLj4yVJ8fHxHa62av/zF824XC5FR0dr4MCBioiI6HSm/RydcTqdcrlcIQ8AAGCmLkXOlClTVFFRofLycvuRkpKi2bNn2//cq1cvlZSU2K+pqqpSTU2NPB6PJMnj8aiioiLkKqji4mK5XC4lJSXZM9eeo32m/RwOh0PJyckhM8FgUCUlJfYMAAB4sHXpOzn9+vXTmDFjQp7r06ePBgwYYD8/d+5c5ebmqn///nK5XFq4cKE8Ho/S0tIkSVOnTlVSUpKeffZZrVmzRn6/XytWrFB2dracTqckad68eVq/fr2WLFmi5557TgcPHtTOnTtVWPjZFUu5ubnKyspSSkqKHnvsMa1du1bNzc2aM2fObW0IAAAwQ5e/ePxFXn31VYWHhyszM1MtLS3yer3auHGjfTwiIkL79u3T/Pnz5fF41KdPH2VlZenll1+2Z0aMGKHCwkLl5ORo3bp1GjJkiLZs2SKv12vPzJgxQw0NDcrLy5Pf79eECRNUVFTU4cvIAADgwXRL98kxBffJCcV9cgAAPcFdvU8OAADA/Y7IAQAARiJyAACAkYgcAABgJCIHAAAYicgBAABGInIAAICRiBwAAGAkIgcAABiJyAEAAEYicgAAgJGIHAAAYCQiBwAAGInIAQAARiJyAACAkYgcAABgJCIHAAAYicgBAABGInIAAICRiBwAAGAkIgcAABiJyAEAAEYicgAAgJGIHAAAYCQiBwAAGInIAQAARiJyAACAkYgcAABgJCIHAAAYicgBAABGInIAAICRiBwAAGAkIgcAABiJyAEAAEYicgAAgJGIHAAAYCQiBwAAGInIAQAARiJyAACAkYgcAABgJCIHAAAYicgBAABGInIAAICRiBwAAGAkIgcAABiJyAEAAEYicgAAgJGIHAAAYCQiBwAAGInIAQAARiJyAACAkYgcAABgJCIHAAAYicgBAABGInIAAICRiBwAAGAkIgcAABiJyAEAAEbqUuRs2rRJ48aNk8vlksvlksfj0f79++3jly9fVnZ2tgYMGKC+ffsqMzNTdXV1IeeoqalRRkaGevfurdjYWC1evFhXr14NmTl06JAmTpwop9OpkSNHqqCgoMNaNmzYoOHDhysqKkqpqak6duxYV94KAAAwXJciZ8iQIVq9erXKysp04sQJ/e3f/q2+9a1v6fTp05KknJwc7d27V7t27VJpaalqa2v1zDPP2K9va2tTRkaGWltbdeTIEW3dulUFBQXKy8uzZ6qrq5WRkaHJkyervLxcixYt0vPPP68DBw7YMzt27FBubq5WrlypkydPavz48fJ6vaqvr7/d/QAAAIYIsyzLup0T9O/fX6+88oqmT5+uQYMGadu2bZo+fbok6cyZMxo9erR8Pp/S0tK0f/9+Pf3006qtrVVcXJwkafPmzVq6dKkaGhrkcDi0dOlSFRYWqrKy0v4ZM2fOVGNjo4qKiiRJqampmjRpktavXy9JCgaDSkxM1MKFC7Vs2bKbXnsgEJDb7VZTU5NcLtftbEMHw5cV3tHz3Qsfrc7o7iUAAPCFbvbv71v+Tk5bW5u2b9+u5uZmeTwelZWV6cqVK0pPT7dnRo0apaFDh8rn80mSfD6fxo4daweOJHm9XgUCAfvTIJ/PF3KO9pn2c7S2tqqsrCxkJjw8XOnp6fbMjbS0tCgQCIQ8AACAmbocORUVFerbt6+cTqfmzZun3bt3KykpSX6/Xw6HQzExMSHzcXFx8vv9kiS/3x8SOO3H24993kwgENClS5f0ySefqK2trdOZ9nPcSH5+vtxut/1ITEzs6tsHAAA9RJcj55FHHlF5ebmOHj2q+fPnKysrS7/97W/vxtruuOXLl6upqcl+nDt3rruXBAAA7pLIrr7A4XBo5MiRkqTk5GQdP35c69at04wZM9Ta2qrGxsaQT3Pq6uoUHx8vSYqPj+9wFVT71VfXzlx/RVZdXZ1cLpeio6MVERGhiIiITmfaz3EjTqdTTqezq28ZAAD0QLd9n5xgMKiWlhYlJyerV69eKikpsY9VVVWppqZGHo9HkuTxeFRRURFyFVRxcbFcLpeSkpLsmWvP0T7Tfg6Hw6Hk5OSQmWAwqJKSEnsGAACgS5/kLF++XE899ZSGDh2qCxcuaNu2bTp06JAOHDggt9utuXPnKjc3V/3795fL5dLChQvl8XiUlpYmSZo6daqSkpL07LPPas2aNfL7/VqxYoWys7PtT1jmzZun9evXa8mSJXruued08OBB7dy5U4WFn12tlJubq6ysLKWkpOixxx7T2rVr1dzcrDlz5tzBrQEAAD1ZlyKnvr5e3/nOd/Txxx/L7XZr3LhxOnDggP7u7/5OkvTqq68qPDxcmZmZamlpkdfr1caNG+3XR0REaN++fZo/f748Ho/69OmjrKwsvfzyy/bMiBEjVFhYqJycHK1bt05DhgzRli1b5PV67ZkZM2aooaFBeXl58vv9mjBhgoqKijp8GRkAADy4bvs+OT0Z98kJxX1yAAA9wV2/Tw4AAMD9jMgBAABGInIAAICRiBwAAGAkIgcAABiJyAEAAEYicgAAgJGIHAAAYCQiBwAAGInIAQAARiJyAACAkYgcAABgJCIHAAAYicgBAABGInIAAICRiBwAAGAkIgcAABiJyAEAAEYicgAAgJGIHAAAYCQiBwAAGInIAQAARiJyAACAkYgcAABgJCIHAAAYicgBAABGInIAAICRiBwAAGAkIgcAABiJyAEAAEYicgAAgJGIHAAAYCQiBwAAGInIAQAARiJyAACAkYgcAABgJCIHAAAYicgBAABGInIAAICRiBwAAGAkIgcAABiJyAEAAEYicgAAgJGIHAAAYCQiBwAAGInIAQAARiJyAACAkYgcAABgJCIHAAAYicgBAABGInIAAICRiBwAAGAkIgcAABiJyAEAAEYicgAAgJGIHAAAYCQiBwAAGInIAQAARupS5OTn52vSpEnq16+fYmNjNW3aNFVVVYXMXL58WdnZ2RowYID69u2rzMxM1dXVhczU1NQoIyNDvXv3VmxsrBYvXqyrV6+GzBw6dEgTJ06U0+nUyJEjVVBQ0GE9GzZs0PDhwxUVFaXU1FQdO3asK28HAAAYrEuRU1paquzsbL377rsqLi7WlStXNHXqVDU3N9szOTk52rt3r3bt2qXS0lLV1tbqmWeesY+3tbUpIyNDra2tOnLkiLZu3aqCggLl5eXZM9XV1crIyNDkyZNVXl6uRYsW6fnnn9eBAwfsmR07dig3N1crV67UyZMnNX78eHm9XtXX19/OfgAAAEOEWZZl3eqLGxoaFBsbq9LSUn3ta19TU1OTBg0apG3btmn69OmSpDNnzmj06NHy+XxKS0vT/v379fTTT6u2tlZxcXGSpM2bN2vp0qVqaGiQw+HQ0qVLVVhYqMrKSvtnzZw5U42NjSoqKpIkpaamatKkSVq/fr0kKRgMKjExUQsXLtSyZctuav2BQEBut1tNTU1yuVy3ug2dGr6s8I6e7174aHVGdy8BAIAvdLN/f9/Wd3KampokSf3795cklZWV6cqVK0pPT7dnRo0apaFDh8rn80mSfD6fxo4daweOJHm9XgUCAZ0+fdqeufYc7TPt52htbVVZWVnITHh4uNLT0+2ZzrS0tCgQCIQ8AACAmW45coLBoBYtWqTHH39cY8aMkST5/X45HA7FxMSEzMbFxcnv99sz1wZO+/H2Y583EwgEdOnSJX3yySdqa2vrdKb9HJ3Jz8+X2+22H4mJiV1/4wAAoEe45cjJzs5WZWWltm/ffifXc1ctX75cTU1N9uPcuXPdvSQAAHCXRN7KixYsWKB9+/bp8OHDGjJkiP18fHy8Wltb1djYGPJpTl1dneLj4+2Z66+Car/66tqZ66/Iqqurk8vlUnR0tCIiIhQREdHpTPs5OuN0OuV0Orv+hgEAQI/TpU9yLMvSggULtHv3bh08eFAjRowIOZ6cnKxevXqppKTEfq6qqko1NTXyeDySJI/Ho4qKipCroIqLi+VyuZSUlGTPXHuO9pn2czgcDiUnJ4fMBINBlZSU2DMAAODB1qVPcrKzs7Vt2zb98pe/VL9+/ezvv7jdbkVHR8vtdmvu3LnKzc1V//795XK5tHDhQnk8HqWlpUmSpk6dqqSkJD377LNas2aN/H6/VqxYoezsbPtTlnnz5mn9+vVasmSJnnvuOR08eFA7d+5UYeFnVyzl5uYqKytLKSkpeuyxx7R27Vo1Nzdrzpw5d2pvAABAD9alyNm0aZMk6W/+5m9Cnn/99df13e9+V5L06quvKjw8XJmZmWppaZHX69XGjRvt2YiICO3bt0/z58+Xx+NRnz59lJWVpZdfftmeGTFihAoLC5WTk6N169ZpyJAh2rJli7xerz0zY8YMNTQ0KC8vT36/XxMmTFBRUVGHLyMDAIAH023dJ6en4z45obhPDgCgJ7gn98kBAAC4XxE5AADASEQOAAAwEpEDAACMROQAAAAjETkAAMBIRA4AADASkQMAAIxE5AAAACMROQAAwEhEDgAAMBKRAwAAjETkAAAAIxE5AADASEQOAAAwEpEDAACMROQAAAAjETkAAMBIRA4AADASkQMAAIxE5AAAACMROQAAwEhEDgAAMBKRAwAAjETkAAAAIxE5AADASEQOAAAwEpEDAACMROQAAAAjETkAAMBIRA4AADASkQMAAIxE5AAAACMROQAAwEhEDgAAMBKRAwAAjETkAAAAIxE5AADASEQOAAAwEpEDAACMROQAAAAjETkAAMBIRA4AADASkQMAAIxE5AAAACMROQAAwEhEDgAAMBKRAwAAjETkAAAAIxE5AADASEQOAAAwEpEDAACMROQAAAAjETkAAMBIRA4AADASkQMAAIxE5AAAACMROQAAwEhdjpzDhw/rm9/8phISEhQWFqY9e/aEHLcsS3l5eRo8eLCio6OVnp6us2fPhsycP39es2fPlsvlUkxMjObOnauLFy+GzLz33nt68sknFRUVpcTERK1Zs6bDWnbt2qVRo0YpKipKY8eO1VtvvdXVtwMAAAzV5chpbm7W+PHjtWHDhk6Pr1mzRq+99po2b96so0ePqk+fPvJ6vbp8+bI9M3v2bJ0+fVrFxcXat2+fDh8+rO9///v28UAgoKlTp2rYsGEqKyvTK6+8olWrVulnP/uZPXPkyBHNmjVLc+fO1alTpzRt2jRNmzZNlZWVXX1LAADAQGGWZVm3/OKwMO3evVvTpk2T9OmnOAkJCXrxxRf1gx/8QJLU1NSkuLg4FRQUaObMmXr//feVlJSk48ePKyUlRZJUVFSkb3zjG/rDH/6ghIQEbdq0ST/60Y/k9/vlcDgkScuWLdOePXt05swZSdKMGTPU3Nysffv22etJS0vThAkTtHnz5ptafyAQkNvtVlNTk1wu161uQ6eGLyu8o+e7Fz5andHdSwAA4Avd7N/fd/Q7OdXV1fL7/UpPT7efc7vdSk1Nlc/nkyT5fD7FxMTYgSNJ6enpCg8P19GjR+2Zr33ta3bgSJLX61VVVZX+/Oc/2zPX/pz2mfaf05mWlhYFAoGQBwAAMNMdjRy/3y9JiouLC3k+Li7OPub3+xUbGxtyPDIyUv379w+Z6ewc1/6MG820H+9Mfn6+3G63/UhMTOzqWwQAAD3EA3V11fLly9XU1GQ/zp07191LAgAAd8kdjZz4+HhJUl1dXcjzdXV19rH4+HjV19eHHL969arOnz8fMtPZOa79GTeaaT/eGafTKZfLFfIAAABmuqORM2LECMXHx6ukpMR+LhAI6OjRo/J4PJIkj8ejxsZGlZWV2TMHDx5UMBhUamqqPXP48GFduXLFnikuLtYjjzyihx56yJ659ue0z7T/HAAA8GDrcuRcvHhR5eXlKi8vl/Tpl43Ly8tVU1OjsLAwLVq0SD/+8Y/15ptvqqKiQt/5zneUkJBgX4E1evRoff3rX9f3vvc9HTt2TL/+9a+1YMECzZw5UwkJCZKkf/zHf5TD4dDcuXN1+vRp7dixQ+vWrVNubq69jhdeeEFFRUX6j//4D505c0arVq3SiRMntGDBgtvfFQAA0ONFdvUFJ06c0OTJk+0/t4dHVlaWCgoKtGTJEjU3N+v73/++Ghsb9cQTT6ioqEhRUVH2a9544w0tWLBAU6ZMUXh4uDIzM/Xaa6/Zx91ut371q18pOztbycnJGjhwoPLy8kLupfPVr35V27Zt04oVK/TDH/5QDz/8sPbs2aMxY8bc0kYAAACz3NZ9cno67pMTivvkAAB6gm65Tw4AAMD9gsgBAABGInIAAICRiBwAAGAkIgcAABiJyAEAAEYicgAAgJGIHAAAYCQiBwAAGInIAQAARiJyAACAkYgcAABgJCIHAAAYicgBAABGInIAAICRiBwAAGAkIgcAABiJyAEAAEYicgAAgJGIHAAAYCQiBwAAGInIAQAARiJyAACAkYgcAABgJCIHAAAYicgBAABGInIAAICRiBwAAGAkIgcAABiJyAEAAEYicgAAgJGIHAAAYCQiBwAAGInIAQAARiJyAACAkYgcAABgJCIHAAAYicgBAABGInIAAICRiBwAAGAkIgcAABiJyAEAAEYicgAAgJGIHAAAYCQiBwAAGInIAQAARiJyAACAkYgcAABgJCIHAAAYicgBAABGInIAAICRiBwAAGAkIgcAABiJyAEAAEYicgAAgJGIHAAAYKTI7l4A7h/DlxV29xK67KPVGd29BADAfYpPcgAAgJF6fORs2LBBw4cPV1RUlFJTU3Xs2LHuXhIAALgP9OjI2bFjh3Jzc7Vy5UqdPHlS48ePl9frVX19fXcvDQAAdLMeHTk/+clP9L3vfU9z5sxRUlKSNm/erN69e+sXv/hFdy8NAAB0sx77xePW1laVlZVp+fLl9nPh4eFKT0+Xz+fr9DUtLS1qaWmx/9zU1CRJCgQCd3x9wZa/3PFzoqOhObu6ewldVvmSt7uXAAA9Wvvf25Zlfe5cj42cTz75RG1tbYqLiwt5Pi4uTmfOnOn0Nfn5+XrppZc6PJ+YmHhX1gh0xr22u1cAAGa4cOGC3G73DY/32Mi5FcuXL1dubq7952AwqPPnz2vAgAEKCwvrxpXdeYFAQImJiTp37pxcLld3L+e+xl51Dft189irm8de3Tz26tNPcC5cuKCEhITPneuxkTNw4EBFRESorq4u5Pm6ujrFx8d3+hqn0ymn0xnyXExMzN1a4n3B5XI9sP8j6Cr2qmvYr5vHXt089urmPeh79Xmf4LTrsV88djgcSk5OVklJif1cMBhUSUmJPB5PN64MAADcD3rsJzmSlJubq6ysLKWkpOixxx7T2rVr1dzcrDlz5nT30gAAQDfr0ZEzY8YMNTQ0KC8vT36/XxMmTFBRUVGHLyM/iJxOp1auXNnhP8+hI/aqa9ivm8de3Tz26uaxVzcvzPqi668AAAB6oB77nRwAAIDPQ+QAAAAjETkAAMBIRA4AADASkdPDHD58WN/85jeVkJCgsLAw7dmzJ+S4ZVnKy8vT4MGDFR0drfT0dJ09ezZk5vz585o9e7ZcLpdiYmI0d+5cXbx48R6+i7svPz9fkyZNUr9+/RQbG6tp06apqqoqZOby5cvKzs7WgAED1LdvX2VmZna4uWRNTY0yMjLUu3dvxcbGavHixbp69eq9fCt33aZNmzRu3Dj7xmIej0f79++3j7NPN7Z69WqFhYVp0aJF9nPs12dWrVqlsLCwkMeoUaPs4+xVqD/+8Y/6p3/6Jw0YMEDR0dEaO3asTpw4YR/n9/stsNCjvPXWW9aPfvQj63/+538sSdbu3btDjq9evdpyu93Wnj17rN/85jfW3//931sjRoywLl26ZM98/etft8aPH2+9++671v/+7/9aI0eOtGbNmnWP38nd5fV6rddff92qrKy0ysvLrW984xvW0KFDrYsXL9oz8+bNsxITE62SkhLrxIkTVlpamvXVr37VPn716lVrzJgxVnp6unXq1CnrrbfesgYOHGgtX768O97SXfPmm29ahYWF1u9+9zurqqrK+uEPf2j16tXLqqystCyLfbqRY8eOWcOHD7fGjRtnvfDCC/bz7NdnVq5caX3lK1+xPv74Y/vR0NBgH2evPnP+/Hlr2LBh1ne/+13r6NGj1ocffmgdOHDA+uCDD+wZfr93HZHTg10fOcFg0IqPj7deeeUV+7nGxkbL6XRa//Vf/2VZlmX99re/tSRZx48ft2f2799vhYWFWX/84x/v2drvtfr6ekuSVVpaalnWp/vSq1cva9euXfbM+++/b0myfD6fZVmfBmV4eLjl9/vtmU2bNlkul8tqaWm5t2/gHnvooYesLVu2sE83cOHCBevhhx+2iouLrb/+67+2I4f9CrVy5Upr/PjxnR5jr0ItXbrUeuKJJ254nN/vt4b/XGWQ6upq+f1+paen28+53W6lpqbK5/NJknw+n2JiYpSSkmLPpKenKzw8XEePHr3na75XmpqaJEn9+/eXJJWVlenKlSshezVq1CgNHTo0ZK/Gjh0bcnNJr9erQCCg06dP38PV3zttbW3avn27mpub5fF42KcbyM7OVkZGRsi+SPx71ZmzZ88qISFBX/rSlzR79mzV1NRIYq+u9+abbyolJUX/8A//oNjYWD366KP6+c9/bh/n9/utIXIM4vf7JanDHZ/j4uLsY36/X7GxsSHHIyMj1b9/f3vGNMFgUIsWLdLjjz+uMWPGSPp0HxwOR4f/g9br96qzvWw/ZpKKigr17dtXTqdT8+bN0+7du5WUlMQ+dWL79u06efKk8vPzOxxjv0KlpqaqoKBARUVF2rRpk6qrq/Xkk0/qwoUL7NV1PvzwQ23atEkPP/ywDhw4oPnz5+tf/uVftHXrVkn8fr9VPfr/1gG4GdnZ2aqsrNQ777zT3Uu5bz3yyCMqLy9XU1OT/vu//1tZWVkqLS3t7mXdd86dO6cXXnhBxcXFioqK6u7l3Peeeuop+5/HjRun1NRUDRs2TDt37lR0dHQ3ruz+EwwGlZKSon//93+XJD366KOqrKzU5s2blZWV1c2r67n4JMcg8fHxktTh6oS6ujr7WHx8vOrr60OOX716VefPn7dnTLJgwQLt27dPb7/9toYMGWI/Hx8fr9bWVjU2NobMX79Xne1l+zGTOBwOjRw5UsnJycrPz9f48eO1bt069uk6ZWVlqq+v18SJExUZGanIyEiVlpbqtddeU2RkpOLi4tivzxETE6Mvf/nL+uCDD/h36zqDBw9WUlJSyHOjR4+2//Mev99vDZFjkBEjRig+Pl4lJSX2c4FAQEePHpXH45EkeTweNTY2qqyszJ45ePCggsGgUlNT7/ma7xbLsrRgwQLt3r1bBw8e1IgRI0KOJycnq1evXiF7VVVVpZqampC9qqioCPmlUVxcLJfL1eGXkWmCwaBaWlrYp+tMmTJFFRUVKi8vtx8pKSmaPXu2/c/s141dvHhRv//97zV48GD+3brO448/3uE2F7/73e80bNgwSfx+v2Xd/c1ndM2FCxesU6dOWadOnbIkWT/5yU+sU6dOWf/3f/9nWdanlxjGxMRYv/zlL6333nvP+ta3vtXpJYaPPvqodfToUeudd96xHn74YeMuMZw/f77ldrutQ4cOhVy++pe//MWemTdvnjV06FDr4MGD1okTJyyPx2N5PB77ePvlq1OnTrXKy8utoqIia9CgQcZdvrps2TKrtLTUqq6utt577z1r2bJlVlhYmPWrX/3Ksiz26Ytce3WVZbFf13rxxRetQ4cOWdXV1davf/1rKz093Ro4cKBVX19vWRZ7da1jx45ZkZGR1r/9279ZZ8+etd544w2rd+/e1n/+53/aM/x+7zoip4d5++23LUkdHllZWZZlfXqZ4b/+679acXFxltPptKZMmWJVVVWFnONPf/qTNWvWLKtv376Wy+Wy5syZY124cKEb3s3d09keSbJef/11e+bSpUvWP//zP1sPPfSQ1bt3b+vb3/629fHHH4ec56OPPrKeeuopKzo62ho4cKD14osvWleuXLnH7+bueu6556xhw4ZZDofDGjRokDVlyhQ7cCyLffoi10cO+/WZGTNmWIMHD7YcDof1V3/1V9aMGTNC7vvCXoXau3evNWbMGMvpdFqjRo2yfvazn4Uc5/d714VZlmV1z2dIAAAAdw/fyQEAAEYicgAAgJGIHAAAYCQiBwAAGInIAQAARiJyAACAkYgcAABgJCIHAAAYicgBAABGInIAAICRiBwAAGAkIgcAABjp/wE2hg/VeTd1IwAAAABJRU5ErkJggg==",
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import matplotlib.pyplot as plt\n",
"\n",
"lens = [len(x[\"prompt\"][\"input_ids\"]) for x in data[\"train\"]]\n",
"plt.hist(lens)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"634"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len([l for l in lens if l > 128])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "dl3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.8"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}