2023-03-13 17:34:26 -04:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/eric/miniconda3/envs/dl3/lib/python3.10/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"Found cached dataset json (/home/eric/.cache/huggingface/datasets/json/default-789f51900889f651/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n",
2023-03-15 14:11:01 -04:00
"100%|██████████| 1/1 [00:00<00:00, 784.28it/s]\n",
"Loading cached processed dataset at /home/eric/.cache/huggingface/datasets/json/default-789f51900889f651/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-f691ee34ec2034cb.arrow\n"
2023-03-13 17:34:26 -04:00
]
}
],
"source": [
"from datasets import load_dataset\n",
2023-03-16 10:34:33 -04:00
"from transformers import LlamaTokenizer\n",
2023-03-13 17:34:26 -04:00
"\n",
"\n",
2023-03-16 10:34:33 -04:00
"tokenizer = LlamaTokenizer.from_pretrained(\"decapoda-research/llama-7b-hf\", add_eos_token=True)\n",
2023-03-13 17:34:26 -04:00
"tokenizer.pad_token = tokenizer.eos_token\n",
"tokenizer.pad_token_id = tokenizer.eos_token_id\n",
"\n",
"data = load_dataset(\"json\", data_files=\"alpaca_data.json\")\n",
"\n",
"\n",
"def generate_prompt(data_point):\n",
"    \"\"\"Render one record of the instruction dataset as a single prompt string.\n",
"\n",
"    Reads the \"instruction\", \"input\" and \"output\" keys of ``data_point``.\n",
"    The \"### Input:\" section is emitted only when ``data_point[\"input\"]`` is\n",
"    truthy, and the preamble wording changes accordingly. Sections are\n",
"    separated by one blank line; the text ends with the output and has no\n",
"    trailing newline.\n",
"    \"\"\"\n",
"    if data_point[\"input\"]:\n",
"        preamble = \"Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\"\n",
"        input_section = [f\"### Input:\\n{data_point['input']}\"]\n",
"    else:\n",
"        preamble = \"Below is an instruction that describes a task. Write a response that appropriately completes the request.\"\n",
"        input_section = []\n",
"\n",
"    sections = [\n",
"        preamble,\n",
"        f\"### Instruction:\\n{data_point['instruction']}\",\n",
"        *input_section,\n",
"        f\"### Response:\\n{data_point['output']}\",\n",
"    ]\n",
"    # Joining with a blank line yields the same text as a column-0 multi-line\n",
"    # f-string while keeping every template line properly indented.\n",
"    return \"\\n\\n\".join(sections)\n",
2023-03-13 17:34:26 -04:00
"\n",
"\n",
"def tokenize_prompt(data_point):\n",
"    \"\"\"Build the full prompt for one record and tokenize it.\n",
"\n",
"    Returns a one-key mapping so that ``map`` stores the tokenizer output\n",
"    under a new \"prompt\" column.\n",
"    \"\"\"\n",
"    return {\"prompt\": tokenizer(generate_prompt(data_point))}\n",
"\n",
"\n",
"data = data.map(tokenize_prompt)"
]
},
{
"cell_type": "code",
2023-03-14 01:38:22 -04:00
"execution_count": 2,
2023-03-13 17:34:26 -04:00
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
2023-03-15 14:11:01 -04:00
"<matplotlib.lines.Line2D at 0x7f6f1af20af0>"
2023-03-13 17:34:26 -04:00
]
},
2023-03-14 01:38:22 -04:00
"execution_count": 2,
2023-03-13 17:34:26 -04:00
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
2023-03-15 14:11:01 -04:00
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjAAAAGzCAYAAAAxPS2EAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA/q0lEQVR4nO3de1hVZf7//xeobBDcG0+AJCqTjkoeSizd09Ekyeio1mhmaJpfDSu0MbUaM5vS0UrtoFbOiJ/ppHaVleSBxMOUpEZRamk6YTgZWBls8QAK9++PfqxxCyoQCAuej+va18W+13uvfa8bkJf3WvfaPsYYIwAAABvxrekOAAAAVBQBBgAA2A4BBgAA2A4BBgAA2A4BBgAA2A4BBgAA2A4BBgAA2A4BBgAA2A4BBgAA2A4BBvXWtGnT5OPjc17e65prrtE111xjPd+wYYN8fHz09ttvn5f3Hz58uNq1a3de3quy8vPzNWrUKIWFhcnHx0eJiYk13aVareRnaMOGDTXdlXPy8fHRuHHjarobqGMIMKgTkpKS5OPjYz38/f0VHh6u2NhYPf/88zp8+HCVvM+BAwc0bdo0ZWRkVMn+qlJt7lt5PP3000pKStLYsWP1r3/9S8OGDavpLp1XR48e1bRp02wRSMqyefNmTZs2Tbm5uTXdFdQTDWu6A0BVmj59uiIjI3XixAllZ2drw4YNSkxM1HPPPaf3339f3bp1s2ofe+wxTZ48uUL7P3DggJ544gm1a9dOF198cblft3bt2gq9T2WcrW+vvvqqiouLq70Pv0dqaqp69+6txx9/vKa7UiOOHj2qJ554QpK8ZuvsYvPmzXriiSc0fPhwBQcH13R3UA8QYFCn9O/fXz179rSeT5kyRampqbrxxht1880365tvvlFAQIAkqWHDhmrYsHp/BY4eParGjRvLz8+vWt/nXBo1alSj718eBw8eVFRUVLW/z5EjRxQYGFjt7wOgenEKCXXetddeq7/+9a/6/vvv9dprr1ntZV0Dk5KSoiuuuELBwcEKCgpSx44d9cgjj0j67ZqDSy+9VJI0YsQI63RVUlKSpN/+19ylSxelp6frqquuUuPGja3Xnn4NTImioiI98sgjCgsLU2BgoG6++Wbt37/fq6Zdu3YaPnx4qdeeus9z9a2sa2COHDmihx56SBEREXI4HOrYsaOeeeYZnf4B9SXXL6xYsUJdunSRw+HQRRddpNWrV5c94Kc5ePCgRo4cqdDQUPn7+6t79+5asmSJtb3kWo7MzEwlJydbfd+3b98Z91nSp9dff10dO3aUv7+/oqOjtWnTJq+6ku/x119/rTvvvFNNmzbVFVdcIUk6efKknnzySV144YVyOBxq166dHnnkERUUFHjto127drrxxhu1YcMG9ezZUwEBAeratat1quedd95R165drT588cUXXq8fPny4goKC9N133yk2NlaBgYEKDw/X9OnTrbHet2+fWrZsKUl64oknrDGYNm1aucb4VFu2bNH1118vl8ulxo0b6+qrr9Ynn3xS5rjs3bvXmjFxuVwaMWKEjh496lV77NgxPfDAA2rRooWaNGmim2++WT/88INX/6ZNm6aJEydKkiIjI8/4PTzXz9Dhw4eVmJiodu3ayeFwKCQkRNddd50+//zzCo8D6j5mYFAvDBs2TI888ojWrl2re++9t8yanTt36sYbb1S3bt00ffp0ORwO7d271/rHv3Pnzpo+fbqmTp2q0aNH68orr5Qk/elPf7L28csvv6h///4aPHiw7rrrLoWGhp61X0899ZR8fHw0adIkHTx4UHPnzlVMTIwyMjKsmaLyKE/fTmWM0c0336z169dr5MiRuvjii7VmzRpNnDhRP/zwg+bMmeNV//HHH+udd97RfffdpyZNmuj555/XwIEDlZWVpebNm5+xX8eOHdM111yjvXv3aty4cYqMjNTy5cs1fPhw5ebm6sEHH1Tnzp31r3/9S+PHj1fr1q
310EMPSZL1B/1MNm7cqKVLl+qBBx6Qw+HQ/Pnzdf3112vr1q3q0qWLV+3tt9+uDh066Omnn7ZCw6hRo7RkyRINGjRIDz30kLZs2aIZM2bom2++0bvvvuv1+r179+rOO+/U//t//0933XWXnnnmGd10001auHChHnnkEd13332SpBkzZuiOO+7Q7t275ev7v/8fFhUV6frrr1fv3r01a9YsrV69Wo8//rhOnjyp6dOnq2XLllqwYIHGjh2r2267TQMGDJAkr1Oe5ZGamqr+/fsrOjpajz/+uHx9fbV48WJde+21+ve//63LLrvMq/6OO+5QZGSkZsyYoc8//1yLFi1SSEiI/v73v1s1w4cP17JlyzRs2DD17t1bGzduVFxcnNd+BgwYoG+//VZvvvmm5syZoxYtWkjy/h6W52dozJgxevvttzVu3DhFRUXpl19+0ccff6xvvvlGPXr0qNBYoB4wQB2wePFiI8ls27btjDUul8tccskl1vPHH3/cnPorMGfOHCPJ/PTTT2fcx7Zt24wks3jx4lLbrr76aiPJLFy4sMxtV199tfV8/fr1RpK54IILjMfjsdqXLVtmJJl58+ZZbW3btjXx8fHn3OfZ+hYfH2/atm1rPV+xYoWRZP72t7951Q0aNMj4+PiYvXv3Wm2SjJ+fn1fbl19+aSSZF154odR7nWru3LlGknnttdestsLCQuN2u01QUJDXsbdt29bExcWddX+n9kmS+eyzz6y277//3vj7+5vbbrvNaiv5Hg8ZMsTr9RkZGUaSGTVqlFf7X/7yFyPJpKamevVLktm8ebPVtmbNGiPJBAQEmO+//95qf/nll40ks379eqstPj7eSDL333+/1VZcXGzi4uKMn5+f9fP2008/GUnm8ccfL9cYlPwMlbxXcXGx6dChg4mNjTXFxcVW3dGjR01kZKS57rrrSo3LPffc47XP2267zTRv3tx6np6ebiSZxMREr7rhw4eX6uvs2bONJJOZmVmqr+X9GXK5XCYhIaFcxw9wCgn1RlBQ0FlXI5VcePjee+9V+oJXh8OhESNGlLv+7rvvVpMmTazngwYNUqtWrfThhx9W6v3L68MPP1SDBg30wAMPeLU/9NBDMsZo1apVXu0xMTG68MILrefdunWT0+nUd999d873CQsL05AhQ6y2Ro0a6YEHHlB+fr42btxY6WNwu92Kjo62nrdp00a33HKL1qxZo6KiIq/aMWPGlOqXJE2YMMGrvWT2Jzk52as9KipKbrfbet6rVy9Jv52ebNOmTan2ssbl1GXEJafACgsL9dFHH53jSMsnIyNDe/bs0Z133qlffvlFP//8s37++WcdOXJEffv21aZNm0r9XJ8+LldeeaV++eUXeTweSbJO8ZTMMJW4//77K9y/8vwMBQcHa8uWLTpw4ECF94/6hwCDeiM/P98rLJzuz3/+sy6//HKNGjVKoaGhGjx4sJYtW1ahMHPBBRdU6ILdDh06eD338fFR+/btz3r9R1X4/vvvFR4eXmo8OnfubG0/1al/pEs0bdpUv/766znfp0OHDl6nU872PhVx+thJ0h//+EcdPXpUP/30k1d7ZGRkqX75+vqqffv2Xu1hYWEKDg4+5/G7XC5JUkRERJntp4+Lr6+v/vCHP5Tqq6Qq+17v2bNHkhQfH6+WLVt6PRYtWqSCggLl5eV5veb042ratKlX/0vG6fTxO33cyqM8P0OzZs3Sjh07FBERocsuu0zTpk07Z0hG/cU1MKgX/vvf/yovL++s//AGBARo06ZNWr9+vZKTk7V69WotXbpU1157rdauXasGDRqc830qct1KeZ3pZntFRUXl6lNVONP7mNMu+K2tzvR9Ke+NDM90/LVpXEqC9uzZs8+4xD8oKMjr+fnsf3ne64477tCVV16pd999V2vXrtXs2bP197//Xe+884769+9f5X2CvTEDg3rhX//6lyQpNjb2rHW+vr7q27evnnvuOX399dd66qmnlJqaqvXr10
sq/x+88ir5X3MJY4z27t3rtWKoadOmZd4c7PRZgor0rW3btjpw4ECpU2q7du2ytleFtm3bas+ePaVmsarifU4fO0n
2023-03-13 17:34:26 -04:00
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import matplotlib.pyplot as plt\n",
"\n",
"lens = [len(x[\"prompt\"][\"input_ids\"]) for x in data[\"train\"]]\n",
2023-03-15 14:11:01 -04:00
"# Histogram of tokenized prompt lengths; the red line marks the 256-token mark.\n",
"plt.hist(lens, bins=100)\n",
"plt.title(\"Distribution of prompt lengths\")\n",
"plt.xlabel(\"Prompt length (tokens)\")\n",
"plt.ylabel(\"Number of examples\")\n",
"# The trailing ';' keeps the Line2D repr out of the cell output.\n",
"plt.axvline(256, color=\"red\");"
2023-03-13 17:34:26 -04:00
]
},
{
"cell_type": "code",
2023-03-14 01:38:22 -04:00
"execution_count": 3,
2023-03-13 17:34:26 -04:00
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
2023-03-15 14:11:01 -04:00
"<matplotlib.lines.Line2D at 0x7f6eef316ce0>"
2023-03-13 17:34:26 -04:00
]
},
2023-03-14 01:38:22 -04:00
"execution_count": 3,
2023-03-13 17:34:26 -04:00
"metadata": {},
"output_type": "execute_result"
2023-03-14 01:38:22 -04:00
},
{
"data": {
2023-03-15 14:11:01 -04:00
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAkUAAAGzCAYAAAAhXWNYAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAABagUlEQVR4nO3dd3xT9f4/8FeSNulMB12UllVG2WhZdeGoVOhVUVRElMpQ8RZZ9wLiAHHB1auCyhD5fcWLKMPNEMQC4iirUNnIKJu2lNKkLZ3J+/dHyaGhBTrShiSv5+ORB805n5zzOZ/kJC/O+XzOUYmIgIiIiMjFqe1dASIiIqIbAUMRERERERiKiIiIiAAwFBEREREBYCgiIiIiAsBQRERERASAoYiIiIgIAEMREREREQCGIiIiIiIADEUNbuPGjVCpVPj666/tXZVqyczMxCOPPIJGjRpBpVJh5syZVy2bn5+PESNGICwsDCqVCmPHjq3Rup5++mk0b97cappKpcJrr71W43rTZZbP3MaNG+1dFbur6jPm6g4dOoQ+ffrAz88PKpUK33//vb2rVKU777wTd955p72rcU01+b50BK+99hpUKpW9q3FdtvxddbNBfW44CxcuxNChQ6HT6XDkyBE0adLEav6dd96J7Oxs7Nmzx041dBzjxo3D2rVrMXXqVISFhaFbt25XLfv2229j4cKFePXVVxEVFYV27do1YE2JqDYSExORnp6Ot956C/7+/tfcx+vbvn37sGzZMocNrzX5vqSa+/LLL5GVlVXj/3DXhFOGIovi4mLMmDEDH330kb2r4rDWr1+PBx98EP/+97+rVbZXr16YOnVqA9SMiOqqsLAQKSkpePnllzFq1Ch7Vwf79u3DtGnTcOedd1YKRT///LN9KlUDNfm+pJr78ssvsWfPnnoNRU59+qxr16749NNPcebMGXtXpcEVFBTYZDlZWVnw9/e3eVm6PrPZjKKiIntXg5zYuXPnAMAh9lutVgutVmvvalwTvwMdn1OHopdeegkmkwkzZsy4Zrljx45BpVJh4cKFleZd2afFco7177//xpNPPgk/Pz8EBwfj1VdfhYjg5MmTePDBB6HX6xEWFob33nuvynWaTCa89NJLCAsLg7e3Nx544AGcPHmyUrktW7bgvvvug5+fH7y8vNC7d2/88ccfVmUsddq3bx+eeOIJBAQE4LbbbrvmNh89ehSPPvooAgMD4eXlhV69emHVqlXK/IULF0KlUkFEMHv2bKhUqqueW7acz01PT8eqVauUsseOHVOWc+zYsSpfU5N+Lhs2bIBKpcJ3331Xad6XX34JlUqFlJSUay4jNzcX48aNQ/PmzaHT6RAREYEhQ4YgOztbKZOVlYXhw4cjNDQUHh4e6NKlCz7//HNlfmlpKQIDAzF06NBKyzcajfDw8LD6n2JxcTGmTp2KVq1aQafTITIyEhMnTkRxcbHVa1UqFUaNGoXFixejQ4cO0Ol0WLNmDQDg9OnTGDZsGEJDQ6HT6dChQwf83//9X6X1nzp1Cv3794e3tzdCQkIwbty4Suu5luutp7CwENHR0YiOjkZhYaEyPScnB40bN8Ytt9wCk8kEANi1axeefvpptGzZEh4eHggLC8OwYcNw/vx5q3XWdZ+yfJaWLl1arX3qSmazGTNnzkSHDh3g4eGB0NBQPPfcc7hw4YJVue3btyM+Ph5BQUHw9PREixYtMGzYsOsu/4cffkBCQgLCw8Oh0+kQFRWFN954Q2kni0OHDmHAgAEICwuDh4cHIiIi8Pjjj8NgMFxz+b/99hseffRRNG3aVPl8jRs3zur9qcprr72GZs2aAQAmTJgAlUqlHJ252umrqvqYWD6333//PTp27Kh8biyf3YpOnz6N4cOHK23RokULPP/88ygpKcHChQvx6KOPAgDuuusu5XvE8h1RVZ+i6+2rwOXv9//+97+YP38+oqKioNPp0L17d2zbtu2abWRhy+/LK+
s0e/ZstGzZEl5eXujTpw9OnjwJEcEbb7yBiIgIeHp64sEHH0ROTo7VMqrzudq/fz88PT0xZMgQq9f+/vvv0Gg0mDRpUrW2/0pffPEFYmJi4OnpicDAQDz++OOV9rU777wTHTt2xL59+3DXXXfBy8sLTZo0wTvvvFNpecePH8cDDzxg9b21du3aSu//qlWrcPz4caV9r/yMms1mvPXWW4iIiICHhwfuueceHD58uGYbJ07os88+EwCybds2GTZsmHh4eMjp06eV+b1795YOHTooz9PT0wWAfPbZZ5WWBUCmTp2qPJ86daoAkK5du8qgQYNkzpw5kpCQIADk/fffl7Zt28rzzz8vc+bMkVtvvVUAyK+//qq8fsOGDQJAOnXqJJ07d5b3339fXnzxRfHw8JA2bdrIxYsXlbLJycmi1WolNjZW3nvvPfnggw+kc+fOotVqZcuWLZXq1L59e3nwwQdlzpw5Mnv27Ku2T0ZGhoSGhoqvr6+8/PLL8v7770uXLl1ErVbLt99+KyIiR44ckUWLFgkAuffee2XRokWyaNGiqy5v0aJFEhQUJF27dlXK5ufnK+9Fenq61Wss7bBhwwZlWmJiojRr1uyq7W82myUyMlIGDBhQqQ79+vWTqKioq26ziEheXp507NhRNBqNPPPMMzJ37lx54403pHv37rJz504REbl48aK0a9dO3N3dZdy4cfLhhx/K7bffLgBk5syZyrKGDRsm/v7+UlxcbLWOzz//XPnsiYiYTCbp06ePeHl5ydixY+WTTz6RUaNGiZubmzz44IOVtrVdu3YSHBws06ZNk9mzZ8vOnTslIyNDIiIiJDIyUl5//XWZO3euPPDAAwJAPvjgA+X1Fy9elDZt2oiHh4dMnDhRZs6cKTExMdK5c+dKbV2V6q5n8+bNotFoZNy4ccq0xx9/XDw9PeXgwYPKtP/+979y++23y+uvvy7z58+XMWPGiKenp/To0UPMZrNSriH3qao+YyNGjBA3Nzd55plnZN68eTJp0iTx9vaW7t27S0lJiYiIZGZmSkBAgLRp00beffdd+fTTT+Xll1+Wdu3aXbNNRUT69+8vjz32mLz77rsyd+5cefTRRwWA/Pvf/1bKFBcXS4sWLSQ8PFzefPNNWbBggUybNk26d+8ux44du+byX3jhBenXr5+8/fbb8sknn8jw4cNFo9HII488cs3X/fXXX/LBBx8IABk0aJAsWrRIvvvuu6u2k8jl96oiANKlSxdp3LixvPHGGzJz5kxp2bKleHl5SXZ2tlLu9OnTEh4eruwL8+bNk1dffVXatWsnFy5ckCNHjsjo0aMFgLz00kvK90hGRoaIlH9v9+7dW1ledfdVy/f7TTfdJK1atZL//Oc/8s4770hQUJBEREQo7/HV2Pr7smKdunbtKu3bt5f3339fXnnlFdFqtdKrVy956aWX5JZbbpEPP/xQRo8eLSqVSoYOHWq1jOp8rkRE3n33XQEgP/zwg4iI5OfnS1RUlLRv316Kioquue1Vvd9vvvmmqFQqGThwoMyZM0emTZsmQUFB0rx5c7lw4YJSrnfv3hIeHi6RkZEyZswYmTNnjtx9990CQFavXq2Uy8/Pl5YtW4qnp6e8+OKLMnPmTOnRo4d06dLF6nvr559/lq5du0pQUJDSvpbPq+U74KabbpKYmBj54IMP5LXXXhMvLy/p0aPHNbfxSk4fio4cOSJubm4yevRoZb4tQtGzzz6rTCsrK5OIiAhRqVQyY8YMZfqFCxfE09NTEhMTlWmWN69JkyZiNBqV6cuWLRMAMmvWLBEpDwCtW7eW+Ph4qx+QixcvSosWLeTee++tVKdBgwZVq33Gjh0rAOS3335TpuXl5UmLFi2kefPmYjKZrLY/KSmpWstt1qyZJCQkWE2zZSgSEZk8ebLodDrJzc1VpmVlZYmbm5tVuapMmTJFAChfZBVZ2njmzJkCQL744gtlXklJicTGxoqPj4/ynq1du1YAyIoVK6yW06
9fP2nZsqXyfNGiRaJWq63aWkRk3rx5AkD++OMPq21Vq9Wyd+9eq7LDhw+Xxo0bW/3AiJQHET8/P+VH31L3ZcuWKWU
2023-03-14 01:38:22 -04:00
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
2023-03-13 17:34:26 -04:00
}
],
"source": [
2023-03-15 14:11:01 -04:00
"# Number of examples whose tokenized length is <= m, for m = 0..max(lens).\n",
"# One pass builds a histogram of lengths and a running sum turns it into the\n",
"# cumulative curve, instead of rescanning `lens` for every candidate m.\n",
"length_counts = [0] * (max(lens) + 1)\n",
"for length in lens:\n",
"    length_counts[length] += 1\n",
"\n",
"fully_covered = []\n",
"running_total = 0\n",
"for count in length_counts:\n",
"    running_total += count\n",
"    fully_covered.append(running_total)\n",
"\n",
"plt.plot(fully_covered)\n",
"plt.title(\"Number of fully covered examples as a function of max length\")\n",
"plt.xlabel(\"Max length (tokens)\")\n",
"plt.ylabel(\"Examples that fit without truncation\")\n",
"# The trailing ';' keeps the Line2D repr out of the cell output.\n",
"plt.axvline(x=256, color=\"red\");"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Total number of tokens kept (token coverage) when every prompt is truncated to a given max length:"
2023-03-14 01:38:22 -04:00
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
2023-03-15 14:11:01 -04:00
"<matplotlib.lines.Line2D at 0x7f6eef392020>"
2023-03-14 01:38:22 -04:00
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
2023-03-15 14:11:01 -04:00
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAhYAAAGzCAYAAABzfl4TAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAABHN0lEQVR4nO3deVxU9f4/8NfMwAz7sIMoAqJprplb5q6UmllWWnnN1G7eb2WlZZv1q7Qytc26btmmZaVmN225mbt5K/dyV8IVXFGRHQaYef/+gBkZBpCBmTkw83o+nIfMmbO8z4eBeXHO53yOSkQERERERA6gVroAIiIich8MFkREROQwDBZERETkMAwWRERE5DAMFkREROQwDBZERETkMAwWRERE5DAMFkREROQwDBZERETkMAwWbkKlUuHxxx9XugzyICUlJXjuuecQGxsLtVqNYcOGKV1SpRYvXgyVSoWTJ08qXUq13n77bTRr1gwajQY33HCD0uXUycmTJ6FSqbB48WKlS7mm+Ph43H777UqX4VYYLBSkUqlq9Ni8ebPSpRLZ+Oyzz/D2229j+PDh+Pzzz/HUU08pWs+bb76JVatWKVpDba1duxbPPfccevTogUWLFuHNN99UuiS3cujQIUydOrXeh0t34aV0AZ5syZIlVs+/+OILrFu3zmb69ddf78qyiGpk48aNaNy4MWbPnq10KQBKg8Xw4cNtjpyMHj0a999/P3Q6nTKF1cDGjRuhVqvx6aefQqvVKl2O2zl06BCmTZuGvn37Ij4+Xuly3B6DhYIeeOABq+fbtm3DunXrbKaTc+Xl5cHf31/pMhqc9PR0BAcHK13GNWk0Gmg0GqXLqFZ6ejp8fX0ZKsgt8FRIPZeXl4fJkycjNjYWOp0OLVu2xDvvvIOa3JT2jTfegFqtxpw5cyzTVq9ejV69esHf3x+BgYEYMmQIDh48aLXc2LFjERAQgDNnzmDYsGEICAhAREQEnnnmGRiNxhrVvXr1avTp0weBgYEICgpCly5d8PXXX1vNs2LFCnTq1Am+vr4IDw/HAw88gDNnzlhef+edd6BSqXDq1Cmb9U+ZMgVarRZXrlyxTNu+fTsGDRoEvV4PPz8/9OnTB7///rvVclOnToVKpcKhQ4fwj3/8AyEhIejZsycAYN++fRg7diyaNWsGHx8fREdH46GHHsLly5dttr9582Z07twZPj4+SExMxMKFCy3rrujLL7+07GdoaCjuv/9+pKWlXbMNT506hcceewwtW7aEr68vwsLCMGLECJvDucXFxZg2bRpatGgBHx8fhIWFoWfPnli3bl2168/IyMAzzzyDdu3aISAgAEFBQRg8eDD27t1b7XLm8+ebNm3CwYMHrU7Zbd68udLTd5Wdc7fnfWYymfDBBx+gXbt28PHxQUREBAYNGoRdu3YBKD2tmJeXh88//9xSz9ixYwFU3cdi/vz5aNOmDXQ6HWJiYjBhwgRkZmZazdO3b1+0bdsWhw4dQr9+/eDn54fGjRvjrbfeqraNzEpKSvD6668jMTEROp0O8fHxePHFF2EwGCzzqFQqLFq0CHl5eZbaq+ubYK5p37596NOnD/z8/NC8eXN8++23AIBff/0V3bp1g6+vL1q2bIn169dbLV+T95WIoF+/foiIiEB6erplelFREdq1a4fExETk5eXVqA3KO3LkCIYPH47Q0FD4+Pigc+fO+OGHH6zmMX+/fv/9dzz99NOIiIiAv78/7rrrLly8eNFqXpPJhKlTpyImJgZ+fn7o168fDh06hPj4eKvv/4gRIwAA/fr1q/IU82+//YauXbvCx8cHzZo1wxdffGH3/lEZoXpjwoQJUv5bYjKZpH///qJSqeThhx+WuXPnytChQwWATJo0yWpZADJhwgTL85deeklUKpV89NFHlmlffPGFqFQqGTRokMyZM0dmzZol8fHxEhwcLCdOnLDMN2bMGPHx8ZE2bdrIQw89JAsWLJB77rlHAM
j8+fOvuR+LFi0SlUolbdu2lenTp8u8efPk4YcfltGjR1vNA0C6dOkis2fPlhdeeEF8fX0lPj5erly5IiIip06dEpVKJW+99ZbNNpo1ayZDhgyxPN+wYYNotVrp3r27vPvuuzJ79mxp3769aLVa2b59u2W+V199VQBI69at5c4775T58+fLvHnzRETknXfekV69eslrr70mH330kUycOFF8fX2la9euYjKZLOv4888/RafTSXx8vMycOVOmT58uMTEx0qFDB6n4I/XGG2+ISqWS++67T+bPny/Tpk2T8PBwq/2syooVK6RDhw7yyiuvyEcffSQvvviihISESFxcnOTl5Vnme/HFF0WlUsn48ePl448/lnfffVdGjhwpM2fOrHb9O3fulMTERHnhhRdk4cKF8tprr0njxo1Fr9fLmTNnqlwuNzdXlixZIq1atZImTZrIkiVLZMmSJXL+/HnZtGmTAJBNmzZZLXPixAkBIIsWLbJMs+d9NnbsWAEggwcPlvfff1/eeecdufPOO2XOnDkiIrJkyRLR6XTSq1cvSz1//PGHiFx9r5V/j5vfB0lJSTJnzhx5/PHHRaPRSJcuXaSoqMgyX58+fSQmJkZiY2Nl4sSJMn/+fOnfv78AkJ9//rna9jXvIwAZPny4zJs3Tx588EEBIMOGDbPMs2TJEunVq5fodDpL7ceOHatyneVrevbZZ2XOnDnSunVr0Wg0smzZMomOjpapU6fK+++/b/l+ZmdnW5av6fvq+PHjEhAQIHfddZdl2gsvvCAqlUp+/fXXave7su/3gQMHRK/XS+vWrWXWrFkyd+5c6d27t6hUKvnuu+8s85m/Xx07dpT+/fvLnDlzZPLkyaLRaOTee++12s5zzz0nAGTo0KEyd+5cGT9+vDRp0kTCw8NlzJgxIiJy7NgxefLJJwWAvPjii1bvVxGRuLg4admypURFRcmLL74oc+fOlRtvvFFUKpUcOHCg2v2kyjFY1CMVg8WqVasEgLzxxhtW8w0fPlxUKpUcPXrUMq18sJg8ebKo1WpZvHix5fWcnBwJDg6W8ePHW63r/Pnzotfrraabfxm+9tprVvN27NhROnXqVO0+ZGZmSmBgoHTr1k0KCgqsXjN/OBcVFUlkZKS0bdvWap6ffvpJAMgrr7ximda9e3ebbe7YsUMAyBdffGFZb4sWLWTgwIFWASA/P18SEhLklltusUwzf6CMHDnSpvb8/HybaUuXLhUAsmXLFsu0oUOHip+fn9WHb0pKinh5eVl9/06ePCkajUamT59utc79+/eLl5eXzfSa1LN161arfRcR6dChg1XIqqnCwkIxGo1W006cOCE6nc7me1+ZPn36SJs2baym2RssavI+27hxowCQJ5980qaG8t9vf39/y4dJeRWDRXp6umi1Wrn11lut9n/u3LkCQD777DOrfazY3gaDQaKjo+Wee+6xbZRy9uzZIwDk4Ycftpr+zDPPCADZuHGjVVv4+/tXu76KNX399deWaUeOHBEAolarZdu2bZbpa9assWn3mr6vREQWLlwoAOTLL7+Ubdu2iUajsfmjpjKVfb8HDBgg7dq1k8LCQss0k8kkN998s7Ro0cIyzfz9SkpKsvr+PvXUU6LRaCQzM1NESn93eXl5WYU0EZGpU6cKAKv3wooVKyp9X4qUBouKP+Pp6emi0+lk8uTJ19xXssVTIfXYzz//DI1GgyeffNJq+uTJkyEiWL16tdV0EcHjjz+ODz74AF9++SXGjBljeW3dunXIzMzEyJEjcenSJctDo9GgW7du2LRpk832H3nkEavnvXr1wvHjx6uted26dcjJycELL7wAHx8fq9fMpwl27dqF9PR0PPbYY1bzDBkyBK1atcJ///tfy7T77rsPu3fvxrFjxyzTli9fDp1OhzvvvBMAsGfPHqSkpOAf//gHLl++bNm3vLw8DBgwAFu2bIHJZKp23wDA19fX8nVhYSEuXbqEm266CQDw559/AgCMRiPWr1
+PYcOGISYmxjJ/8+bNMXjwYKv1fffddzCZTLj33nut2jw6OhotWrSotM2rqqe4uBiXL19G8+bNERwcbKkHAIKDg3H
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
2023-03-14 01:38:22 -04:00
}
],
"source": [
2023-03-15 14:11:01 -04:00
"# Total tokens kept when every prompt is truncated to m tokens, i.e.\n",
"# sum(min(length, m)) over all prompts, for m = 0..max(lens).\n",
"# Raising the limit from m to m + 1 adds exactly one token per prompt longer\n",
"# than m, so the curve is built incrementally instead of re-summing over\n",
"# `lens` for every m.\n",
"length_counts = [0] * (max(lens) + 1)\n",
"for length in lens:\n",
"    length_counts[length] += 1\n",
"\n",
"tokens_kept = []\n",
"kept = 0\n",
"longer_prompts = len(lens)  # prompts with length >= m, starting at m = 0\n",
"for count in length_counts:\n",
"    tokens_kept.append(kept)\n",
"    longer_prompts -= count  # now the prompts strictly longer than m\n",
"    kept += longer_prompts\n",
"\n",
"plt.plot(tokens_kept)\n",
"plt.title(\"Token coverage as a function of max length\")\n",
"plt.xlabel(\"Max length (tokens)\")\n",
"plt.ylabel(\"Total tokens kept\")\n",
"# The trailing ';' keeps the Line2D repr out of the cell output.\n",
"plt.axvline(x=256, color=\"red\");"
2023-03-13 17:34:26 -04:00
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "dl3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.8"
},
2023-03-14 01:38:22 -04:00
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "90bfda469df5ac7fed8d7e225d563f60a7a7aa420ccfadb091c914debf775e49"
}
}
2023-03-13 17:34:26 -04:00
},
"nbformat": 4,
"nbformat_minor": 2
}