DISARMframeworks/CODE/compare_DISARM_excel_versions.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Fix the problem with Excel file changes\n",
    "\n",
    "Background: DISARM's master dataset is in an Excel file. Changes to it don't show up in GitHub diffs, so it's difficult to tell what's changed between versions. The code below checks for those differences - run it repeatedly until the versions align."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import sqlite3 as sql\n",
    "from generate_DISARM_pages import Disarm\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import os\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "pd.set_option('display.max_rows', 1000)\n",
    "# None means 'never truncate cell text'; the old -1 sentinel was deprecated in pandas 1.0 and is no longer accepted by current pandas\n",
    "pd.set_option('display.max_colwidth', None)\n",
    "\n",
    "newfile = '../DISARM_MASTER_DATA/DISARM_FRAMEWORKS_MASTER.xlsx'\n",
    "oldfile = '../DISARM_MASTER_DATA/DISARM_FRAMEWORKS_MASTER_previous_version.xlsx'\n",
    "\n",
    "\n",
    "def load_sheets(excelfile):\n",
    "    \"\"\"Read every sheet of an Excel workbook into a dict of DataFrames.\n",
    "\n",
    "    Args:\n",
    "        excelfile: path of the workbook to read.\n",
    "\n",
    "    Returns:\n",
    "        dict mapping sheet name -> DataFrame, with missing cells replaced\n",
    "        by the empty string.\n",
    "    \"\"\"\n",
    "    # The context manager closes the workbook handle once all sheets are parsed\n",
    "    with pd.ExcelFile(excelfile) as xlsx:\n",
    "        return {name: xlsx.parse(name).fillna('') for name in xlsx.sheet_names}\n",
    "\n",
    "\n",
    "# Load dfs from file\n",
    "newdfs = load_sheets(newfile)\n",
    "olddfs = load_sheets(oldfile)\n",
    "\n",
    "# Report sheets that exist in only one of the two workbooks\n",
    "addedtables = newdfs.keys() - olddfs.keys()\n",
    "losttables = olddfs.keys() - newdfs.keys()\n",
    "if addedtables or losttables:\n",
    "    print('Table changes: new tables are {}, lost tables are {}'.format(addedtables, losttables))\n",
    "\n",
    "def investigate_table(table):\n",
    "    \"\"\"Print a summary of how sheet `table` differs between the new and old workbooks.\n",
    "\n",
    "    Reports, in order: columns present in only one version, a row-count\n",
    "    mismatch, and each shared column whose values differ. Reads the\n",
    "    module-level `newdfs` and `olddfs` dicts; `table` must be a key of both.\n",
    "\n",
    "    Args:\n",
    "        table: sheet name to compare.\n",
    "    \"\"\"\n",
    "    newdf = newdfs[table]\n",
    "    olddf = olddfs[table]\n",
    "    print('\\n\\nTable {} is changed'.format(table))\n",
    "    # Column headings\n",
    "    coldiffs = set(newdf.columns).symmetric_difference(set(olddf.columns))\n",
    "    if len(coldiffs) > 0:\n",
    "        print('column differences: {}'.format(coldiffs))\n",
    "    # length\n",
    "    if len(newdf) != len(olddf):\n",
    "        print('length differences: new {} old {}'.format(len(newdf), len(olddf)))\n",
    "\n",
    "    # column by column\n",
    "    # Comparing two Series with != raises unless their indexes are identical,\n",
    "    # so when row counts differ compare only the rows both versions have, by position\n",
    "    nrows = min(len(newdf), len(olddf))\n",
    "    for column in newdf.columns:\n",
    "        if column not in olddf.columns:\n",
    "            continue  # already listed under column differences\n",
    "        newvals = newdf[column].iloc[:nrows].reset_index(drop=True)\n",
    "        oldvals = olddf[column].iloc[:nrows].reset_index(drop=True)\n",
    "        if (newvals != oldvals).any():\n",
    "            print('Differences in column {}'.format(column))\n",
    "\n",
    "for table in newdfs:\n",
    "    # A sheet missing from the old workbook has nothing to compare against\n",
    "    if table not in olddfs:\n",
    "        continue\n",
    "    if not newdfs[table].equals(olddfs[table]):\n",
    "        investigate_table(table)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Look at individual table differences\n",
    "table = 'countermeasures'\n",
    "column = 'summary'\n",
    "# Boolean mask of the rows whose value in `column` changed between versions\n",
    "coldiffs = newdfs[table][column] != olddfs[table][column]\n",
    "# NOTE(review): 'amitt_id' looks like a pre-DISARM column name - confirm it still exists in this sheet\n",
    "diffcols = pd.DataFrame({\n",
    "    'amitt_id': newdfs[table].loc[coldiffs, 'amitt_id'],\n",
    "    'new': newdfs[table].loc[coldiffs, column],\n",
    "    'old': olddfs[table].loc[coldiffs, column],\n",
    "})\n",
    "# Display only the changed rows whose old value was not blank\n",
    "diffcols[diffcols['old'] != '']"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}