{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Fix the problem with Excel file changes\n",
    "\n",
    "Background: DISARM's master dataset is in an Excel file. Changes in this don't show up in GitHub, so it's difficult to tell what's changed between versions. The code below checks for those differences - use this repeatedly until versions align."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import sqlite3 as sql\n",
    "from generate_DISARM_pages import Disarm\n",
    "import numpy as np\n",
    "import os\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "\n",
    "pd.set_option('display.max_rows', 1000)\n",
    "# None disables truncation; the -1 sentinel was deprecated in pandas 1.0\n",
    "# and is rejected by newer releases.\n",
    "pd.set_option('display.max_colwidth', None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "newfile = '../DISARM_MASTER_DATA/DISARM_FRAMEWORKS_MASTER.xlsx'\n",
    "oldfile = '../DISARM_MASTER_DATA/DISARM_FRAMEWORKS_MASTER_previous_version.xlsx'\n",
    "\n",
    "\n",
    "def load_sheets(path):\n",
    "    \"\"\"Read every sheet of the workbook at `path` into a dict of DataFrames.\n",
    "\n",
    "    Keys are sheet names. Empty cells are replaced with '' because, under\n",
    "    an elementwise `!=`, NaN differs from NaN and every blank cell would\n",
    "    otherwise be reported as a change.\n",
    "    \"\"\"\n",
    "    # The context manager closes the workbook once all sheets are parsed.\n",
    "    with pd.ExcelFile(path) as xlsx:\n",
    "        return {sheetname: xlsx.parse(sheetname).fillna('')\n",
    "                for sheetname in xlsx.sheet_names}\n",
    "\n",
    "\n",
    "# Load dfs from file\n",
    "newdfs = load_sheets(newfile)\n",
    "olddfs = load_sheets(oldfile)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def investigate_table(table):\n",
    "    \"\"\"Print how sheet `table` differs between the new and old workbooks.\n",
    "\n",
    "    Reports, in order: columns present in only one version, a row-count\n",
    "    mismatch, and each shared column whose values differ. Reads the\n",
    "    module-level `newdfs` and `olddfs` dicts, so `table` must be a key of\n",
    "    both; returns None.\n",
    "    \"\"\"\n",
    "    newdf = newdfs[table]\n",
    "    olddf = olddfs[table]\n",
    "    print('\\n\\nTable {} is changed'.format(table))\n",
    "\n",
    "    # Column headings\n",
    "    coldiffs = set(newdf.columns).symmetric_difference(olddf.columns)\n",
    "    if coldiffs:\n",
    "        print('column differences: {}'.format(coldiffs))\n",
    "\n",
    "    # length\n",
    "    if len(newdf) != len(olddf):\n",
    "        print('length differences: new {} old {}'.format(len(newdf), len(olddf)))\n",
    "\n",
    "    # column by column: a column missing from the old sheet has nothing to\n",
    "    # compare against (it is already reported above), and Series of unequal\n",
    "    # length cannot be compared elementwise, so compare only the rows that\n",
    "    # exist in both versions, matched by position.\n",
    "    nrows = min(len(newdf), len(olddf))\n",
    "    for column in newdf.columns:\n",
    "        if column not in olddf.columns:\n",
    "            continue\n",
    "        newvalues = newdf[column].iloc[:nrows].reset_index(drop=True)\n",
    "        oldvalues = olddf[column].iloc[:nrows].reset_index(drop=True)\n",
    "        if (newvalues != oldvalues).any():\n",
    "            print('Differences in column {}'.format(column))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "addedtables = newdfs.keys() - olddfs.keys()\n",
    "losttables = olddfs.keys() - newdfs.keys()\n",
    "if addedtables or losttables:\n",
    "    print('Table changes: new tables are {}, lost tables are {}'.format(addedtables, losttables))\n",
    "\n",
    "# Only sheets present in both workbooks can be compared; added sheets are\n",
    "# reported above and would raise KeyError if looked up in olddfs.\n",
    "for table in newdfs:\n",
    "    if table in olddfs and not newdfs[table].equals(olddfs[table]):\n",
    "        investigate_table(table)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Look at individual table differences\n",
    "table = 'countermeasures'\n",
    "column = 'summary'\n",
    "idcolumn = 'amitt_id'\n",
    "\n",
    "# Compare by position over the rows both versions have, so a row-count\n",
    "# mismatch does not make the elementwise comparison raise.\n",
    "nrows = min(len(newdfs[table]), len(olddfs[table]))\n",
    "newrows = newdfs[table].iloc[:nrows].reset_index(drop=True)\n",
    "oldrows = olddfs[table].iloc[:nrows].reset_index(drop=True)\n",
    "\n",
    "coldiffs = newrows[column] != oldrows[column]\n",
    "diffcols = pd.DataFrame({\n",
    "    idcolumn: newrows.loc[coldiffs, idcolumn],\n",
    "    'new': newrows.loc[coldiffs, column],\n",
    "    'old': oldrows.loc[coldiffs, column],\n",
    "})\n",
    "# Hide rows whose old value was blank, leaving only edits to existing text.\n",
    "diffcols[diffcols['old'] != '']"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}