DISARMframeworks/CODE/compare_DISARM_excel_versions.ipynb
Sara-Jayne Terp 1bc8d88b63 moved to datasets as CSVs
Changed from data held in excelfiles to data held in CSV files.  This gives us a better view of what's changed in the datasets when we push them to git.
2022-08-25 09:50:52 -04:00

188 lines
5.9 KiB
Text

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Fix the problem with Excel file changes\n",
"\n",
"Background: DISARM's master dataset is in an Excel file. Changes in this don't show up in GitHub, so it's difficult to tell what's changed between versions. The code below checks for those differences - use this repeatedly until versions align. "
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"<ipython-input-5-7e799212f01e>:9: FutureWarning: Passing a negative integer is deprecated in version 1.0 and will not be supported in future version. Instead, use None to not limit the column width.\n",
" pd.set_option('display.max_colwidth', -1)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"table README is the same\n",
"table examples is the same\n",
"table incidents is the same\n",
"table incidenttechniques is the same\n",
"table narratives is the same\n",
"table externalgroups is the same\n",
"table externalgroupcounters is the same\n",
"table suggested_externalgroups is the same\n",
"table removed_externalgroups is the same\n",
"table forums is the same\n",
"table tools is the same\n",
"table removed_tools is the same\n"
]
}
],
"source": [
"import pandas as pd\n",
"import sqlite3 as sql\n",
"from generate_DISARM_pages import Disarm\n",
"import pandas as pd\n",
"import numpy as np\n",
"import os\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"pd.set_option('display.max_rows', 1000)\n",
"# None disables column-width truncation; the former -1 sentinel is deprecated since pandas 1.0.\n",
"pd.set_option('display.max_colwidth', None)\n",
"\n",
"newfile = '../DISARM_MASTER_DATA/DISARM_DATA_MASTER.xlsx'\n",
"oldfile = '../../DISARMframeworks_2022_08_10/DISARM_MASTER_DATA/DISARM_DATA_MASTER.xlsx'\n",
"\n",
"\n",
"def load_sheets(path):\n",
"    \"\"\"Read every sheet of the workbook at `path` into a dict keyed by sheet name.\n",
"\n",
"    Empty cells are filled with '' in each returned DataFrame.\n",
"    \"\"\"\n",
"    # The context manager closes the workbook handle once all sheets are parsed.\n",
"    with pd.ExcelFile(path) as workbook:\n",
"        return {name: workbook.parse(name).fillna('') for name in workbook.sheet_names}\n",
"\n",
"\n",
"# Load dfs from file\n",
"newdfs = load_sheets(newfile)\n",
"olddfs = load_sheets(oldfile)\n",
"\n",
"# Sheets present in only one of the two workbooks\n",
"addedtables = newdfs.keys() - olddfs.keys()\n",
"losttables = olddfs.keys() - newdfs.keys()\n",
"if addedtables or losttables:\n",
"    print('Table changes: new tables are {}, lost tables are {}'.format(addedtables, losttables))\n",
"\n",
"def investigate_table(table):\n",
"    \"\"\"Print how sheet `table` differs between newdfs and olddfs.\n",
"\n",
"    Reports columns that exist on only one side, a row-count mismatch, and each\n",
"    shared column whose values differ. Values are compared by position over the\n",
"    rows both versions have, so sheets with different columns or lengths are\n",
"    reported rather than raising during the comparison.\n",
"    \"\"\"\n",
"    new_table = newdfs[table]\n",
"    old_table = olddfs[table]\n",
"    print('\\n\\nTable {} is changed'.format(table))\n",
"\n",
"    # Column headings\n",
"    heading_diffs = set(new_table.columns).symmetric_difference(set(old_table.columns))\n",
"    if heading_diffs:\n",
"        print('column differences: {}'.format(heading_diffs))\n",
"\n",
"    # length\n",
"    if len(new_table) != len(old_table):\n",
"        print('length differences: new {} old {}'.format(len(new_table), len(old_table)))\n",
"\n",
"    # column by column, restricted to the rows both versions have;\n",
"    # resetting the index gives both slices identical labels so != can compare them\n",
"    shared_rows = min(len(new_table), len(old_table))\n",
"    new_rows = new_table.iloc[:shared_rows].reset_index(drop=True)\n",
"    old_rows = old_table.iloc[:shared_rows].reset_index(drop=True)\n",
"    for column in new_table.columns:\n",
"        if column not in old_table.columns:\n",
"            continue  # already listed under column differences\n",
"        if (new_rows[column] != old_rows[column]).any():\n",
"            print('Differences in column {}'.format(column))\n",
"\n",
"# Compare every sheet of the new workbook against the sheet of the same name in the old one\n",
"for table in newdfs:\n",
"    if table not in olddfs:\n",
"        # Sheet exists only in the new workbook, so there is nothing to compare it with\n",
"        continue\n",
"    if newdfs[table].equals(olddfs[table]):\n",
"        print('table {} is the same'.format(table))\n",
"    else:\n",
"        investigate_table(table)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>disarm_id</th>\n",
" <th>new</th>\n",
" <th>old</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [disarm_id, new, old]\n",
"Index: []"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Drill into one column of one table: list the rows whose value changed\n",
"table = 'countermeasures'\n",
"column = 'summary'\n",
"new_table = newdfs[table]\n",
"old_table = olddfs[table]\n",
"\n",
"# Boolean mask of rows where the new and old values differ\n",
"coldiffs = new_table[column] != old_table[column]\n",
"diffcols = pd.DataFrame({\n",
"    'disarm_id': new_table.loc[coldiffs, 'disarm_id'],\n",
"    'new': new_table.loc[coldiffs, column],\n",
"    'old': old_table.loc[coldiffs, column],\n",
"})\n",
"\n",
"# Show only the rows whose old value was not ''\n",
"diffcols[diffcols['old'] != '']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}