moved to datasets as CSVs

Changed from data held in Excel files to data held in CSV files. This gives us a better view of what's changed in the datasets when we push them to Git.
This commit is contained in:
Sara-Jayne Terp 2022-08-25 09:50:52 -04:00
parent cae9cbc55e
commit 1bc8d88b63
99 changed files with 15137 additions and 14858 deletions

View file

@ -11,9 +11,36 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"<ipython-input-5-7e799212f01e>:9: FutureWarning: Passing a negative integer is deprecated in version 1.0 and will not be supported in future version. Instead, use None to not limit the column width.\n",
" pd.set_option('display.max_colwidth', -1)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"table README is the same\n",
"table examples is the same\n",
"table incidents is the same\n",
"table incidenttechniques is the same\n",
"table narratives is the same\n",
"table externalgroups is the same\n",
"table externalgroupcounters is the same\n",
"table suggested_externalgroups is the same\n",
"table removed_externalgroups is the same\n",
"table forums is the same\n",
"table tools is the same\n",
"table removed_tools is the same\n"
]
}
],
"source": [
"import pandas as pd\n",
"import sqlite3 as sql\n",
@ -25,8 +52,8 @@
"pd.set_option('display.max_rows', 1000)\n",
"pd.set_option('display.max_colwidth', -1)\n",
"\n",
"newfile = '../DISARM_MASTER_DATA/DISARM_FRAMEWORKS_MASTER.xlsx'\n",
"oldfile = '../DISARM_MASTER_DATA/DISARM_FRAMEWORKS_MASTER_previous_version.xlsx'\n",
"newfile = '../DISARM_MASTER_DATA/DISARM_DATA_MASTER.xlsx'\n",
"oldfile = '../../DISARMframeworks_2022_08_10/DISARM_MASTER_DATA/DISARM_DATA_MASTER.xlsx'\n",
"\n",
"# Load dfs from file\n",
"newdfs = {}\n",
@ -65,7 +92,68 @@
"\n",
"for table in newdfs.keys():\n",
" if newdfs[table].equals(olddfs[table]) == False:\n",
" investigate_table(table)"
" investigate_table(table)\n",
" else:\n",
" print('table {} is the same'.format(table))"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>disarm_id</th>\n",
" <th>new</th>\n",
" <th>old</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [disarm_id, new, old]\n",
"Index: []"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Look at individual table differences\n",
"table = 'countermeasures'\n",
"column = 'summary'\n",
"coldiffs = newdfs[table][column] != olddfs[table][column]\n",
"diffcols = pd.DataFrame()\n",
"diffcols['disarm_id'] = newdfs[table][coldiffs]['disarm_id']\n",
"diffcols['new'] = newdfs[table][coldiffs][column]\n",
"diffcols['old'] = olddfs[table][coldiffs][column]\n",
"diffcols[diffcols['old'] != '']"
]
},
{
@ -73,17 +161,7 @@
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Look at individual table differences\n",
"table = 'countermeasures'\n",
"column = 'summary'\n",
"coldiffs = newdfs[table][column] != olddfs[table][column]\n",
"diffcols = pd.DataFrame()\n",
"diffcols['amitt_id'] = newdfs[table][coldiffs]['amitt_id']\n",
"diffcols['new'] = newdfs[table][coldiffs][column]\n",
"diffcols['old'] = olddfs[table][coldiffs][column]\n",
"diffcols[diffcols['old'] != '']"
]
"source": []
}
],
"metadata": {