mirror of
https://github.com/DISARMFoundation/DISARMframeworks.git
synced 2025-07-12 01:29:22 -04:00

Changed from data held in Excel files to data held in CSV files. This gives us a better view of what has changed in the datasets when we push them to git.
188 lines
5.9 KiB
Text
188 lines
5.9 KiB
Text
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Fix the problem with Excel file changes\n",
|
|
"\n",
|
|
"Background: DISARM's master dataset is held in an Excel file. Changes to it don't show up in GitHub diffs, so it's difficult to tell what has changed between versions. The code below checks for those differences - rerun it until the versions align. "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"<ipython-input-5-7e799212f01e>:9: FutureWarning: Passing a negative integer is deprecated in version 1.0 and will not be supported in future version. Instead, use None to not limit the column width.\n",
|
|
" pd.set_option('display.max_colwidth', -1)\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"table README is the same\n",
|
|
"table examples is the same\n",
|
|
"table incidents is the same\n",
|
|
"table incidenttechniques is the same\n",
|
|
"table narratives is the same\n",
|
|
"table externalgroups is the same\n",
|
|
"table externalgroupcounters is the same\n",
|
|
"table suggested_externalgroups is the same\n",
|
|
"table removed_externalgroups is the same\n",
|
|
"table forums is the same\n",
|
|
"table tools is the same\n",
|
|
"table removed_tools is the same\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"import pandas as pd\n",
|
|
"import sqlite3 as sql\n",
|
|
"from generate_DISARM_pages import Disarm\n",
|
|
"import pandas as pd\n",
|
|
"import numpy as np\n",
|
|
"import os\n",
|
|
"from sklearn.feature_extraction.text import CountVectorizer\n",
|
|
"pd.set_option('display.max_rows', 1000)\n",
|
|
"# None disables column-width truncation; the previous value -1 is deprecated since pandas 1.0\n",
|
|
"pd.set_option('display.max_colwidth', None)\n",
|
|
"\n",
|
|
"newfile = '../DISARM_MASTER_DATA/DISARM_DATA_MASTER.xlsx'\n",
|
|
"oldfile = '../../DISARMframeworks_2022_08_10/DISARM_MASTER_DATA/DISARM_DATA_MASTER.xlsx'\n",
|
|
"\n",
|
|
"def load_sheets(path):\n",
|
|
"    \"\"\"Return {sheet name: DataFrame} for every sheet of the workbook at `path`.\n",
|
|
"\n",
|
|
"    Missing cells are replaced with '' so that blank cells compare equal\n",
|
|
"    in the element-wise column comparison (NaN != NaN is True in pandas).\n",
|
|
"    \"\"\"\n",
|
|
"    # The context manager closes the workbook handle once all sheets are parsed\n",
|
|
"    with pd.ExcelFile(path) as xlsx:\n",
|
|
"        return {name: xlsx.parse(name).fillna('') for name in xlsx.sheet_names}\n",
|
|
"\n",
|
|
"# Load dfs from file\n",
|
|
"newdfs = load_sheets(newfile)\n",
|
|
"olddfs = load_sheets(oldfile)\n",
|
|
"\n",
|
|
"# Sheets that exist in only one of the two workbooks\n",
|
|
"addedtables = set(newdfs) - set(olddfs)\n",
|
|
"losttables = set(olddfs) - set(newdfs)\n",
|
|
"if addedtables or losttables:\n",
|
|
"    print('Table changes: new tables are {}, lost tables are {}'.format(addedtables, losttables))\n",
|
|
"\n",
|
|
"def investigate_table(table):\n",
|
|
"    \"\"\"Print how sheet `table` differs between newdfs and olddfs.\n",
|
|
"\n",
|
|
"    Reports columns found in only one version, a row-count mismatch,\n",
|
|
"    and every shared column whose values differ.\n",
|
|
"    \"\"\"\n",
|
|
"    newdf = newdfs[table]\n",
|
|
"    olddf = olddfs[table]\n",
|
|
"    print('\\n\\nTable {} is changed'.format(table))\n",
|
|
"    # Column headings\n",
|
|
"    coldiffs = set(newdf.columns).symmetric_difference(set(olddf.columns))\n",
|
|
"    if len(coldiffs) > 0:\n",
|
|
"        print('column differences: {}'.format(coldiffs))\n",
|
|
"    # length\n",
|
|
"    if len(newdf) != len(olddf):\n",
|
|
"        print('length differences: new {} old {}'.format(len(newdf), len(olddf)))\n",
|
|
"\n",
|
|
"    # column by column\n",
|
|
"    for column in newdf.columns:\n",
|
|
"        if column not in olddf.columns:\n",
|
|
"            # Already listed under column differences; no old values to compare\n",
|
|
"            continue\n",
|
|
"        # Series.ne aligns the two indexes first, so a row-count mismatch shows up\n",
|
|
"        # as differing values instead of raising ValueError the way `!=` does\n",
|
|
"        if newdf[column].ne(olddf[column]).any():\n",
|
|
"            print('Differences in column {}'.format(column))\n",
|
|
"\n",
|
|
"for table in newdfs:\n",
|
|
"    if table not in olddfs:\n",
|
|
"        # Sheet exists only in the new workbook, so there is nothing to diff against\n",
|
|
"        print('table {} is new - nothing to compare'.format(table))\n",
|
|
"        continue\n",
|
|
"    if newdfs[table].equals(olddfs[table]):\n",
|
|
"        print('table {} is the same'.format(table))\n",
|
|
"    else:\n",
|
|
"        investigate_table(table)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>disarm_id</th>\n",
|
|
" <th>new</th>\n",
|
|
" <th>old</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
"Empty DataFrame\n",
|
|
"Columns: [disarm_id, new, old]\n",
|
|
"Index: []"
|
|
]
|
|
},
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# Look at individual table differences\n",
|
|
"table = 'countermeasures'\n",
|
|
"column = 'summary'\n",
|
|
"# Boolean mask of rows whose value in `column` differs between versions\n",
|
|
"coldiffs = newdfs[table][column] != olddfs[table][column]\n",
|
|
"diffcols = pd.DataFrame({\n",
|
|
"    'disarm_id': newdfs[table].loc[coldiffs, 'disarm_id'],\n",
|
|
"    'new': newdfs[table].loc[coldiffs, column],\n",
|
|
"    'old': olddfs[table].loc[coldiffs, column],\n",
|
|
"})\n",
|
|
"# Show only the rows whose old value is not the empty string\n",
|
|
"diffcols[diffcols['old'] != '']"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.8.3"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 4
|
|
}
|