Copy AMITT repository, clean up and rebrand

Took a copy of the current AMITT github repository - we'll be updating this and merging the SPICE branch back in
Rebranded to DISARM
Moved generated pages to their own folder, to make looking at the repository less confusing
This commit is contained in:
Sara-Jayne Terp 2022-01-29 11:34:46 -05:00
commit 22abaf93d8
448 changed files with 58066 additions and 0 deletions

View file

@ -0,0 +1,110 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Fix the problem with Excel file changes\n",
"\n",
"Background: DISARM's master dataset is in an Excel file. Changes to it don't show up in GitHub diffs, so it's difficult to tell what has changed between versions. The code below checks for those differences - use it repeatedly until the versions align. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import sqlite3 as sql\n",
"from generate_DISARM_pages import Disarm\n",
"import pandas as pd\n",
"import numpy as np\n",
"import os\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"# Display settings: allow long tables and show full cell text when inspecting differences\n",
"pd.set_option('display.max_rows', 1000)\n",
"# None disables column-width truncation; the older -1 spelling is deprecated since pandas 1.0\n",
"pd.set_option('display.max_colwidth', None)\n",
"\n",
"newfile = '../DISARM_MASTER_DATA/DISARM_FRAMEWORKS_MASTER.xlsx'\n",
"oldfile = '../DISARM_MASTER_DATA/DISARM_FRAMEWORKS_MASTER_previous_version.xlsx'\n",
"\n",
"# Load every sheet of each workbook into a dict of dataframes keyed by sheet name.\n",
"# Empty cells are replaced with '' because an elementwise != treats NaN as different from NaN.\n",
"# The with-blocks close each workbook's file handle once its sheets have been parsed.\n",
"with pd.ExcelFile(newfile) as newxlsx:\n",
"    newdfs = {sheetname: newxlsx.parse(sheetname).fillna('') for sheetname in newxlsx.sheet_names}\n",
"\n",
"with pd.ExcelFile(oldfile) as oldxlsx:\n",
"    olddfs = {sheetname: oldxlsx.parse(sheetname).fillna('') for sheetname in oldxlsx.sheet_names}\n",
"\n",
"# Sheets that exist in only one of the two workbooks\n",
"addedtables = newdfs.keys() - olddfs.keys()\n",
"losttables = olddfs.keys() - newdfs.keys()\n",
"if addedtables or losttables:\n",
"    print('Table changes: new tables are {}, lost tables are {}'.format(addedtables, losttables))\n",
"\n",
"def investigate_table(table):\n",
"    \"\"\"Print a summary of how sheet `table` differs between the new and old workbooks.\n",
"\n",
"    Reads the module-level `newdfs` and `olddfs` dicts. Reports columns that exist in only\n",
"    one version, a change in row count, and each shared column whose values differ.\n",
"    Values are compared by row position over the rows both versions have, so a table that\n",
"    gained or lost rows is still inspected instead of raising on mismatched lengths.\n",
"\n",
"    Args:\n",
"        table: sheet name; must be a key of both `newdfs` and `olddfs`.\n",
"\n",
"    Returns:\n",
"        None. All findings are printed.\n",
"    \"\"\"\n",
"    newdf = newdfs[table]\n",
"    olddf = olddfs[table]\n",
"    print('\\n\\nTable {} is changed'.format(table))\n",
"\n",
"    # Column headings present in only one of the two versions\n",
"    coldiffs = set(newdf.columns).symmetric_difference(set(olddf.columns))\n",
"    if len(coldiffs) > 0:\n",
"        print('column differences: {}'.format(coldiffs))\n",
"\n",
"    # Row counts\n",
"    if len(newdf) != len(olddf):\n",
"        print('length differences: new {} old {}'.format(len(newdf), len(olddf)))\n",
"\n",
"    # Column by column, restricted to the rows that exist in both versions\n",
"    nrows = min(len(newdf), len(olddf))\n",
"    for column in newdf.columns:\n",
"        if column not in olddf.columns:\n",
"            # Already listed under column differences; there is no old data to compare with\n",
"            continue\n",
"        newvalues = newdf[column].iloc[:nrows].reset_index(drop=True)\n",
"        oldvalues = olddf[column].iloc[:nrows].reset_index(drop=True)\n",
"        if (newvalues != oldvalues).any():\n",
"            print('Differences in column {}'.format(column))\n",
"\n",
"# Compare each sheet that exists in both workbooks; sheets found in only one of them\n",
"# have no counterpart to compare against and would raise a KeyError on lookup.\n",
"for table in newdfs.keys():\n",
"    if table not in olddfs:\n",
"        continue\n",
"    if not newdfs[table].equals(olddfs[table]):\n",
"        investigate_table(table)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Look at individual table differences\n",
"# Choose the sheet and the column to drill into\n",
"table = 'countermeasures'\n",
"column = 'summary'\n",
"\n",
"# Boolean mask of the rows whose value in `column` differs between the two versions\n",
"rowmask = newdfs[table][column] != olddfs[table][column]\n",
"\n",
"# Put the new and old values side by side, labelled with each row's id.\n",
"# NOTE(review): the id column is still called 'amitt_id' here - confirm it matches the rebranded spreadsheet\n",
"diffcols = pd.DataFrame({\n",
"    'amitt_id': newdfs[table].loc[rowmask, 'amitt_id'],\n",
"    'new': newdfs[table].loc[rowmask, column],\n",
"    'old': olddfs[table].loc[rowmask, column],\n",
"})\n",
"\n",
"# Display only the changed rows whose old value was not blank\n",
"diffcols[diffcols['old'] != '']"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}