DISARMframeworks/CODE/check_new_table_updates.ipynb
Sara-Jayne Terp 1bc8d88b63 moved to datasets as CSVs
Changed from data held in excelfiles to data held in CSV files.  This gives us a better view of what's changed in the datasets when we push them to git.
2022-08-25 09:50:52 -04:00

274 lines
6.4 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Check new and old spreadsheets of data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"# make sure we didn't miss anything in mike's original sheet\n",
"def check_diffs(col1, col2):\n",
" xx = col1.value_counts().index\n",
" yy = col2.value_counts().index\n",
" print('lengths are: {} and {}'.format(len(xx), len(yy)))\n",
" print('first list: {}'.format(xx))\n",
" print('second list: {}'.format(yy))\n",
" print('set difference: {}'.format(set(xx).difference(set(yy))))\n",
"\n",
"def check_ids(df, col='DISARM ID'):\n",
" print('null ids: {}'.format(df[df[col].isnull()]))\n",
" vc = df[col].value_counts().reset_index()\n",
" print('repeated ids: {}'.format(vc[vc[col]>1]))\n",
" return\n",
"\n",
"def check_descriptions(df1, df2):\n",
" merge = df1.merge(df2, left_on='DISARM ID', right_on='disarm_id')\n",
" return(merge[merge['summary'] != merge['Description']])\n",
"\n",
"df = pd.read_csv('DISARM Merged (Internal) - DISARM Framework Merged.csv')\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"phases = df[['DISARM Phase', 'DISARM Tactic', 'Description', 'DISARM ID']]\n",
"phases = phases[phases['DISARM Tactic'].isnull()]\n",
"phases"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"check_diffs(phases['DISARM Phase'], df['DISARM Phase'])\n",
"check_ids(phases)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tactics = df[['DISARM Tactic', 'DISARM Technique', 'Description', 'DISARM ID']]#['DISARM Tactic'].notnull()\n",
"tactics = tactics[(tactics['DISARM Tactic'].notnull()) & (tactics['DISARM Technique'].isnull())]\n",
"tactics"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"check_diffs(tactics['DISARM Tactic'], df['DISARM Tactic'])\n",
"check_ids(tactics)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"techs = df[['DISARM Technique', 'DISARM Subtechnique', 'Description', 'DISARM ID']]\n",
"techs = techs[(techs['DISARM Technique'].notnull()) & (techs['DISARM Subtechnique'].isnull())]\n",
"techs"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"check_diffs(techs['DISARM Technique'], df['DISARM Technique'])\n",
"check_ids(techs)\n",
"#techs.to_csv('temp_techs.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"subs = df[['DISARM Subtechnique', 'DISARM Technique', 'Description', 'DISARM ID']]\n",
"subs = subs[subs['DISARM Subtechnique'].notnull()]\n",
"subs"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sx = subs.merge(techs, on='DISARM Technique')\n",
"sx.to_csv('tmp_subs.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"check_diffs(subs['DISARM Subtechnique'], df['DISARM Subtechnique'])\n",
"check_ids(subs)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# check against originals"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"MASTERDATA_DIR = '../DISARM_MASTER_DATA/'\n",
"frameworkfile = MASTERDATA_DIR + 'DISARM_FRAMEWORKS_MASTER.xlsx'\n",
"\n",
"def create_excel_metadata(frameworkfile):\n",
" metadata = {}\n",
" xlsx = pd.ExcelFile(frameworkfile)\n",
" for sheetname in xlsx.sheet_names:\n",
" metadata[sheetname] = xlsx.parse(sheetname)\n",
" metadata[sheetname].fillna('', inplace=True)\n",
" metadata[sheetname].rename(columns={'amitt_id': 'disarm_id'}, inplace=True)\n",
" return metadata\n",
"\n",
"metadata = create_excel_metadata(frameworkfile)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"metadata['phases']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"phases"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# do union of both tables, to get new spreadsheet table\n",
"check_descriptions(phases, metadata['phases'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"check_diffs(metadata['phases']['name'], phases['DISARM Phase'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"check_descriptions(tactics, metadata['tactics'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"check_diffs(metadata['tactics']['name'], tactics['DISARM Tactic'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"check_diffs(metadata['techniques']['name'], techs['DISARM Technique'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}