mirror of
https://github.com/DISARMFoundation/DISARMframeworks.git
synced 2025-01-26 06:15:54 -05:00
1bc8d88b63
Changed from data held in excelfiles to data held in CSV files. This gives us a better view of what's changed in the datasets when we push them to git.
274 lines
6.4 KiB
Plaintext
274 lines
6.4 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Check new and old spreadsheets of data"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"scrolled": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"import pandas as pd\n",
|
|
"\n",
|
|
"# make sure we didn't miss anything in mike's original sheet\n",
|
|
"def check_diffs(col1, col2):\n",
|
|
" xx = col1.value_counts().index\n",
|
|
" yy = col2.value_counts().index\n",
|
|
" print('lengths are: {} and {}'.format(len(xx), len(yy)))\n",
|
|
" print('first list: {}'.format(xx))\n",
|
|
" print('second list: {}'.format(yy))\n",
|
|
" print('set difference: {}'.format(set(xx).difference(set(yy))))\n",
|
|
"\n",
|
|
"def check_ids(df, col='DISARM ID'):\n",
|
|
" print('null ids: {}'.format(df[df[col].isnull()]))\n",
|
|
" vc = df[col].value_counts().reset_index()\n",
|
|
" print('repeated ids: {}'.format(vc[vc[col]>1]))\n",
|
|
" return\n",
|
|
"\n",
|
|
"def check_descriptions(df1, df2):\n",
|
|
" merge = df1.merge(df2, left_on='DISARM ID', right_on='disarm_id')\n",
|
|
" return(merge[merge['summary'] != merge['Description']])\n",
|
|
"\n",
|
|
"df = pd.read_csv('DISARM Merged (Internal) - DISARM Framework Merged.csv')\n",
|
|
"df"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"phases = df[['DISARM Phase', 'DISARM Tactic', 'Description', 'DISARM ID']]\n",
|
|
"phases = phases[phases['DISARM Tactic'].isnull()]\n",
|
|
"phases"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"check_diffs(phases['DISARM Phase'], df['DISARM Phase'])\n",
|
|
"check_ids(phases)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"tactics = df[['DISARM Tactic', 'DISARM Technique', 'Description', 'DISARM ID']]#['DISARM Tactic'].notnull()\n",
|
|
"tactics = tactics[(tactics['DISARM Tactic'].notnull()) & (tactics['DISARM Technique'].isnull())]\n",
|
|
"tactics"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"scrolled": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"check_diffs(tactics['DISARM Tactic'], df['DISARM Tactic'])\n",
|
|
"check_ids(tactics)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"techs = df[['DISARM Technique', 'DISARM Subtechnique', 'Description', 'DISARM ID']]\n",
|
|
"techs = techs[(techs['DISARM Technique'].notnull()) & (techs['DISARM Subtechnique'].isnull())]\n",
|
|
"techs"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"scrolled": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"check_diffs(techs['DISARM Technique'], df['DISARM Technique'])\n",
|
|
"check_ids(techs)\n",
|
|
"#techs.to_csv('temp_techs.csv', index=False)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"subs = df[['DISARM Subtechnique', 'DISARM Technique', 'Description', 'DISARM ID']]\n",
|
|
"subs = subs[subs['DISARM Subtechnique'].notnull()]\n",
|
|
"subs"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"sx = subs.merge(techs, on='DISARM Technique')\n",
|
|
"sx.to_csv('tmp_subs.csv', index=False)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"scrolled": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"check_diffs(subs['DISARM Subtechnique'], df['DISARM Subtechnique'])\n",
|
|
"check_ids(subs)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# check against originals"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import pandas as pd\n",
|
|
"\n",
|
|
"MASTERDATA_DIR = '../DISARM_MASTER_DATA/'\n",
|
|
"frameworkfile = MASTERDATA_DIR + 'DISARM_FRAMEWORKS_MASTER.xlsx'\n",
|
|
"\n",
|
|
"def create_excel_metadata(frameworkfile):\n",
|
|
" metadata = {}\n",
|
|
" xlsx = pd.ExcelFile(frameworkfile)\n",
|
|
" for sheetname in xlsx.sheet_names:\n",
|
|
" metadata[sheetname] = xlsx.parse(sheetname)\n",
|
|
" metadata[sheetname].fillna('', inplace=True)\n",
|
|
" metadata[sheetname].rename(columns={'amitt_id': 'disarm_id'}, inplace=True)\n",
|
|
" return metadata\n",
|
|
"\n",
|
|
"metadata = create_excel_metadata(frameworkfile)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"metadata['phases']"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"phases"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# do union of both tables, to get new spreadsheet table\n",
|
|
"check_descriptions(phases, metadata['phases'])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"check_diffs(metadata['phases']['name'], phases['DISARM Phase'])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"scrolled": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"check_descriptions(tactics, metadata['tactics'])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"scrolled": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"check_diffs(metadata['tactics']['name'], tactics['DISARM Tactic'])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"scrolled": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"check_diffs(metadata['techniques']['name'], techs['DISARM Technique'])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.8.3"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 4
|
|
}
|