# Check new and old spreadsheets of data

In [None]:
import pandas as pd

# Make sure we didn't miss anything in Mike's original sheet
def check_diffs(col1, col2):
    """Print a comparison of the distinct values found in two columns.

    The distinct values of each column are taken from the index of its
    ``value_counts()`` result. Four lines are printed: how many distinct
    values each column has, both value lists, and the values that appear in
    ``col1`` but not in ``col2`` (the difference is one-directional).

    Args:
        col1: First column (pandas Series).
        col2: Second column (pandas Series).
    """
    first_values = col1.value_counts().index
    second_values = col2.value_counts().index
    # Only values missing from the second column are reported.
    only_in_first = set(first_values) - set(second_values)

    report = [
        'lengths are: {} and {}'.format(len(first_values), len(second_values)),
        'first list: {}'.format(first_values),
        'second list: {}'.format(second_values),
        'set difference: {}'.format(only_in_first),
    ]
    for line in report:
        print(line)

def check_ids(df, col='DISARM ID'):
    """Print the rows whose id is missing and the ids that occur more than once.

    Args:
        df: DataFrame to check.
        col: Name of the id column in ``df``.

    Returns:
        None. The findings are printed.
    """
    print('null ids: {}'.format(df[df[col].isnull()]))
    # The column names produced by value_counts().reset_index() changed in
    # pandas 2.0 (the counts column became 'count' and the values column took
    # the series name), which made `vc[col] > 1` compare the id values instead
    # of the counts. Name both columns explicitly so the counts always live
    # in `col`, whatever the pandas version.
    counts = df[col].value_counts()
    vc = counts.rename_axis('index').reset_index(name=col)
    print('repeated ids: {}'.format(vc[vc[col] > 1]))

def check_descriptions(df1, df2):
    """Return the joined rows whose two description texts disagree.

    ``df1`` is joined to ``df2`` by matching ``df1['DISARM ID']`` against
    ``df2['disarm_id']`` (default merge, so only ids present in both are
    kept). Rows where ``summary`` and ``Description`` are equal are dropped.

    Args:
        df1: DataFrame with 'DISARM ID' and 'Description' columns.
        df2: DataFrame with 'disarm_id' and 'summary' columns.

    Returns:
        DataFrame of the joined rows where 'summary' != 'Description'.
    """
    joined = df1.merge(df2, left_on='DISARM ID', right_on='disarm_id')
    differs = joined['summary'].ne(joined['Description'])
    return joined[differs]

# Load the merged DISARM framework export; the path is relative to the
# directory the notebook is run from.
df = pd.read_csv('DISARM Merged (Internal) - DISARM Framework Merged.csv')
# Bare expression so the frame is shown as the cell output.
df

In [None]:
# Phase rows: the rows of the merged sheet that have no tactic filled in.
phases = df.loc[
    df['DISARM Tactic'].isna(),
    ['DISARM Phase', 'DISARM Tactic', 'Description', 'DISARM ID'],
]
phases

In [None]:
# Report phase names found on the phase rows but not in the full sheet's
# phase column, then report missing and repeated ids among the phase rows.
check_diffs(phases['DISARM Phase'], df['DISARM Phase'])
check_ids(phases)

In [None]:
# Tactic rows: a tactic is filled in but no technique.
tactics = df.loc[
    df['DISARM Tactic'].notna() & df['DISARM Technique'].isna(),
    ['DISARM Tactic', 'DISARM Technique', 'Description', 'DISARM ID'],
]
tactics

In [None]:
# Report tactic names found on the tactic rows but not in the full sheet's
# tactic column, then report missing and repeated ids among the tactic rows.
check_diffs(tactics['DISARM Tactic'], df['DISARM Tactic'])
check_ids(tactics)

In [None]:
# Technique rows: a technique is filled in but no subtechnique.
techs = df.loc[
    df['DISARM Technique'].notna() & df['DISARM Subtechnique'].isna(),
    ['DISARM Technique', 'DISARM Subtechnique', 'Description', 'DISARM ID'],
]
techs

In [None]:
# Report technique names found on the technique rows but not in the full
# sheet's technique column, then report missing and repeated ids.
check_diffs(techs['DISARM Technique'], df['DISARM Technique'])
check_ids(techs)
# Disabled export of the technique rows, kept for reference.
#techs.to_csv('temp_techs.csv', index=False)

In [None]:
# Subtechnique rows: every row that has a subtechnique filled in.
subs = df.loc[
    df['DISARM Subtechnique'].notna(),
    ['DISARM Subtechnique', 'DISARM Technique', 'Description', 'DISARM ID'],
]
subs

In [None]:
# Join each subtechnique row to the technique row that shares its
# 'DISARM Technique' value, then write the result to a CSV in the working
# directory (without the index column).
sx = subs.merge(techs, on='DISARM Technique')
sx.to_csv('tmp_subs.csv', index=False)

In [None]:
# Report subtechnique names found on the subtechnique rows but not in the
# full sheet's subtechnique column, then report missing and repeated ids.
check_diffs(subs['DISARM Subtechnique'], df['DISARM Subtechnique'])
check_ids(subs)

# Check against originals

In [None]:
# Directory holding the master data workbooks, relative to where the
# notebook is run; the trailing slash is needed by the concatenations below.
MASTERDATA_DIR = '../DISARM_MASTER_DATA/'
# Workbook loaded below as the current framework.
frameworkfile = MASTERDATA_DIR + 'DISARM_FRAMEWORKS_MASTER.xlsx'
# Workbook from the dated 0_1 folder, loaded below as the old framework.
oldframefile = MASTERDATA_DIR + '2022-06-30_DISARM_0_1/DISARM_FRAMEWORKS_MASTER_0_1.xlsx'

def create_excel_metadata(frameworkfile):
    """Load every worksheet of an Excel workbook into a dict of DataFrames.

    Args:
        frameworkfile: Path of the workbook to read.

    Returns:
        dict mapping each sheet name to that sheet's DataFrame, with missing
        cells replaced by empty strings.
    """
    metadata = {}
    # The context manager closes the workbook handle even if parsing a sheet
    # raises; previously the ExcelFile was never closed.
    with pd.ExcelFile(frameworkfile) as xlsx:
        for sheetname in xlsx.sheet_names:
            metadata[sheetname] = xlsx.parse(sheetname).fillna('')
    return metadata

# Load all sheets of the current and the old workbook, then list the sheet
# names of each so they can be compared by eye.
metadata = create_excel_metadata(frameworkfile)
oldmeta = create_excel_metadata(oldframefile)
print('Current keys: {}'.format(metadata.keys()))
print('Old keys: {}'.format(oldmeta.keys()))

In [None]:
# Show the 'phases' sheet of the current workbook.
metadata['phases']

In [None]:
# Show the phase rows taken from the merged CSV, for comparison.
phases

In [None]:
# do union of both tables, to get new spreadsheet table
# NOTE(review): check_descriptions joins the two tables on their id columns
# and returns only the rows whose 'summary' and 'Description' differ; it
# does not build a union.
check_descriptions(phases, metadata['phases'])

In [None]:
# Report phase names in the workbook that are missing from the CSV phase rows.
check_diffs(metadata['phases']['name'], phases['DISARM Phase'])

In [None]:
# Show tactics whose CSV 'Description' differs from the workbook 'summary'.
check_descriptions(tactics, metadata['tactics'])

In [None]:
# Report tactic names in the workbook that are missing from the CSV tactic rows.
check_diffs(metadata['tactics']['name'], tactics['DISARM Tactic'])

In [None]:
# Report technique names in the workbook that are missing from the CSV technique rows.
check_diffs(metadata['techniques']['name'], techs['DISARM Technique'])

# Check against last version of framework

In [None]:
def check_version_descriptions(df1, df2):
    """Return the objects whose name or summary differs between two versions.

    The two tables are joined on 'disarm_id' (default merge, so only ids
    present in both are compared). Columns suffixed ``_x`` come from ``df1``
    and ``_y`` from ``df2``.

    Args:
        df1: DataFrame with 'disarm_id', 'name' and 'summary' columns.
        df2: DataFrame with the same three columns.

    Returns:
        DataFrame with one row per id whose name and/or summary changed,
        holding the id, the two change flags and both versions of each text.
    """
    report_columns = [
        'disarm_id', 'name_changed', 'summary_changed',
        'name_x', 'name_y', 'summary_x', 'summary_y',
    ]
    joined = df1.merge(df2, on='disarm_id')
    joined['name_changed'] = joined['name_x'].ne(joined['name_y'])
    joined['summary_changed'] = joined['summary_x'].ne(joined['summary_y'])
    changed = joined['name_changed'] | joined['summary_changed']
    return joined.loc[changed, report_columns]

# Show phases whose name or summary differs between the current and old workbooks.
check_version_descriptions(metadata['phases'], oldmeta['phases'])

In [None]:
# Show tactics whose name or summary differs between the current and old workbooks.
check_version_descriptions(metadata['tactics'], oldmeta['tactics'])

In [None]:
# Show techniques whose name or summary differs between the current and old workbooks.
check_version_descriptions(metadata['techniques'], oldmeta['techniques'])