diff --git a/data_science/export_csv_reports.py b/data_science/export_csv_reports.py deleted file mode 100644 index 116658c..0000000 --- a/data_science/export_csv_reports.py +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env python -""" - Export data in a CSV spreadsheet. - - Marina von Steinkirch - 2017 - - Need to have argparse installed: - $ pip install argparse -""" - -import sys -import argparse -from pandas import DataFrame - - -def read_data(data): - lines = data.readlines() - - feature, precision, recall, f1 = [], [], [], [] - for line in lines: - line_clean = line.strip().split(",") - feature.append(line_clean[0]) - precision.append(line_clean[1]) - recall.append(line_clean[4]) - f1.append(line_clean[6]) - return feature, precision, recall, f1 - - -def save_to_spreadsheet(resultfile, data): - try: - df = DataFrame({'Feature': data[0], 'Precision': data[1], 'Recall': data[2], 'f1-score': data[3]}) - df.to_csv(resultfile, index=False) - print("Spreadsheet saved at {0}".format(resultfile)) - except: - print("Error: {0}".format(sys.exc_info()[0])) - - -def menu(): - parser = argparse.ArgumentParser(description='Copy data results into a spreadsheet.') - parser.add_argument('-s', dest='input', type=argparse.FileType('r'), required=True, help="File with the results.") - parser.add_argument('-d', dest='output', required=True, help="The name of the file to save the spreadsheet.") - args = parser.parse_args() - args.input, args.output - return args.input, args.output - - -if __name__ == "__main__": - datafile, resultfile = menu() - data = read_data(datafile) - save_to_spreadsheet(resultfile, data) diff --git a/data_science/export_results.py b/data_science/export_results.py deleted file mode 100755 index 0976031..0000000 --- a/data_science/export_results.py +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env python -# Need to have argparse installed: -# $ pip install argparse - -import sys -import os.path -import argparse -from pandas import DataFrame - - -def read_data(data): - lines = data.readlines() - - feature, precision, recall, f1 = [], [], [], [] - for line in lines: - line_clean = line.strip().split(",") - feature.append(line_clean[0]) - precision.append(line_clean[1]) - recall.append(line_clean[4]) - f1.append(line_clean[6]) - return feature, precision, recall, f1 - - -def save_to_spreadsheet(resultfile, data): - try: - df = DataFrame({'Feature': data[0], 'Precision': data[1], 'Recall': data[2], 'f1-score': data[3]}) - df.to_csv(resultfile, index=False) - print("Spreadsheet saved at {0}".format(resultfile)) - except: - print("Error: {0}".format(sys.exc_info()[0])) - - -def menu(): - parser = argparse.ArgumentParser(description='Copy data results into a spreadsheet.') - parser.add_argument('-s', dest='input', type=argparse.FileType('r'), required=True, help="File with the results.") - parser.add_argument('-d', dest='output', required=True, help="The name of the file to save the spreadsheet.") - args = parser.parse_args() - args.input, args.output - return args.input, args.output - - -if __name__ == "__main__": - datafile, resultfile = menu() - data = read_data(datafile) - save_to_spreadsheet(resultfile, data) diff --git a/data_science/runEval.py b/data_science/runEval.py deleted file mode 100644 index fec0c1a..0000000 --- a/data_science/runEval.py +++ /dev/null @@ -1,225 +0,0 @@ -#!/usr/bin/env python - -""" - Run svm_light, parse its stdout, calculate - ML scores, HDFS copy data to local. -""" - -import sys -import os -import getpass -import subprocess -import shutil -import math - - -def delete_dir(dir_path): - ''' - Remove a directory. - - Args: - dir_path: full path to the directory. - ''' - if os.path.isdir(dir_path): - shutil.rmtree(dir_path) - - -def usage(): - ''' - Handle the CLI arguments. - ''' - args = sys.argv - if len(args) != 3: - print("Usage: ./runEval ") - sys.exit(2) - return args[1], args[2] - - -def create_dir(dir_path): - ''' - Create a a directory. - - Args: - dir_path: full path to the directory. - ''' - if not os.path.exists(dir_path): - os.makedirs(dir_path) - - -def run_svm_classify(test_data, svml_model, svml_eval): - ''' - Spawn a subprocess to run svm_classify binary. - - From svm_classify.c, svm_light usage requires the following - arguments: example_file model_file output_file. - - Args: - test_data: path_to_feature/test.dat - svml_model: something like ~/data/models/svmlight/method/version/model - svml_eval: something like ~/data/models/svmlight/method/version/eval - - Returns: - Strings with stdout and stderr so that it can be parsed later. - ''' - p = subprocess.Popen(['./models/svm_classify', \ - '{0}'.format(test_data), \ - '{0}'.format(svml_model),\ - '{0}'.format(svml_eval)],\ - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - out, err = p.communicate() - return out, err - - -def paste_data(test_data, svml_eval, final_eval, svml_alpha, final_alphas, out): - ''' - Copy all eval and alpha data from results to local files. - - Args: - src and dst paths. - ''' - # Copy all eval data. - with open(test_data, 'r') as ft: - test_data = ft.readlines() - - with open(svml_eval, 'r') as fe: - eval_data = fe.readlines() - - with open(final_eval, 'a') as f: - for line in test_data: - f.write('{0}\n'.format(line)) - for line in eval_data: - f.write('{0}\n'.format(line)) - - # Copy all alpha data. - with open(svml_alpha, 'r') as fa: - alpha_data = fa.readlines() - - with open(final_alphas, 'a') as f: - for line in alpha_data: - f.write('{0} {1}\n'.format(line, out)) - - -def parse_svmlight_output(out): - ''' - Parse the svm_light stdout string for an example - - Returns: - c: counts - p: precision - r: recall - ''' - c = out.split('OK. (')[1].split(' support')[0] - pr = out.split('Precision/recall on test set: ')[1].split(' support')[0].strip() - p, r = pr.split('/') - p = float(p.strip('%').strip()) / 100 - r = float(r.strip('%').strip()) / 100 - - return c, p, r - - -def hdfs_copy_data(home_dir, method, version): - ''' - Run CLI HDFS commands to clean up and save data. - ''' - os.system('hdfs dfs -rm /data/shared/structdata/modelOutput/{0}/{1}/scores'.format(method, version)) - os.system('hdfs dfs -rm /data/shared/structdata/modelOutput/{0}/{1}/alphas'.format(method, version)) - - os.system('hdfs dfs -mkdir /data/shared/structdata/modelOutput/{0}/{1}'.format(method, version)) - - os.system('hdfs dfs -copyFromLocal {0}/data/eval/{1}/{2}/alphas \ - /data/shared/structdata/modelOutput/{3}/{4}/alphas'.format(home_dir, version, method, method, version)) - - os.system('hdfs dfs -copyFromLocal {0}/data/eval/{1}/{2}/eval \ - /data/shared/structdata/modelOutput/{3}/{4}/scores'.format(home_dir, version, method, method, version)) - -def calculate_scores(list_of_scores): - ''' - Calculate the mean of a given list of scores, - taking care of any nan or 0 division. - ''' - c, score = 0, 0 - for i in list_of_scores: - if not math.isnan(i): - c += 1 - score += i - if c > 0: - return score / c - else: - return 0 - - -def calculate_f1(precision, recall): - ''' - Calculates the f1-score as the harmonic - mean of precision and recall. - ''' - if precision + recall < 1: - return 0 - else: - return 2 / (1/precision + 1/recall) - - -if __name__ == '__main__': - - # Grab the CLI arguments. - METHOD, VERSION = usage() - - # Setup output dirs. - home_dir = os.path.join('/home', getpass.getuser()) - final_dir = os.path.join(home_dir, 'data/eval', VERSION, METHOD) - final_alphas = os.path.join(final_dir, 'alphas') - final_eval = os.path.join(final_dir, 'eval') - - delete_dir(final_alphas) - delete_dir(final_eval) - create_dir(final_dir) - - # Loop over the attributes and features. - training_data_dir = os.path.join(home_dir, 'data/training_data/', VERSION, METHOD) - - for attribute in os.listdir(training_data_dir): - - attribute_path = os.path.join(training_data_dir, attribute) - counts = 0 - precision, recall = [], [] - - for feature in os.listdir(attribute_path): - - # Create all the paths in use. - out = os.path.join(VERSION, METHOD, attribute, feature) - svmlight = os.path.join(home_dir,'data/models/svmlight', out) - svml_model = os.path.join(svmlight, 'model') - svml_eval = os.path.join(svmlight, 'eval') - svml_alpha = os.path.join(svmlight, 'alphas') - test_data = os.path.join(attribute_path, feature, 'test.dat') - - # Run svm_classify. - out, err = run_svm_classify(test_data, svml_model, svml_eval) - - # Save current results. - paste_data(test_data, svml_eval, final_eval, svml_alpha, final_alphas, out) - - # Parse output from svm_classify to print to stdout. - if err: - print('Error: {0}'.format(err)) - - # Get Train counts, Test counts, Accuracy, Precision, Recall. - c, p ,r = parse_svmlight_output(out) - - counts += int(c) - precision.append(p) - recall.append(r) - - attribute_precision = calculate_scores(precision) - attribute_recall = calculate_scores(recall) - attribute_f1 = calculate_f1(attribute_precision, attribute_recall) - - print("{: <20} Counts: {: <20} Precision: {: <20} Recall: {: <20} F1-score: {: <20}".format(attribute.title(), \ - counts, round(attribute_precision, 4), round(attribute_recall, 4), round(attribute_f1, 4))) - - - # Copying results from remote hdfs. - print("\nCopying results to hdfs") - hdfs_copy_data(home_dir, METHOD, VERSION) - print("\nDone!".format())