Fighting for silly scripts that can be easily copied from the internet, instead of doing good for the world. I am just sorry for the way people behave.

2025-11-24 07:53:25 -05:00 · 2018-06-27 11:08:03 -07:00 · 2018-06-27 11:08:03 -07:00 · 69ccfd04d5
commit 69ccfd04d5
parent 2e7dda347c
3 changed files with 0 additions and 320 deletions
--- a/data_science/export_csv_reports.py
+++ b/data_science/export_csv_reports.py
@@ -1,50 +0,0 @@
-#!/usr/bin/env python
-"""
-    Export data in a CSV spreadsheet.
-
-    Marina von Steinkirch - 2017
-
-    Need to have argparse installed:
-    $ pip install argparse
-"""
-
-import sys
-import argparse
-from pandas import DataFrame
-
-
-def read_data(data):
-    lines = data.readlines()
-
-    feature, precision, recall, f1 = [], [], [], []
-    for line in lines:
-        line_clean = line.strip().split(",")
-        feature.append(line_clean[0])
-        precision.append(line_clean[1])
-        recall.append(line_clean[4])
-        f1.append(line_clean[6])
-    return feature, precision, recall, f1
-
-
-def save_to_spreadsheet(resultfile, data):
-    try:
-        df = DataFrame({'Feature': data[0], 'Precision': data[1], 'Recall': data[2], 'f1-score': data[3]})
-        df.to_csv(resultfile, index=False)
-        print("Spreadsheet saved at {0}".format(resultfile))
-    except:
-        print("Error: {0}".format(sys.exc_info()[0]))
-
-
-def menu():
-    parser = argparse.ArgumentParser(description='Copy data results into a spreadsheet.')
-    parser.add_argument('-s', dest='input', type=argparse.FileType('r'), required=True, help="File with the results.")
-    parser.add_argument('-d', dest='output', required=True, help="The name of the file to save the spreadsheet.")
-    args = parser.parse_args()
-    args.input, args.output
-    return args.input, args.output
-
-
-if __name__ == "__main__":
-    datafile, resultfile = menu()
-    data = read_data(datafile)
-    save_to_spreadsheet(resultfile, data)
--- a/data_science/export_results.py
+++ b/data_science/export_results.py
@@ -1,45 +0,0 @@
-#!/usr/bin/env python
-# Need to have argparse installed:
-# $ pip install argparse
-
-import sys
-import os.path
-import argparse
-from pandas import DataFrame
-
-
-def read_data(data):
-    lines = data.readlines()
-
-    feature, precision, recall, f1 = [], [], [], []
-    for line in lines:
-        line_clean = line.strip().split(",")
-        feature.append(line_clean[0])
-        precision.append(line_clean[1])
-        recall.append(line_clean[4])
-        f1.append(line_clean[6])
-    return feature, precision, recall, f1
-
-
-def save_to_spreadsheet(resultfile, data):
-    try:
-        df = DataFrame({'Feature': data[0], 'Precision': data[1], 'Recall': data[2], 'f1-score': data[3]})
-        df.to_csv(resultfile, index=False)
-        print("Spreadsheet saved at {0}".format(resultfile))
-    except:
-        print("Error: {0}".format(sys.exc_info()[0]))
-
-
-def menu():
-    parser = argparse.ArgumentParser(description='Copy data results into a spreadsheet.')
-    parser.add_argument('-s', dest='input', type=argparse.FileType('r'), required=True, help="File with the results.")
-    parser.add_argument('-d', dest='output', required=True, help="The name of the file to save the spreadsheet.")
-    args = parser.parse_args()
-    args.input, args.output
-    return args.input, args.output
-
-
-if __name__ == "__main__":
-    datafile, resultfile = menu()
-    data = read_data(datafile)
-    save_to_spreadsheet(resultfile, data)
--- a/data_science/runEval.py
+++ b/data_science/runEval.py
@@ -1,225 +0,0 @@
-#!/usr/bin/env python
-
-"""
-    Run svm_light, parse its stdout, calculate
-    ML scores, HDFS copy data to local.
-"""
-
-import sys
-import os
-import getpass
-import subprocess
-import shutil
-import math
-
-
-def delete_dir(dir_path):
-    '''
-        Remove a directory.
-
-        Args:
-            dir_path: full path to the directory.
-    '''
-    if os.path.isdir(dir_path):
-        shutil.rmtree(dir_path)
-
-
-def usage():
-    '''
-        Handle the CLI arguments.
-    '''
-    args = sys.argv
-    if len(args) != 3:
-        print("Usage: ./runEval <method> <version>")
-        sys.exit(2)
-    return args[1], args[2]
-
-
-def create_dir(dir_path):
-    '''
-        Create a a directory.
-
-        Args:
-            dir_path: full path to the directory.
-    '''
-    if not os.path.exists(dir_path):
-        os.makedirs(dir_path)
-
-
-def run_svm_classify(test_data, svml_model, svml_eval):
-    '''
-        Spawn a subprocess to run svm_classify binary.
-
-        From svm_classify.c, svm_light usage requires the following
-        arguments: example_file model_file output_file.
-
-        Args:
-            test_data: path_to_feature/test.dat
-            svml_model: something like ~/data/models/svmlight/method/version/model
-            svml_eval: something like ~/data/models/svmlight/method/version/eval
-
-        Returns:
-            Strings with stdout and stderr so that it can be parsed later.
-    '''
-    p = subprocess.Popen(['./models/svm_classify', \
-            '{0}'.format(test_data), \
-            '{0}'.format(svml_model),\
-            '{0}'.format(svml_eval)],\
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE)
-    out, err = p.communicate()
-    return out, err
-
-
-def paste_data(test_data, svml_eval, final_eval, svml_alpha, final_alphas, out):
-    '''
-        Copy all eval and alpha data from results to local files.
-
-        Args:
-            src and dst paths.
-    '''
-    # Copy all eval data.
-    with open(test_data, 'r') as ft:
-        test_data = ft.readlines()
-
-    with open(svml_eval, 'r') as fe:
-        eval_data = fe.readlines()
-
-    with open(final_eval, 'a') as f:
-        for line in test_data:
-            f.write('{0}\n'.format(line))
-        for line in eval_data:
-            f.write('{0}\n'.format(line))
-
-    # Copy all alpha data.
-    with open(svml_alpha, 'r') as fa:
-        alpha_data = fa.readlines()
-
-    with open(final_alphas, 'a') as f:
-        for line in alpha_data:
-            f.write('{0}     {1}\n'.format(line, out))
-
-
-def parse_svmlight_output(out):
-    '''
-        Parse the svm_light stdout string for an example
-
-        Returns:
-            c: counts
-            p: precision
-            r: recall
-    '''
-    c = out.split('OK. (')[1].split(' support')[0]
-    pr = out.split('Precision/recall on test set: ')[1].split(' support')[0].strip()
-    p, r = pr.split('/')
-    p = float(p.strip('%').strip()) / 100
-    r = float(r.strip('%').strip()) / 100
-
-    return c, p, r
-
-
-def hdfs_copy_data(home_dir, method, version):
-    '''
-        Run CLI HDFS commands to clean up and save data.
-    '''
-    os.system('hdfs dfs -rm /data/shared/structdata/modelOutput/{0}/{1}/scores'.format(method, version))
-    os.system('hdfs dfs -rm /data/shared/structdata/modelOutput/{0}/{1}/alphas'.format(method, version))
-
-    os.system('hdfs dfs -mkdir /data/shared/structdata/modelOutput/{0}/{1}'.format(method, version))
-
-    os.system('hdfs dfs -copyFromLocal {0}/data/eval/{1}/{2}/alphas \
-            /data/shared/structdata/modelOutput/{3}/{4}/alphas'.format(home_dir, version, method, method, version))
-
-    os.system('hdfs dfs -copyFromLocal {0}/data/eval/{1}/{2}/eval \
-            /data/shared/structdata/modelOutput/{3}/{4}/scores'.format(home_dir, version, method, method, version))
-
-def calculate_scores(list_of_scores):
-    '''
-        Calculate the mean of a given list of scores,
-        taking care of any nan or 0 division.
-    '''
-    c, score = 0, 0
-    for i in list_of_scores:
-        if not math.isnan(i):
-            c += 1
-            score += i
-    if c > 0:
-        return score / c
-    else:
-        return 0
-
-
-def calculate_f1(precision, recall):
-    '''
-        Calculates the f1-score as the harmonic
-        mean of precision and recall.
-    '''
-    if precision + recall < 1:
-        return 0
-    else:
-        return  2 / (1/precision + 1/recall)
-
-
-if __name__ == '__main__':
-
-    # Grab the CLI arguments.
-    METHOD, VERSION = usage()
-
-    # Setup output dirs.
-    home_dir = os.path.join('/home', getpass.getuser())
-    final_dir = os.path.join(home_dir, 'data/eval', VERSION, METHOD)
-    final_alphas = os.path.join(final_dir, 'alphas')
-    final_eval = os.path.join(final_dir, 'eval')
-
-    delete_dir(final_alphas)
-    delete_dir(final_eval)
-    create_dir(final_dir)
-
-    # Loop over the attributes and features.
-    training_data_dir = os.path.join(home_dir, 'data/training_data/', VERSION, METHOD)
-
-    for attribute in os.listdir(training_data_dir):
-
-        attribute_path = os.path.join(training_data_dir, attribute)
-        counts = 0
-        precision, recall = [], []
-
-        for feature in os.listdir(attribute_path):
-
-            # Create all the paths in use.
-            out = os.path.join(VERSION, METHOD, attribute, feature)
-            svmlight = os.path.join(home_dir,'data/models/svmlight', out)
-            svml_model =  os.path.join(svmlight, 'model')
-            svml_eval = os.path.join(svmlight, 'eval')
-            svml_alpha = os.path.join(svmlight, 'alphas')
-            test_data = os.path.join(attribute_path, feature, 'test.dat')
-
-            # Run svm_classify.
-            out, err = run_svm_classify(test_data, svml_model, svml_eval)
-
-            # Save current results.
-            paste_data(test_data, svml_eval, final_eval, svml_alpha, final_alphas, out)
-
-            # Parse output from svm_classify to print to stdout.
-            if err:
-                print('Error: {0}'.format(err))
-
-            # Get Train counts, Test counts, Accuracy, Precision, Recall.
-            c, p ,r = parse_svmlight_output(out)
-
-            counts += int(c)
-            precision.append(p)
-            recall.append(r)
-
-        attribute_precision = calculate_scores(precision)
-        attribute_recall = calculate_scores(recall)
-        attribute_f1 = calculate_f1(attribute_precision, attribute_recall)
-
-        print("{: <20} Counts: {: <20} Precision: {: <20} Recall: {: <20} F1-score: {: <20}".format(attribute.title(), \
-            counts, round(attribute_precision, 4), round(attribute_recall, 4), round(attribute_f1, 4)))
-
-
-    # Copying results from remote hdfs.
-    print("\nCopying results to hdfs")
-    hdfs_copy_data(home_dir, METHOD, VERSION)
-    print("\nDone!".format())