mirror of
https://github.com/autistic-symposium/shell-whiz-toolkit.git
synced 2025-05-10 10:44:59 -04:00
226 lines
6.5 KiB
Python
226 lines
6.5 KiB
Python
#!/usr/bin/env python
|
|
|
|
"""
|
|
Run svm_light, parse its stdout, calculate
ML scores, and copy the resulting local data to HDFS.
|
|
"""
|
|
|
|
import sys
|
|
import os
|
|
import getpass
|
|
import subprocess
|
|
import shutil
|
|
import math
|
|
|
|
|
|
def delete_dir(dir_path):
    '''
        Recursively delete a directory tree.

        Anything that is not an existing directory (a missing path,
        a regular file) is left untouched.

        Args:
            dir_path: full path to the directory.
    '''
    if not os.path.isdir(dir_path):
        return
    shutil.rmtree(dir_path)
|
|
|
|
|
|
def usage():
    '''
        Read the two positional CLI arguments from sys.argv.

        Returns:
            A (method, version) tuple when exactly two arguments
            were given. Otherwise the usage line is printed and the
            process exits with status 2.
    '''
    argv = sys.argv
    if len(argv) == 3:
        return argv[1], argv[2]

    print("Usage: ./runEval <method> <version>")
    sys.exit(2)
|
|
|
|
|
|
def create_dir(dir_path):
    '''
        Create a directory, including any missing parents.

        Nothing happens when the path already exists.

        Args:
            dir_path: full path to the directory.
    '''
    if os.path.exists(dir_path):
        return
    os.makedirs(dir_path)
|
|
|
|
|
|
def run_svm_classify(test_data, svml_model, svml_eval):
    '''
        Spawn a subprocess to run the svm_classify binary.

        From svm_classify.c, svm_light usage requires the following
        arguments: example_file model_file output_file.

        The binary is looked up as ./models/svm_classify, i.e. relative
        to the current working directory.

        Args:
            test_data: path_to_feature/test.dat
            svml_model: something like ~/data/models/svmlight/method/version/model
            svml_eval: something like ~/data/models/svmlight/method/version/eval

        Returns:
            A (stdout, stderr) tuple of text strings so that it can be
            parsed later.
    '''
    command = ['./models/svm_classify']
    command.extend('{0}'.format(arg)
                   for arg in (test_data, svml_model, svml_eval))

    # universal_newlines=True makes communicate() hand back text instead of
    # bytes on Python 3; without it the str-based parsing of the output
    # fails with a TypeError. On Python 2 the result is a str either way.
    p = subprocess.Popen(command,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE,
                         universal_newlines=True)
    out, err = p.communicate()
    return out, err
|
|
|
|
|
|
def paste_data(test_data, svml_eval, final_eval, svml_alpha, final_alphas, out):
    '''
        Append eval and alpha data from one result to the local
        aggregate files.

        Args:
            test_data: path of the test examples file to read.
            svml_eval: path of the svm_classify output file to read.
            final_eval: path of the aggregate eval file; the test lines
                followed by the eval lines are appended to it.
            svml_alpha: path of the alphas file to read.
            final_alphas: path of the aggregate alphas file; every alpha
                line is appended to it as "<line> <out>".
            out: tag written after each alpha line.
    '''
    def _read_lines(path):
        # Drop the line terminator so that exactly one newline is written
        # back; keeping it produced a blank line after every copied line
        # and pushed the alpha tag onto the following line.
        with open(path, 'r') as handle:
            return [line.rstrip('\n') for line in handle]

    # Copy all eval data.
    test_lines = _read_lines(test_data)
    eval_lines = _read_lines(svml_eval)

    with open(final_eval, 'a') as f:
        for line in test_lines + eval_lines:
            f.write('{0}\n'.format(line))

    # Copy all alpha data.
    alpha_lines = _read_lines(svml_alpha)

    with open(final_alphas, 'a') as f:
        for line in alpha_lines:
            f.write('{0} {1}\n'.format(line, out))
|
|
|
|
|
|
def parse_svmlight_output(out):
    '''
        Parse the svm_light stdout for one example.

        The text is expected to contain "OK. (<n> support" and a
        "Precision/recall on test set: <p>%/<r>%" line.

        Args:
            out: stdout of svm_classify, as text or as raw bytes.

        Returns:
            c: counts, the text between "OK. (" and " support"
               (returned as a string, not converted).
            p: precision as a fraction (percentage / 100).
            r: recall as a fraction (percentage / 100).

        Raises:
            IndexError: if one of the two markers is missing.
            ValueError: if the precision/recall pair cannot be parsed.
    '''
    # Pipes opened without text mode yield bytes on Python 3; the markers
    # below are text, so normalise first.
    if isinstance(out, bytes):
        out = out.decode('utf-8', 'replace')

    c = out.split('OK. (')[1].split(' support')[0]

    # Only the remainder of the marker's own line holds the two figures.
    after_marker = out.split('Precision/recall on test set: ')[1]
    pr = after_marker.strip().split('\n')[0].strip()

    p, r = pr.split('/')
    p = float(p.strip('%').strip()) / 100
    r = float(r.strip('%').strip()) / 100

    return c, p, r
|
|
|
|
|
|
def hdfs_copy_data(home_dir, method, version):
    '''
        Run CLI HDFS commands to clean up and save data.

        Removes the previous scores/alphas files under
        /data/shared/structdata/modelOutput/<method>/<version>, makes sure
        that directory exists, then uploads the local
        <home_dir>/data/eval/<version>/<method>/alphas and .../eval files
        as "alphas" and "scores".

        Args:
            home_dir: local base directory holding data/eval.
            method: method name used in both local and remote paths.
            version: version name used in both local and remote paths.
    '''
    remote_dir = '/data/shared/structdata/modelOutput/{0}/{1}'.format(method, version)
    local_dir = '{0}/data/eval/{1}/{2}'.format(home_dir, version, method)

    commands = [
        # -f: a missing file on a first run is not an error.
        ['hdfs', 'dfs', '-rm', '-f', remote_dir + '/scores'],
        ['hdfs', 'dfs', '-rm', '-f', remote_dir + '/alphas'],
        # -p: create missing parents and tolerate an existing directory.
        ['hdfs', 'dfs', '-mkdir', '-p', remote_dir],
        ['hdfs', 'dfs', '-copyFromLocal', local_dir + '/alphas', remote_dir + '/alphas'],
        ['hdfs', 'dfs', '-copyFromLocal', local_dir + '/eval', remote_dir + '/scores'],
    ]

    for command in commands:
        # Argument lists bypass the shell, so method/version (which come
        # from the command line) are passed verbatim as single arguments.
        try:
            subprocess.call(command)
        except OSError as exc:
            # Keep this step best-effort, as it was with os.system: a
            # missing hdfs client is reported, not raised.
            print('Error: could not run {0}: {1}'.format(' '.join(command), exc))
            return
|
|
|
|
def calculate_scores(list_of_scores):
    '''
        Average the scores in a list, ignoring NaN entries.

        Returns 0 when the list is empty or holds only NaN values,
        which also avoids dividing by zero.
    '''
    usable = [value for value in list_of_scores if not math.isnan(value)]
    if not usable:
        return 0

    total = 0
    for value in usable:
        total += value
    return total / len(usable)
|
|
|
|
|
|
def calculate_f1(precision, recall):
    '''
        Calculates the f1-score as the harmonic
        mean of precision and recall.

        Args:
            precision: precision as a fraction.
            recall: recall as a fraction.

        Returns:
            2 * precision * recall / (precision + recall), or 0 when
            either value is not positive (the harmonic mean is then
            undefined or zero).
    '''
    # Guard on the individual values, not on their sum: the former
    # "precision + recall < 1" test zeroed valid pairs such as (0.3, 0.4)
    # and still divided by zero for (1, 0).
    if precision <= 0 or recall <= 0:
        return 0
    return 2.0 * precision * recall / (precision + recall)
|
|
|
|
|
|
if __name__ == '__main__':

    # Grab the CLI arguments.
    METHOD, VERSION = usage()

    # Setup output dirs.
    home_dir = os.path.join('/home', getpass.getuser())
    final_dir = os.path.join(home_dir, 'data/eval', VERSION, METHOD)
    final_alphas = os.path.join(final_dir, 'alphas')
    final_eval = os.path.join(final_dir, 'eval')

    # paste_data() opens both paths in append mode, i.e. as regular files.
    # delete_dir() only removes directories, so files left over from a
    # previous run must be removed here or the new results get appended
    # to the old ones.
    for stale_path in (final_alphas, final_eval):
        delete_dir(stale_path)
        if os.path.isfile(stale_path):
            os.remove(stale_path)
    create_dir(final_dir)

    # Loop over the attributes and features.
    training_data_dir = os.path.join(home_dir, 'data/training_data/', VERSION, METHOD)

    for attribute in os.listdir(training_data_dir):

        attribute_path = os.path.join(training_data_dir, attribute)
        counts = 0
        precision, recall = [], []

        for feature in os.listdir(attribute_path):

            # Create all the paths in use.
            label = os.path.join(VERSION, METHOD, attribute, feature)
            svmlight = os.path.join(home_dir, 'data/models/svmlight', label)
            svml_model = os.path.join(svmlight, 'model')
            svml_eval = os.path.join(svmlight, 'eval')
            svml_alpha = os.path.join(svmlight, 'alphas')
            test_data = os.path.join(attribute_path, feature, 'test.dat')

            # Run svm_classify.
            stdout, err = run_svm_classify(test_data, svml_model, svml_eval)

            # Save current results. The alpha lines are tagged with the
            # version/method/attribute/feature label; this used to share
            # one variable with the svm_classify stdout, so the whole
            # stdout was written as the tag instead.
            paste_data(test_data, svml_eval, final_eval, svml_alpha, final_alphas, label)

            # Report anything svm_classify wrote to stderr.
            if err:
                print('Error: {0}'.format(err))

            # Get counts, precision and recall from the svm_classify stdout.
            c, p, r = parse_svmlight_output(stdout)

            counts += int(c)
            precision.append(p)
            recall.append(r)

        attribute_precision = calculate_scores(precision)
        attribute_recall = calculate_scores(recall)
        attribute_f1 = calculate_f1(attribute_precision, attribute_recall)

        print("{: <20} Counts: {: <20} Precision: {: <20} Recall: {: <20} F1-score: {: <20}".format(
            attribute.title(),
            counts,
            round(attribute_precision, 4),
            round(attribute_recall, 4),
            round(attribute_f1, 4)))

    # Copy the local results to HDFS.
    print("\nCopying results to hdfs")
    hdfs_copy_data(home_dir, METHOD, VERSION)
    print("\nDone!")
|
|
|