ExploratoryAnalysisMain.py (forked from EpistasisLab/LPC)

import sys
import os
import argparse
import glob
import ExploratoryAnalysisJob
import time
import csv

'''Phase 1 of Machine Learning Analysis Pipeline:
Sample Run Command:
python ExploratoryAnalysisMain.py --data-path /Users/robert/Desktop/Datasets --output-path /Users/robert/Desktop/outputs --experiment-name test1
'''
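
# Additional usage sketches (paths are illustrative): run serially on the local machine
# instead of the cluster, or re-run later with -c/--do-check to verify that all Phase 1
# output files exist:
#   python ExploratoryAnalysisMain.py --data-path /Users/robert/Desktop/Datasets --output-path /Users/robert/Desktop/outputs --experiment-name test1 --run-parallel False
#   python ExploratoryAnalysisMain.py --data-path /Users/robert/Desktop/Datasets --output-path /Users/robert/Desktop/outputs --experiment-name test1 -c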

def main(argv):
    # Parse arguments
    parser = argparse.ArgumentParser(description="")
    # No defaults
    parser.add_argument('--data-path', dest='data_path', type=str, help='path to directory containing datasets')
    parser.add_argument('--output-path', dest='output_path', type=str, help='path to output directory')
    parser.add_argument('--experiment-name', dest='experiment_name', type=str, help='name of experiment output folder (no spaces)')
    # Defaults available (but critical to check)
    parser.add_argument('--class-label', dest='class_label', type=str, help='outcome label of all datasets', default="Class")
    parser.add_argument('--instance-label', dest='instance_label', type=str, help='label of the column containing unique instance ids (if present)', default="")
    # Defaults available (but less critical to check)
    parser.add_argument('--cv', dest='cv_partitions', type=int, help='number of CV partitions', default=10)
    parser.add_argument('--partition-method', dest='partition_method', type=str, help='S, R, or M for stratified, random, or matched partitioning, respectively', default="S")
    parser.add_argument('--match-label', dest='match_label', type=str, help='only applies when M is selected for partition-method; indicates column with matched instance ids', default="")
    parser.add_argument('--categorical-cutoff', dest='categorical_cutoff', type=int, help='number of unique values after which a variable is considered quantitative rather than categorical', default=10)
    parser.add_argument('--export-ea', dest='export_exploratory_analysis', type=str, help='run and export basic exploratory analysis files, i.e. unique value counts, missingness counts, class balance barplot', default="True")
    parser.add_argument('--export-fc', dest='export_feature_correlations', type=str, help='run and export feature correlation analysis (yields correlation heatmap)', default="True")
    parser.add_argument('--export-up', dest='export_univariate_plots', type=str, help='export univariate analysis plots (note: univariate analysis results are still output by default)', default="True")
    parser.add_argument('--random-state', dest='random_state', type=int, help='sets a specific random seed for reproducible results', default=42)
    parser.add_argument('--run-parallel', dest='run_parallel', type=str, help='if "True", submit each dataset as a separate cluster job; otherwise run jobs serially on the local machine', default="True")
    parser.add_argument('--res-mem', dest='reserved_memory', type=int, help='reserved memory for the job (in Gigabytes)', default=4)
    parser.add_argument('--max-mem', dest='maximum_memory', type=int, help='maximum memory (in Gigabytes) before the job is automatically terminated', default=15)
    parser.add_argument('-c', '--do-check', dest='do_check', help='Boolean: specify whether to check for existence of all output files', action='store_true')
    options = parser.parse_args(argv[1:])
    data_path = options.data_path
    output_path = options.output_path
    experiment_name = options.experiment_name
    class_label = options.class_label
    if options.instance_label == '':
        instance_label = 'None'
    else:
        instance_label = options.instance_label
    cv_partitions = options.cv_partitions
    partition_method = options.partition_method
    if options.match_label == '':
        match_label = 'None'
    else:
        match_label = options.match_label
    categorical_cutoff = options.categorical_cutoff
    export_exploratory_analysis = options.export_exploratory_analysis
    export_feature_correlations = options.export_feature_correlations
    export_univariate_plots = options.export_univariate_plots
    random_state = options.random_state
    run_parallel = options.run_parallel
    reserved_memory = options.reserved_memory
    maximum_memory = options.maximum_memory
    do_check = options.do_check
    if not do_check:
        # Check that data_path exists and the experiment name is valid & unique
        if not os.path.exists(data_path):
            raise Exception("Provided data_path does not exist")
        if os.path.exists(output_path + '/' + experiment_name):
            raise Exception("Experiment Name must be unique")
        for char in experiment_name:
            if char not in 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890_':
                raise Exception('Experiment Name must be alphanumeric (underscores allowed)')
        # Create output folder if it doesn't already exist
        if not os.path.exists(output_path):
            os.mkdir(output_path)
        # Create experiment folder, with jobs, logs, and jobsCompleted subfolders
        os.mkdir(output_path + '/' + experiment_name)
        os.mkdir(output_path + '/' + experiment_name + '/jobs')
        os.mkdir(output_path + '/' + experiment_name + '/logs')
        os.mkdir(output_path + '/' + experiment_name + '/jobsCompleted')
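        # The resulting experiment layout looks like this (folder names come from the
        # code above; the descriptions are illustrative):
        #   <output_path>/<experiment_name>/
        #       jobs/           shell scripts for submitted cluster jobs
        #       logs/           cluster stdout/stderr (.o/.e) files
        #       jobsCompleted/  completion files later scanned by --do-check
        #       metadata.csv    run settings (written below)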
        # Determine the file extension of each dataset in the target folder:
        file_count = 0
        unique_datanames = []
        for datasetFilename in glob.glob(data_path + '/*'):
            file_extension = datasetFilename.split('/')[-1].split('.')[-1]
            # Save unique dataset names so that analysis is run only once if both a
            # .txt and a .csv version of a dataset exist with the same name.
            data_name = datasetFilename.split('/')[-1].split('.')[0]
            if file_extension == 'txt' or file_extension == 'csv':
                if data_name not in unique_datanames:
                    unique_datanames.append(data_name)
                    # run_parallel is parsed as a string, so compare against 'True'
                    # explicitly; relying on truthiness would treat "False" as true.
                    if run_parallel == 'True':
                        submitClusterJob(datasetFilename, output_path + '/' + experiment_name, cv_partitions, partition_method, categorical_cutoff, export_exploratory_analysis, export_feature_correlations, export_univariate_plots, class_label, instance_label, match_label, random_state, reserved_memory, maximum_memory)
                    else:
                        submitLocalJob(datasetFilename, output_path + '/' + experiment_name, cv_partitions, partition_method, categorical_cutoff, export_exploratory_analysis, export_feature_correlations, export_univariate_plots, class_label, instance_label, match_label, random_state)
                    file_count += 1
        if file_count == 0:  # Check that there was at least one dataset
            raise Exception("There must be at least one .txt or .csv dataset in the data_path directory")
        # Save metadata to file (the with-statement closes the file automatically)
        with open(output_path + '/' + experiment_name + '/' + 'metadata.csv', mode='w') as file:
            writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            writer.writerow(["DATA LABEL", "VALUE"])
            writer.writerow(["class label", class_label])
            writer.writerow(["instance label", instance_label])
            writer.writerow(["random state", random_state])
            writer.writerow(["categorical cutoff", categorical_cutoff])
            writer.writerow(["cv partitions", cv_partitions])
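        # Illustration only (hypothetical, not executed here): a later phase could
        # reload these settings from metadata.csv with the same csv module, e.g.:
        #   with open(output_path + '/' + experiment_name + '/metadata.csv') as f:
        #       settings = dict(list(csv.reader(f))[1:])  # skip the header row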
    else:  # run job completion checks
        datasets = os.listdir(output_path + "/" + experiment_name)
        datasets.remove('logs')
        datasets.remove('jobs')
        datasets.remove('jobsCompleted')
        if 'metadata.csv' in datasets:
            datasets.remove('metadata.csv')
        if 'DatasetComparisons' in datasets:
            datasets.remove('DatasetComparisons')
        phase1Jobs = []
        for dataset in datasets:
            phase1Jobs.append('job_exploratory_' + dataset + '.txt')
        for filename in glob.glob(output_path + "/" + experiment_name + '/jobsCompleted/job_exploratory*'):
            ref = filename.split('/')[-1]
            phase1Jobs.remove(ref)
        for job in phase1Jobs:
            print(job)
        if len(phase1Jobs) == 0:
            print("All Phase 1 Jobs Completed")
        else:
            print("Above Phase 1 Jobs Not Completed")
        print()

def submitLocalJob(dataset_path, experiment_path, cv_partitions, partition_method, categorical_cutoff, export_exploratory_analysis, export_feature_correlations, export_univariate_plots, class_label, instance_label, match_label, random_state):
    # Run the exploratory analysis job serially in the current process,
    # using the ExploratoryAnalysisJob module imported above
    ExploratoryAnalysisJob.job(dataset_path, experiment_path, cv_partitions, partition_method, categorical_cutoff, export_exploratory_analysis, export_feature_correlations, export_univariate_plots, class_label, instance_label, match_label, random_state)

def submitClusterJob(dataset_path, experiment_path, cv_partitions, partition_method, categorical_cutoff, export_exploratory_analysis, export_feature_correlations, export_univariate_plots, class_label, instance_label, match_label, random_state, reserved_memory, maximum_memory):
    # Write an LSF (bsub) job script to the jobs folder and submit it to the cluster
    job_ref = str(time.time())
    job_name = experiment_path + '/jobs/P1_' + job_ref + '_run.sh'
    sh_file = open(job_name, 'w')
    sh_file.write('#!/bin/bash\n')
    sh_file.write('#BSUB -q i2c2_normal' + '\n')  # alternative queue: doi_normal
    sh_file.write('#BSUB -J ' + job_ref + '\n')
    sh_file.write('#BSUB -R "rusage[mem=' + str(reserved_memory) + 'G]"' + '\n')
    sh_file.write('#BSUB -M ' + str(maximum_memory) + 'GB' + '\n')
    sh_file.write('#BSUB -o ' + experiment_path + '/logs/P1_' + job_ref + '.o\n')
    sh_file.write('#BSUB -e ' + experiment_path + '/logs/P1_' + job_ref + '.e\n')
    this_file_path = os.path.dirname(os.path.realpath(__file__))
    sh_file.write('python ' + this_file_path + '/ExploratoryAnalysisJob.py ' + dataset_path + " " + experiment_path + " " + str(cv_partitions) +
                  " " + partition_method + " " + str(categorical_cutoff) + " " + export_exploratory_analysis +
                  " " + export_feature_correlations + " " + export_univariate_plots + " " + class_label + " " + instance_label + " " + match_label +
                  " " + str(random_state) + '\n')
    sh_file.close()
    os.system('bsub < ' + job_name)
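
# For reference, a script generated by submitClusterJob looks roughly like the
# following (job reference, paths, and argument values are illustrative; the
# argument values shown are this script's defaults):
#   #!/bin/bash
#   #BSUB -q i2c2_normal
#   #BSUB -J 1614200000.0
#   #BSUB -R "rusage[mem=4G]"
#   #BSUB -M 15GB
#   #BSUB -o /Users/robert/Desktop/outputs/test1/logs/P1_1614200000.0.o
#   #BSUB -e /Users/robert/Desktop/outputs/test1/logs/P1_1614200000.0.e
#   python /path/to/ExploratoryAnalysisJob.py /Users/robert/Desktop/Datasets/mydata.csv /Users/robert/Desktop/outputs/test1 10 S 10 True True True Class None None 42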

if __name__ == '__main__':
    sys.exit(main(sys.argv))