Commit
checked and corrected legacy eda runner
raptor419 committed May 14, 2024
1 parent a145daa commit 8a912fc
Showing 6 changed files with 21 additions and 19 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -33,3 +33,4 @@ run_configs/cedars_amd.cfg
 data/AMD_Final/*
 data/PLCO/*
 run_configs/cedars_plco.cfg
+test*.cfg
1 change: 1 addition & 0 deletions requirements.txt
@@ -18,6 +18,7 @@ xgboost
 lightgbm
 catboost
 gplearn
+group-lasso
 ipython
 fpdf
 scikit-XCS
8 changes: 2 additions & 6 deletions streamline/legacy/EDAJobSubmit.py
@@ -6,18 +6,14 @@
 SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
 sys.path.append(str(Path(SCRIPT_DIR).parent.parent))
 
-from streamline.dataprep.data_process import DataProcess
-from streamline.dataprep.kfold_partitioning import KFoldPartitioner
-from streamline.utils.dataset import Dataset
-from streamline.utils.parser_helpers import process_cli_param
 
+from streamline.dataprep.data_process import DataProcess
 
 def run_cluster(argv):
     param_path = argv[1]
-    with open(param_path, "rb") as input_file:
-        params = pickle.load(input_file)
+    params = open(param_path)
     locals().update(params)
     globals().update(params)
 
 
     dataset = Dataset(dataset_path, outcome_label, match_label, instance_label, outcome_type)
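A note on the two update calls kept as context above: in CPython, locals().update(params) inside a function has no effect on name resolution, because function locals live in a fixed slot array rather than in the dict that locals() returns; it is the globals().update(params) line that actually makes keys such as dataset_path resolvable further down. A minimal standalone sketch of that behavior (demo and injected_key are hypothetical names, not STREAMLINE code):

    def demo(params):
        locals().update(params)     # no effect: CPython ignores writes to a function's locals() dict
        globals().update(params)    # injects the keys as module-level names
        return injected_key         # never assigned locally, so it resolves through globals()

    print(demo({"injected_key": 42}))   # prints 42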
14 changes: 9 additions & 5 deletions streamline/runners/dataprocess_runner.py
@@ -274,13 +274,17 @@ def save_metadata(self):
         # return cluster_params
 
     def get_cluster_params(self, dataset_path):
+        extra_kwargs = locals()
+        extra_kwargs.pop('self')
         job_ref = str(time.time())
         params = {}
         for param in dir(self):
-            if not param.startswith("__"):
+            if not (param.startswith("__") or 'bound method' in str(getattr(self, param))):
                 params[param] = getattr(self, param)
-        params[dataset_path] = dataset_path
-        pickle.dump(params, open(self.output_path + '/' + self.experiment_name + '/jobs/P1_' + job_ref + '_params.pickle', 'wb'))
+        for param in extra_kwargs:
+            params[param] = extra_kwargs[param]
+        with open(self.output_path + '/' + self.experiment_name + '/jobs/P1_' + job_ref + '_params.pickle', 'wb') as f:
+            pickle.dump(params, f)
         return job_ref
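Two fixes land in this hunk: the old params[dataset_path] = dataset_path stored the argument under its own value rather than under the key 'dataset_path', which the locals()-based extra_kwargs now handles correctly, and the widened condition keeps bound methods out of the pickle. Pickling a bound method either fails outright or drags the whole runner instance into the file, depending on Python version. The committed test ('bound method' in str(...)) is a string heuristic on the repr; a sketch of the same filter written with inspect (collect_params is a hypothetical stand-in for get_cluster_params):

    import inspect

    def collect_params(obj):
        params = {}
        for name in dir(obj):
            if name.startswith("__"):
                continue                  # skip dunders
            value = getattr(obj, name)
            if inspect.ismethod(value):
                continue                  # skip bound methods instead of matching their repr
            params[name] = value
        return params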


@@ -302,7 +306,7 @@ def submit_slurm_cluster_job(self, dataset_path):
 
         file_path = str(Path(__file__).parent.parent.parent) + "/streamline/legacy" + '/EDAJobSubmit.py'
 
-        command = ' '.join(['srun', 'python', file_path] + (self.output_path + '/' + self.experiment_name + '/jobs/P1_' + job_ref + '_params.pickle'))
+        command = ' '.join(['srun', 'python', file_path] + [self.output_path + '/' + self.experiment_name + '/jobs/P1_' + job_ref + '_params.pickle'])
         sh_file.write(command + '\n')
         sh_file.close()
         os.system('sbatch ' + job_name)
@@ -324,7 +328,7 @@ def submit_lsf_cluster_job(self, dataset_path):
                       '/logs/P1_' + job_ref + '.e\n')
 
         file_path = str(Path(__file__).parent.parent.parent) + "/streamline/legacy" + '/EDAJobSubmit.py'
-        command = ' '.join(['python', file_path] + (self.output_path + '/' + self.experiment_name + '/jobs/P1_' + job_ref + '_params.pickle'))
+        command = ' '.join(['python', file_path] + [self.output_path + '/' + self.experiment_name + '/jobs/P1_' + job_ref + '_params.pickle'])
         sh_file.write(command + '\n')
         sh_file.close()
         os.system('bsub < ' + job_name)
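Both submission paths get the same one-character fix: with parentheses, the pickle path stays a bare string, and list + str raises TypeError before the command is ever written, so legacy jobs could not be submitted; with brackets it becomes a one-element list and the join succeeds. A quick illustration (the paths are hypothetical):

    parts = ['srun', 'python', 'EDAJobSubmit.py']          # hypothetical command prefix
    pickle_path = 'jobs/P1_1715700000.0_params.pickle'     # hypothetical params file

    # ' '.join(parts + (pickle_path))   # TypeError: can only concatenate list (not "str") to list
    command = ' '.join(parts + [pickle_path])
    print(command)   # srun python EDAJobSubmit.py jobs/P1_1715700000.0_params.pickle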
4 changes: 2 additions & 2 deletions streamline/utils/parser_helpers.py
@@ -51,9 +51,9 @@ def str2bool(v):
 
 def save_config(output_path, experiment_name, config_dict):
     if not os.path.exists(config_dict['output_path']):
-        os.mkdir(str(config_dict['output_path']))
+        os.makedirs(str(config_dict['output_path']))
     if not os.path.exists(str(config_dict['output_path']) + '/' + config_dict['experiment_name']):
-        os.mkdir(str(config_dict['output_path']) + '/' + str(config_dict['experiment_name']))
+        os.makedirs(str(config_dict['output_path']) + '/' + str(config_dict['experiment_name']))
     with open(output_path + '/' + experiment_name + '/runparams.pickle', 'wb') as file:
         pickle.dump(config_dict, file, protocol=pickle.HIGHEST_PROTOCOL)
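The mkdir-to-makedirs swap matters whenever output_path is itself nested: os.mkdir raises FileNotFoundError if any parent directory is missing, while os.makedirs creates the whole chain. A short sketch; passing exist_ok=True would also make the os.path.exists guards above unnecessary and close their check-then-create race:

    import os

    output_path = './results/experiments/run1'   # hypothetical nested path
    os.makedirs(output_path, exist_ok=True)      # creates every missing parent; no error if it exists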
12 changes: 6 additions & 6 deletions test.cfg
@@ -21,20 +21,20 @@ dataset_for_rep = './data/DemoData/hcc_data_custom.csv'
 
 [essential run parameters - phases to run - phases 1-9]
 # If True, automatically runs all phases below up until and including do_report, automatically running 'compare_dataset' only if multiple target datasets included
-do_till_report = True
+do_till_report = False
 
 # Individual phases (do_report and do_rep_report are both part of phase 9)
-do_eda = False
+do_eda = True
 do_dataprep = False
 do_feat_imp = False
 do_feat_sel = False
 do_model = False
 do_stats = False
 do_compare_dataset = False
 do_report = False
-do_replicate = True
-do_rep_report = True
-do_cleanup = True
+do_replicate = False
+do_rep_report = False
+do_cleanup = False
 
 [general - phase 1]
 cv_partitions = 3
@@ -102,7 +102,7 @@ del_old_cv = False
 
 [multiprocessing]
 run_parallel = True
-run_cluster = False
+run_cluster = "SLURMOld"
 reserved_memory = 4
 queue = 'defq'
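The config flips line up with the commit message: only do_eda runs, and run_cluster switches from the boolean False to the string "SLURMOld", which presumably selects the legacy submit_slurm_cluster_job path fixed above. A minimal sketch of reading that block with the standard-library configparser (illustrative only, not necessarily how STREAMLINE parses its .cfg files):

    import configparser

    config = configparser.ConfigParser()
    config.read('test.cfg')   # assumes the file shown above is on disk

    if config.has_section('multiprocessing'):
        # configparser returns raw strings, so one option can serve as both
        # an on/off flag and a backend selector ("SLURMOld", "LSF", False).
        run_cluster = config['multiprocessing']['run_cluster'].strip('\'"')
        if run_cluster != 'False':
            print('submitting via cluster backend:', run_cluster)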
