Corrected all minus report

UrbsLab · May 14, 2024 · cbb01d7 · cbb01d7
1 parent 8a912fc
commit cbb01d7
Show file tree

Hide file tree

Showing 23 changed files with 344 additions and 317 deletions.
diff --git a/streamline/dataprep/data_process.py b/streamline/dataprep/data_process.py
@@ -67,10 +67,19 @@ def __init__(self, dataset, experiment_path, ignore_features=None,
         """
         super().__init__()
         if type(dataset) != Dataset:
-            raise (Exception("dataset input is not of type Dataset"))
-        self.dataset = dataset
-        self.outcome_type = dataset.outcome_type
-        self.dataset_path = dataset.path
+            try:  # non-Dataset input: accept a 5-tuple of Dataset constructor arguments instead
+                assert(type(dataset) == tuple)  # NOTE(review): assert is skipped under python -O; a failure lands in the except below
+                dataset_path, outcome_label, match_label, instance_label, outcome_type =  dataset
+                self.dataset = Dataset(dataset_path, outcome_label, match_label, instance_label, outcome_type, load_data=False)  # loading is deferred; run() calls self.dataset.load_data()
+                self.load_data = False  # tells run() that the data still has to be loaded
+                self.dataset_path = dataset.path  # NOTE(review): dataset is still the input tuple here, so .path raises AttributeError and this branch always ends in the except below; dataset_path or self.dataset.path was probably intended
+            except Exception as e:
+                raise (Exception("dataset input is invalid " + str(e)))  # any failure above is re-raised as a generic Exception
+        else:
+            self.dataset = dataset
+            self.outcome_type = dataset.outcome_type
+            self.dataset_path = dataset.path
+            self.load_data = True  # run() skips the load_data() call when this is True
         self.experiment_path = experiment_path
         self.random_state = random_state
 
@@ -164,6 +173,9 @@ def run(self, top_features=20):
 
         """
         self.job_start_time = time.time()
+        if not self.load_data:
+            self.dataset.load_data()
+            self.outcome_type = self.dataset.outcome_type
 
         # Conduct Exploratory Analysis, Data Cleaning, and Feature Engineering
         self.run_process(top_features)

diff --git a/streamline/legacy/CompareJobSubmit.py b/streamline/legacy/CompareJobSubmit.py
@@ -10,14 +10,10 @@
 
 
 def run_cluster(argv):
-    output_path = argv[1]
-    experiment_name = argv[2]
-    experiment_path = argv[3] if argv[3] != "None" else None
-    outcome_type = argv[4]
-    outcome_label = argv[5]
-    instance_label = argv[6] if argv[6] != "None" else None
-    sig_cutoff = float(argv[7])
-    show_plots = eval(argv[8])
+    param_path = argv[1]
+    with open(param_path, "rb") as input_file:
+        params = pickle.load(input_file)
+    globals().update(params)
 
     job_obj = CompareJob(output_path, experiment_name, experiment_path,
                          outcome_label, outcome_type, instance_label, sig_cutoff, show_plots)

diff --git a/streamline/legacy/DataJobSubmit.py b/streamline/legacy/DataJobSubmit.py
@@ -1,5 +1,6 @@
 import os
 import sys
+import pickle
 from pathlib import Path
 
 SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
@@ -9,16 +10,12 @@
 
 
 def run_cluster(argv):
-    cv_train_path = argv[1]
-    cv_test_path = argv[2]
-    full_path = argv[3]
-    scale_data = eval(argv[4])
-    impute_data = eval(argv[5])
-    multi_impute = eval(argv[6])
-    overwrite_cv = eval(argv[7])
-    outcome_label = argv[8] if argv[8] != "None" else None
-    instance_label = argv[9] if argv[9] != "None" else None
-    random_state = int(argv[10]) if argv[10] != "None" else None
+    param_path = argv[1]
+    with open(param_path, "rb") as input_file:
+        params = pickle.load(input_file)
+    globals().update(params)
+    full_path = output_path + "/" + experiment_name
+
 
     job_obj = ScaleAndImpute(cv_train_path, cv_test_path,
                              full_path,

diff --git a/streamline/legacy/EDAJobSubmit.py b/streamline/legacy/EDAJobSubmit.py
@@ -9,21 +9,61 @@
 from streamline.utils.dataset import Dataset
 from streamline.dataprep.data_process import DataProcess
 
+class dotdict(dict):  # dict subclass whose items can also be read and written as attributes
+    def __getattr__(self, key):  # invoked only after normal attribute lookup has failed
+        return self[key]  # NOTE(review): a missing key raises KeyError, not AttributeError, so hasattr() and getattr(obj, name, default) do not fall back
+
+    def __setattr__(self, key, value):  # every attribute assignment is stored as a dict item
+        self[key] = value
+
+def save_metadata(self):  # module-level function; "self" is any object exposing the run parameters as attributes (called below with dotdict(params))
+        metadata = dict()  # NOTE(review): no 'Filter Poor Features' key is written here although ModelJobSubmit reads it from metadata.pickle; presumably a later phase adds it, confirm
+        metadata['Data Path'] = self.data_path
+        metadata['Output Path'] = self.output_path
+        metadata['Experiment Name'] = self.experiment_name
+        metadata['Outcome Label'] = self.outcome_label
+        metadata['Outcome Type'] = self.outcome_type
+        metadata['Instance Label'] = self.instance_label
+        metadata['Match Label'] = self.match_label
+        metadata['Ignored Features'] = self.ignore_features
+        metadata['Specified Categorical Features'] = self.categorical_features
+        metadata['Specified Quantitative Features'] = self.quantitative_features
+        metadata['CV Partitions'] = self.n_splits
+        metadata['Partition Method'] = self.partition_method
+        metadata['Categorical Cutoff'] = self.categorical_cutoff
+        metadata['Statistical Significance Cutoff'] = self.sig_cutoff
+        metadata['Engineering Missingness Cutoff'] = self.featureeng_missingness
+        metadata['Cleaning Missingness Cutoff'] = self.cleaning_missingness
+        metadata['Correlation Removal Threshold'] = self.correlation_removal_threshold
+        metadata['List of Exploratory Analysis Ran'] = self.exploration_list
+        metadata['List of Exploratory Plots Saved'] = self.plot_list
+        metadata['Random Seed'] = self.random_state
+        metadata['Run From Notebook'] = self.show_plots  # the 'Run From Notebook' entry stores the show_plots value
+        # Pickle the metadata for future use
+        pickle_out = open(self.output_path + '/' + self.experiment_name + '/' + "metadata.pickle", 'wb')  # NOTE(review): not a with-block, so the handle stays open if pickle.dump raises
+        pickle.dump(metadata, pickle_out)
+        pickle_out.close()
+
+
 def run_cluster(argv):
     param_path = argv[1]
     with open(param_path, "rb") as input_file:
         params = pickle.load(input_file)
     globals().update(params)
+    try:  # first attempt: build the Dataset directly from the unpickled parameters
+        dataset = Dataset(dataset_path, outcome_label, match_label, instance_label, outcome_type)
+    except Exception:  # NOTE(review): any failure of Dataset(...) is swallowed here without logging
+        dataset = (dataset_path, outcome_label, match_label, instance_label, outcome_type)  # fallback: hand DataProcess the raw 5-tuple; this patch makes DataProcess.__init__ unpack it
+        save_metadata(dotdict(params))  # metadata.pickle is written only on this fallback path; NOTE(review): save_metadata reads .data_path while this function uses dataset_path, confirm params carries both keys
 
-
-    dataset = Dataset(dataset_path, outcome_label, match_label, instance_label, outcome_type)
     eda_obj = DataProcess(dataset, output_path + '/' + experiment_name,
                           ignore_features,
                           categorical_features, quantitative_features, exclude_eda_output,
                           categorical_cutoff, sig_cutoff, featureeng_missingness,
                           cleaning_missingness, correlation_removal_threshold, partition_method, n_splits,
                           random_state)
     eda_obj.run(top_features)
+
 
 
 if __name__ == "__main__":

diff --git a/streamline/legacy/FImpJobSubmit.py b/streamline/legacy/FImpJobSubmit.py
@@ -1,5 +1,6 @@
 import os
 import sys
+import pickle
 from pathlib import Path
 
 SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
@@ -9,16 +10,10 @@
 
 
 def run_cluster(argv):
-    cv_train_path = argv[1]
-    experiment_path = argv[2]
-    outcome_label = argv[3]
-    instance_label = argv[4] if argv[4] != "None" else None
-    instance_subset = None if argv[5] == "None" else eval(argv[5])
-    algorithm = argv[6]
-    use_turf = eval(argv[7])
-    turf_pct = eval(argv[8])
-    random_state = None if argv[9] == "None" else int(argv[9])
-    n_jobs = None if argv[10] == "None" else int(argv[10])
+    param_path = argv[1]
+    with open(param_path, "rb") as input_file:
+        params = pickle.load(input_file)
+    globals().update(params)
 
     job_obj = FeatureImportance(cv_train_path, experiment_path, outcome_label,
                                 instance_label, instance_subset, algorithm,

diff --git a/streamline/legacy/FSelJobSubmit.py b/streamline/legacy/FSelJobSubmit.py
@@ -1,5 +1,6 @@
 import os
 import sys
+import pickle
 from pathlib import Path
 
 SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
@@ -9,18 +10,10 @@
 
 
 def run_cluster(argv):
-    full_path = argv[1]
-    n_datasets = int(argv[2])
-    MI, MS = "MI", "MS"
-    algorithms = None if argv[3] == "None" else eval(argv[3])
-    print(algorithms)
-    outcome_label = argv[4]
-    instance_label = argv[5] if argv[5] != "None" else None
-    export_scores = eval(argv[6])
-    top_features = int(argv[7])
-    max_features_to_keep = int(argv[8])
-    filter_poor_features = eval(argv[9])
-    overwrite_cv = eval(argv[10])
+    param_path = argv[1]
+    with open(param_path, "rb") as input_file:
+        params = pickle.load(input_file)
+    globals().update(params)
 
     job_obj = FeatureSelection(full_path, n_datasets, algorithms,
                                outcome_label, instance_label, export_scores,

diff --git a/streamline/legacy/ModelJobSubmit.py b/streamline/legacy/ModelJobSubmit.py
@@ -7,43 +7,56 @@
 sys.path.append(str(Path(SCRIPT_DIR).parent.parent))
 
 from streamline.modeling.modeljob import ModelJob
-from streamline.modeling.utils import model_str_to_obj
 from streamline.modeling.utils import get_fi_for_ExSTraCS
 
 
 def run_cluster(argv):
-    full_path = argv[1]
-    output_path = argv[2]
-    experiment_name = argv[3]
-    cv_count = int(argv[4])
-    outcome_label = argv[5]
-    outcome_type = argv[6]
-    instance_label = argv[7] if argv[7] != "None" else None
-    scoring_metric = argv[8]
-    metric_direction = argv[9]
-    n_trials = int(argv[10])
-    timeout = int(argv[11])
-    training_subsample = int(argv[12])
-    uniform_fi = eval(argv[13])
-    save_plot = eval(argv[14])
-    random_state = None if argv[15] == "None" else int(argv[15])
-    algorithm = argv[16]
-    n_jobs = None if argv[17] == "None" else int(argv[17])
-    do_lcs_sweep = eval(argv[18])
-    lcs_iterations = int(argv[19])
-    lcs_n = int(argv[20])
-    lcs_nu = int(argv[21])
+    param_path = argv[1]
+    with open(param_path, "rb") as input_file:
+        params = pickle.load(input_file)
+    globals().update(params)
+    print(params)
+    print(vars())
 
+    # if outcome_type == "Binary":
+    #     with GlobalImport() as gi:
+    #         from streamline.modeling.classification_utils import model_str_to_obj
+    #         gi()
+
+    # elif outcome_type == "Continuous":
+    #     if scoring_metric == 'balanced_accuracy':
+    #         scoring_metric = 'explained_variance'
+    #     with GlobalImport() as gi:
+    #         from streamline.modeling.regression_utils import model_str_to_obj
+    #         gi()
+    # elif outcome_type == "Multiclass":
+    #     # logging.info("Using Multiclass Classification Models")
+    #     with GlobalImport() as gi:
+    #         from streamline.modeling.multiclass_utils import model_str_to_obj
+    #         gi()
+    # else:
+    #     raise Exception("Unknown Outcome Type:" + str(outcome_type))
+
     file = open(output_path + '/' + experiment_name + '/' + "metadata.pickle", 'rb')
     metadata = pickle.load(file)
     filter_poor_features = metadata['Filter Poor Features']
+    outcome_type = metadata['Outcome Type']
     file.close()
-
     dataset_directory_path = full_path.split('/')[-1]
 
+    if outcome_type == "Binary":
+        from streamline.modeling.classification_utils import model_str_to_obj
+    elif outcome_type == "Multiclass":
+        from streamline.modeling.multiclass_utils import model_str_to_obj
+    elif outcome_type == "Continuous":
+        from streamline.modeling.regression_utils import model_str_to_obj
+    else:
+        raise Exception("Unknown Outcome Type:" + str(outcome_type))
+
     job_obj = ModelJob(full_path, output_path, experiment_name, cv_count, outcome_label,
                        instance_label, scoring_metric, metric_direction, n_trials,
-                       timeout, training_subsample, uniform_fi, save_plot, random_state)
+                       timeout, training_subsample, uniform_fi, save_plots, random_state)
+
 
     if algorithm not in ['eLCS', 'XCS', 'ExSTraCS']:
         model = model_str_to_obj(algorithm)(cv_folds=3,

diff --git a/streamline/legacy/RepJobSubmit.py b/streamline/legacy/RepJobSubmit.py
@@ -10,45 +10,26 @@
 
 
 def run_cluster(argv):
-    dataset_filename = argv[1]
-    dataset_for_rep = argv[2]
-    full_path = argv[3]
-    outcome_label = argv[4]
-    outcome_type = argv[5]
-    instance_label = argv[6] if argv[6] != "None" else None
-    match_label = argv[7] if argv[7] != "None" else None
-    experiment_path = '/'.join(full_path.split('/')[:-1])
-    file = open(experiment_path + '/' + "algInfo.pickle", 'rb')
-    alg_info = pickle.load(file)
-    file.close()
-    temp_algo = []
-    for key in alg_info:
-        if alg_info[key][0]:
-            temp_algo.append(key)
-    algorithms = temp_algo
-    file = open(experiment_path + '/' + "metadata.pickle", 'rb')
-    metadata = pickle.load(file)
-    file.close()
-    ignore_features = metadata['Ignored Features']
-    exclude = None
-    len_cv = int(argv[9])
-    if argv != 'None':
-        exclude_options = argv[10].split(',')
-        exclude_options = [x.strip() for x in exclude_options]
-    else:
-        exclude_options = None
-    categorical_cutoff = int(argv[11]) if argv[11] != "None" else None
-    sig_cutoff = float(argv[12]) if argv[12] != "None" else None
-    scale_data = eval(argv[13])
-    impute_data = eval(argv[14])
-    multi_impute = eval(argv[15])
-    show_plots = eval(argv[16])
-    scoring_metric = argv[17]
-    random_state = eval(argv[18])
+    param_path = argv[1]
+    with open(param_path, "rb") as input_file:
+        params = pickle.load(input_file)
+    globals().update(params)
+    # file = open(experiment_path + '/' + "algInfo.pickle", 'rb')
+    # alg_info = pickle.load(file)
+    # file.close()
+    # temp_algo = []
+    # for key in alg_info:
+    #     if alg_info[key][0]:
+    #         temp_algo.append(key)
+    # algorithms = temp_algo
+    # file = open(experiment_path + '/' + "metadata.pickle", 'rb')
+    # metadata = pickle.load(file)
+    # file.close()
+    # ignore_features = metadata['Ignored Features']
 
     job_obj = ReplicateJob(dataset_filename, dataset_for_rep, full_path, outcome_label, outcome_type, instance_label,
-                           match_label, ignore_features, len_cv,
-                           exclude_options,
+                           match_label, ignore_features, cv_partitions,
+                           exclude_plots,
                            categorical_cutoff, sig_cutoff, scale_data, impute_data,
                            multi_impute, show_plots, scoring_metric, random_state)
     job_obj.run()

diff --git a/streamline/legacy/ReportJobSubmit.py b/streamline/legacy/ReportJobSubmit.py
@@ -10,12 +10,10 @@
 
 
 def run_cluster(argv):
-    output_path = argv[1]
-    experiment_name = argv[2]
-    experiment_path = None
-    training = eval(argv[3])
-    train_data_path = None if argv[4] == "None" else argv[4]
-    rep_data_path = None if argv[5] == "None" else argv[5]
+    param_path = argv[1]
+    with open(param_path, "rb") as input_file:
+        params = pickle.load(input_file)
+    globals().update(params)
 
     job_obj = ReportJob(output_path, experiment_name, experiment_path,
                         training, train_data_path, rep_data_path)

diff --git a/streamline/legacy/StatsJobSubmit.py b/streamline/legacy/StatsJobSubmit.py
@@ -10,27 +10,14 @@
 
 
 def run_cluster(argv):
-    full_path = argv[1]
-    experiment_path = '/'.join(full_path.split('/')[:-1])
-    outcome_label = argv[2]
-    outcome_type = argv[3]
-    instance_label = argv[4] if argv[4] != "None" else None
-    scoring_metric = argv[5]
-    len_cv = int(argv[6])
-    top_features = int(argv[7]) if argv[7] != "None" else None
-    sig_cutoff = float(argv[8]) if argv[8] != "None" else None
-    metric_weight = argv[9] if argv[9] != "None" else None
-    scale_data = eval(argv[10])
-    if argv[11] != 'None':
-        exclude_options = argv[11].split(',')
-        exclude_options = [x.strip() for x in exclude_options]
-    else:
-        exclude_options = None
-    show_plots = eval(argv[12])
+    param_path = argv[1]
+    with open(param_path, "rb") as input_file:
+        params = pickle.load(input_file)
+    globals().update(params)
 
     job_obj = StatsJob(full_path, outcome_label, outcome_type, instance_label, scoring_metric,
                        len_cv, top_features, sig_cutoff, metric_weight, scale_data,
-                       exclude_options,
+                       exclude_plots,
                        show_plots)
     job_obj.run()