Commit
Added comments to the basemodels, submodels, modelutils (“modeling” folder) and legacy phase of streamline (in "legacy" folder)
raptor419 committed May 17, 2024
1 parent cbb01d7 commit fb67832
Showing 18 changed files with 367 additions and 237 deletions.
13 changes: 11 additions & 2 deletions streamline/legacy/CompareJobSubmit.py
@@ -3,22 +3,31 @@
import pickle
from pathlib import Path

# Determine the directory where the script is located
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
# Add the grandparent directory to the system path to allow importing modules from there
sys.path.append(str(Path(SCRIPT_DIR).parent.parent))

# Import the CompareJob class from the specified module
from streamline.postanalysis.dataset_compare import CompareJob


def run_cluster(argv):
# The first argument is expected to be the path to a parameters file
param_path = argv[1]

# Load the parameters from the specified file using pickle
with open(param_path, "rb") as input_file:
params = pickle.load(input_file)

# Update the global namespace with the loaded parameters
globals().update(params)

# Instantiate the CompareJob class with the loaded parameters
job_obj = CompareJob(output_path, experiment_name, experiment_path,
outcome_label, outcome_type, instance_label, sig_cutoff, show_plots)
# Run the job
job_obj.run()


# If the script is run as the main module, execute the run_cluster function
if __name__ == "__main__":
sys.exit(run_cluster(sys.argv))
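
Note: each of these legacy *JobSubmit scripts follows the same calling convention: the run driver pickles a dictionary of job parameters and launches the script with the pickle's path as its only argument, and globals().update(params) then exposes each key as a module-level name. A minimal sketch of that convention, assuming hypothetical file names and parameter values (only the key names are taken from the CompareJob call above):

import pickle
import subprocess

# Hypothetical parameter dictionary; the keys mirror the names CompareJob expects.
params = {
    "output_path": "results",
    "experiment_name": "demo_experiment",
    "experiment_path": "results/demo_experiment",
    "outcome_label": "Class",
    "outcome_type": "Binary",
    "instance_label": None,
    "sig_cutoff": 0.05,
    "show_plots": False,
}

# Write the parameters where the job script can find them (path is illustrative).
with open("compare_job_params.pickle", "wb") as f:
    pickle.dump(params, f)

# The script loads the pickle, injects the keys into its globals, and runs the job.
subprocess.run(["python", "streamline/legacy/CompareJobSubmit.py",
                "compare_job_params.pickle"], check=True)
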
14 changes: 13 additions & 1 deletion streamline/legacy/DataJobSubmit.py
@@ -3,26 +3,38 @@
import pickle
from pathlib import Path

# Determine the directory where the script is located
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
# Add the grandparent directory of the script to the system path
# This allows importing modules from two levels up
sys.path.append(str(Path(SCRIPT_DIR).parent.parent))

# Import the ScaleAndImpute class from the streamline.dataprep.scale_and_impute module
from streamline.dataprep.scale_and_impute import ScaleAndImpute


def run_cluster(argv):
# Get the path to the parameter file from the command line arguments
param_path = argv[1]
# Open the parameter file in binary read mode
with open(param_path, "rb") as input_file:
# Load the parameters from the file using pickle
params = pickle.load(input_file)
# Update the global variables with the parameters from the file
globals().update(params)
# Construct the full output path for the experiment
full_path = output_path + "/" + experiment_name


# Create an instance of the ScaleAndImpute class with the loaded parameters
job_obj = ScaleAndImpute(cv_train_path, cv_test_path,
full_path,
scale_data, impute_data, multi_impute, overwrite_cv,
outcome_label, instance_label, random_state)
# Run the scaling and imputation process
job_obj.run()


if __name__ == "__main__":
# Execute the run_cluster function with command line arguments
# and exit the program with the return value of run_cluster
sys.exit(run_cluster(sys.argv))
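
Note: full_path above is built by string concatenation with a hard-coded "/" separator. An equivalent, platform-neutral form using pathlib (a sketch, not part of this commit) would be:

from pathlib import Path

output_path = "results"              # illustrative values; in the script these come
experiment_name = "demo_experiment"  # from the pickled parameter dictionary
# Join the experiment directory onto the output path without hard-coding separators.
full_path = str(Path(output_path) / experiment_name)
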
71 changes: 43 additions & 28 deletions streamline/legacy/EDAJobSubmit.py
@@ -3,68 +3,83 @@
import pickle
from pathlib import Path

# Determine the directory where the script is located
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
# Add the grandparent directory of the script to the system path
# This allows importing modules from two levels up
sys.path.append(str(Path(SCRIPT_DIR).parent.parent))

# Import Dataset and DataProcess classes from the streamline package
from streamline.utils.dataset import Dataset
from streamline.dataprep.data_process import DataProcess

# Define a custom dictionary class with dot notation access for attributes
class dotdict(dict):
def __getattr__(self, key):
return self[key]

def __setattr__(self, key, value):
self[key] = value
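
# Illustrative usage (not part of this script): dotdict forwards attribute access
# to dictionary keys, which lets the pickled parameter dict be passed to
# save_metadata below and read via self.<key>:
#     d = dotdict({'output_path': 'results'})
#     d.output_path == d['output_path']  # True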

# Standalone helper that saves data-processing metadata to a pickle file; self is
# expected to be a dotdict of the loaded parameters (see the save_metadata(dotdict(params))
# call in run_cluster below)
def save_metadata(self):
metadata = dict()
# Populate the metadata dictionary with relevant attributes
metadata['Data Path'] = self.data_path
metadata['Output Path'] = self.output_path
metadata['Experiment Name'] = self.experiment_name
metadata['Outcome Label'] = self.outcome_label
metadata['Outcome Type'] = self.outcome_type
metadata['Instance Label'] = self.instance_label
metadata['Match Label'] = self.match_label
metadata['Ignored Features'] = self.ignore_features
metadata['Specified Categorical Features'] = self.categorical_features
metadata['Specified Quantitative Features'] = self.quantitative_features
metadata['CV Partitions'] = self.n_splits
metadata['Partition Method'] = self.partition_method
metadata['Categorical Cutoff'] = self.categorical_cutoff
metadata['Statistical Significance Cutoff'] = self.sig_cutoff
metadata['Engineering Missingness Cutoff'] = self.featureeng_missingness
metadata['Cleaning Missingness Cutoff'] = self.cleaning_missingness
metadata['Correlation Removal Threshold'] = self.correlation_removal_threshold
metadata['List of Exploratory Analysis Ran'] = self.exploration_list
metadata['List of Exploratory Plots Saved'] = self.plot_list
metadata['Random Seed'] = self.random_state
metadata['Run From Notebook'] = self.show_plots
# Pickle the metadata for future use
pickle_out = open(self.output_path + '/' + self.experiment_name + '/' + "metadata.pickle", 'wb')
pickle.dump(metadata, pickle_out)
pickle_out.close()

# Main function to run clustering analysis
def run_cluster(argv):
# Get the path to the parameter file from the command line arguments
param_path = argv[1]
# Open the parameter file in binary read mode
with open(param_path, "rb") as input_file:
# Load the parameters from the file using pickle
params = pickle.load(input_file)
# Update the global variables with the parameters from the file
globals().update(params)
try:
# Try to create a Dataset object with the loaded parameters
dataset = Dataset(dataset_path, outcome_label, match_label, instance_label, outcome_type)
except Exception:
# If an exception occurs, create a dataset tuple and save the metadata
dataset = (dataset_path, outcome_label, match_label, instance_label, outcome_type)
save_metadata(dotdict(params))

# Create a DataProcess object with the dataset and other parameters
eda_obj = DataProcess(dataset, output_path + '/' + experiment_name,
ignore_features,
categorical_features, quantitative_features, exclude_eda_output,
categorical_cutoff, sig_cutoff, featureeng_missingness,
cleaning_missingness, correlation_removal_threshold, partition_method, n_splits,
random_state)
# Run the data processing task
eda_obj.run(top_features)



# If the script is executed directly, run the run_cluster function with command line arguments
if __name__ == "__main__":
sys.exit(run_cluster(sys.argv))
14 changes: 12 additions & 2 deletions streamline/legacy/FImpJobSubmit.py
@@ -3,23 +3,33 @@
import pickle
from pathlib import Path

# Determine the directory where the script is located
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
# Add the grandparent directory of the script to the system path
# This allows importing modules from two levels up
sys.path.append(str(Path(SCRIPT_DIR).parent.parent))

# Import the FeatureImportance class from the streamline.featurefns.importance module
from streamline.featurefns.importance import FeatureImportance


# Main function to run the feature importance analysis
def run_cluster(argv):
# Get the path to the parameter file from the command line arguments
param_path = argv[1]
# Open the parameter file in binary read mode
with open(param_path, "rb") as input_file:
# Load the parameters from the file using pickle
params = pickle.load(input_file)
# Update the global variables with the parameters from the file
globals().update(params)

# Create an instance of the FeatureImportance class with the loaded parameters
job_obj = FeatureImportance(cv_train_path, experiment_path, outcome_label,
instance_label, instance_subset, algorithm,
use_turf, turf_pct, random_state, n_jobs)
# Run the feature importance analysis
job_obj.run()


# If the script is executed directly, run the run_cluster function with command line arguments
if __name__ == "__main__":
sys.exit(run_cluster(sys.argv))
15 changes: 13 additions & 2 deletions streamline/legacy/FSelJobSubmit.py
@@ -3,24 +3,35 @@
import pickle
from pathlib import Path

# Determine the directory where the script is located
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
# Add the grandparent directory of the script to the system path
# This allows importing modules from two levels up
sys.path.append(str(Path(SCRIPT_DIR).parent.parent))

# Import the FeatureSelection class from the streamline.featurefns.selection module
from streamline.featurefns.selection import FeatureSelection


# Main function to run the feature selection process
def run_cluster(argv):
# Get the path to the parameter file from the command line arguments
param_path = argv[1]
# Open the parameter file in binary read mode
with open(param_path, "rb") as input_file:
# Load the parameters from the file using pickle
params = pickle.load(input_file)
# Update the global variables with the parameters from the file
globals().update(params)

# Create an instance of the FeatureSelection class with the loaded parameters
job_obj = FeatureSelection(full_path, n_datasets, algorithms,
outcome_label, instance_label, export_scores,
top_features, max_features_to_keep,
filter_poor_features, overwrite_cv)
# Run the feature selection process
job_obj.run()


# If the script is executed directly, run the run_cluster function with command line arguments
if __name__ == "__main__":
# Exit the script with the status code returned by run_cluster
sys.exit(run_cluster(sys.argv))
31 changes: 26 additions & 5 deletions streamline/legacy/ModelJobSubmit.py
@@ -3,26 +3,35 @@
import sys
from pathlib import Path

# Determine the directory where the script is located
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
# Add the grandparent directory of the script to the system path
# This allows importing modules from two levels up
sys.path.append(str(Path(SCRIPT_DIR).parent.parent))

# Import necessary classes and functions from the streamline package
from streamline.modeling.modeljob import ModelJob
from streamline.modeling.utils import get_fi_for_ExSTraCS


# Main function to run the model training and evaluation
def run_cluster(argv):
# Get the path to the parameter file from the command line arguments
param_path = argv[1]
# Open the parameter file in binary read mode
with open(param_path, "rb") as input_file:
# Load the parameters from the file using pickle
params = pickle.load(input_file)
# Update the global variables with the parameters from the file
globals().update(params)
print(params)
print(vars())

# Commented code for conditional imports based on outcome type with GlobalImport class
# Uncomment if needed for different outcome types
# if outcome_type == "Binary":
# with GlobalImport() as gi:
# from streamline.modeling.classification_utils import model_str_to_obj
# gi()

# elif outcome_type == "Continuous":
# if scoring_metric == 'balanced_accuracy':
# scoring_metric = 'explained_variance'
@@ -36,14 +45,17 @@ def run_cluster(argv):
# gi()
# else:
# raise Exception("Unknown Outcome Type:" + str(outcome_type))


# Load metadata from a previously saved pickle file
file = open(output_path + '/' + experiment_name + '/' + "metadata.pickle", 'rb')
metadata = pickle.load(file)
filter_poor_features = metadata['Filter Poor Features']
outcome_type = metadata['Outcome Type']
file.close()
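# An equivalent context-manager form of the metadata load above (illustrative,
# not part of this commit):
#     with open(output_path + '/' + experiment_name + '/' + "metadata.pickle", 'rb') as f:
#         metadata = pickle.load(f)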

dataset_directory_path = full_path.split('/')[-1]

# Import the appropriate model function based on the outcome type
if outcome_type == "Binary":
from streamline.modeling.classification_utils import model_str_to_obj
elif outcome_type == "Multiclass":
@@ -53,31 +65,37 @@
else:
raise Exception("Unknown Outcome Type:" + str(outcome_type))

# Create an instance of the ModelJob class with the loaded parameters
job_obj = ModelJob(full_path, output_path, experiment_name, cv_count, outcome_label,
instance_label, scoring_metric, metric_direction, n_trials,
timeout, training_subsample, uniform_fi, save_plots, random_state)


# Initialize the model based on the specified algorithm
if algorithm not in ['eLCS', 'XCS', 'ExSTraCS']:
# Standard model initialization
model = model_str_to_obj(algorithm)(cv_folds=3,
scoring_metric=scoring_metric,
metric_direction=metric_direction,
random_state=random_state,
cv=None, n_jobs=n_jobs)
else:
# Special handling for LCS algorithms
if algorithm == 'ExSTraCS':
# Get expert knowledge for ExSTraCS
expert_knowledge = get_fi_for_ExSTraCS(output_path, experiment_name,
dataset_directory_path,
outcome_label, instance_label, cv_count,
filter_poor_features)
if do_lcs_sweep:
# Initialize ExSTraCS with LCS sweep
model = model_str_to_obj(algorithm)(cv_folds=3,
scoring_metric=scoring_metric,
metric_direction=metric_direction,
random_state=random_state,
cv=None, n_jobs=n_jobs,
expert_knowledge=expert_knowledge)
else:
# Initialize ExSTraCS with specific parameters
model = model_str_to_obj(algorithm)(cv_folds=3,
scoring_metric=scoring_metric,
metric_direction=metric_direction,
@@ -87,6 +105,7 @@ def run_cluster(argv):
N=lcs_n, nu=lcs_nu,
expert_knowledge=expert_knowledge)
else:
# Initialize other LCS models
if do_lcs_sweep:
model = model_str_to_obj(algorithm)(cv_folds=3,
scoring_metric=scoring_metric,
@@ -101,8 +120,10 @@ def run_cluster(argv):
cv=None, n_jobs=n_jobs,
iterations=lcs_iterations,
N=lcs_n, nu=lcs_nu)

# Run the model job with the initialized model
job_obj.run(model)

# If the script is executed directly, run the run_cluster function with command line arguments
if __name__ == "__main__":
sys.exit(run_cluster(sys.argv))
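
Note: model_str_to_obj above resolves an algorithm name to a model wrapper class, which the script then instantiates with the tuning settings. A minimal, self-contained sketch of that name-to-class dispatch pattern (purely illustrative; the real mappings live in the streamline.modeling utility modules such as streamline.modeling.classification_utils, whose contents are not shown in this commit, and all names and values below are assumptions):

class _DummyModel:
    """Stand-in for a streamline model wrapper (illustrative only)."""
    def __init__(self, cv_folds=3, scoring_metric=None, metric_direction=None,
                 random_state=None, cv=None, n_jobs=None):
        self.cv_folds = cv_folds
        self.scoring_metric = scoring_metric
        self.metric_direction = metric_direction
        self.random_state = random_state
        self.n_jobs = n_jobs

# Hypothetical registry mapping algorithm names to wrapper classes.
MODEL_REGISTRY = {"Dummy": _DummyModel}

def model_str_to_obj(algorithm):
    # Resolve the algorithm name to its class, mirroring the
    # model_str_to_obj(algorithm)(cv_folds=3, ...) calls in the script above.
    try:
        return MODEL_REGISTRY[algorithm]
    except KeyError as err:
        raise Exception("Unknown Algorithm: " + str(algorithm)) from err

# Usage: look up the class by name, then instantiate it with the job's settings.
model = model_str_to_obj("Dummy")(cv_folds=3, scoring_metric="balanced_accuracy",
                                  metric_direction="maximize", random_state=42,
                                  cv=None, n_jobs=1)
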