Commit

added items as per Next Priority Updates doc
raptor419 committed May 21, 2024
1 parent b60ca38 commit 7b04f70
Showing 7 changed files with 112 additions and 26 deletions.
25 changes: 23 additions & 2 deletions streamline/featurefns/load_algorithms.py
@@ -2,17 +2,38 @@
import logging
from pathlib import Path


def load_class_from_folder(path=None):
"""
Load classes from a specified folder. If no path is provided,
defaults to the 'featurefns/algorithms' directory in the parent folder.
Args:
path (str): The path to the folder from which to load the classes.
Returns:
list: Sorted list of loaded classes based on their model_name attribute.
"""
if path is None:
# Default path to 'featurefns/algorithms' directory in the parent folder of the current file
path = os.path.join(Path(__file__).parent.parent, 'featurefns/algorithms')

classes = list()

# Iterate over Python files in the specified directory, excluding '__init__.py'
for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']:
# Dynamically import the module
mod = __import__('.'.join(['streamline.featurefns.algorithms', py]), fromlist=[py])

# Retrieve all class objects defined in the module
classes_list = [getattr(mod, x) for x in dir(mod) if isinstance(getattr(mod, x), type)]

# Filter classes: include those in 'streamline' but exclude 'FeatureAlgorithm'
for cls in classes_list:
if ('streamline' in str(cls)) and not ('FeatureAlgorithm' in str(cls)):
classes.append(cls)

# Optional logging for debugging purposes
# logging.warning(classes)
return sorted(classes, key=lambda x: x.model_name)

# Return the classes sorted by their model_name attribute
return sorted(classes, key=lambda x: x.model_name)
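
As a usage note (a sketch, not part of this commit): the loader is normally consumed through the module-level constants in streamline/featurefns/utils.py below, but it can also be called directly.

from streamline.featurefns.load_algorithms import load_class_from_folder

feature_algorithms = load_class_from_folder()            # defaults to featurefns/algorithms
print([cls.model_name for cls in feature_algorithms])    # classes come back sorted by model_name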
54 changes: 46 additions & 8 deletions streamline/featurefns/selection.py
@@ -6,7 +6,7 @@
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statistics import median
from statistics import median, mean
from streamline.utils.job import Job
from streamline.featurefns.utils import SUPPORTED_ALGORITHM_OBJ
import seaborn as sns
@@ -164,11 +164,49 @@ def report_ave_fs(self, algorithm, algorithmlabel,
# stores sorted feature importance dictionaries for all algorithms and CVs
meta_feature_ranks[algorithm] = feature_name_ranks

# Generate barplot of average scores------------------------------------------------------------------------
# Generate boxplot and barplot of average scores------------------------------------------------------------------------
if self.export_scores:
ns = self.export_fsel_plots(algorithm, algorithmlabel, cv_score_dict.copy(), 'Mean')
ns = self.export_fsel_plots(algorithm, algorithmlabel, cv_score_dict.copy(), 'Median')
self.export_fsel_boxplots(algorithm, algorithmlabel, cv_score_dict.copy(), order=list(ns['Names']))
return selected_feature_lists, meta_feature_ranks

def export_fsel_boxplots(self, algorithm, algorithmlabel, cv_score_dict, order=None):
boxplot_df = pd.DataFrame(cv_score_dict)
if order is not None:
boxplot_df = boxplot_df[order]
boxplot_df.boxplot(fontsize=8, rot=90)
plt.xlabel('Features')
algorithm_name = ""
if algorithm == "MI":
algorithm_name = "Mutual Information"
elif algorithm == "MS":
algorithm_name = "MultiSURF"
elif algorithm == "MSS":
algorithm_name = "MultiSURFstar"
plt.ylabel(str(algorithm_name) + ' Score')
plt.xticks(np.arange(1, len(boxplot_df.columns) + 1), boxplot_df.columns, rotation=90)  # pandas boxplots place boxes at positions 1..n
plt.title(str(algorithm_name) + ' Score Boxplots')
logging.info("Saved Feature Importance Plots at")
logging.info(self.full_path + "/feature_selection/" + algorithmlabel + "/ScoresBoxplots.png")
plt.savefig((self.full_path + "/feature_selection/" + algorithmlabel + "/ScoresBoxplots.png"),
bbox_inches="tight")
if self.show_plots:
plt.show()
else:
plt.close('all')
plt.cla() # not required
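
For context, a minimal sketch of the input this method expects (feature names and scores are hypothetical): cv_score_dict maps each feature to its scores across CV partitions, so boxplot_df gets one column per feature and one row per partition, and the optional order argument simply re-orders those columns.

import pandas as pd

cv_score_dict = {'feat_a': [0.10, 0.12, 0.08],   # hypothetical scores, one per CV partition
                 'feat_b': [0.30, 0.28, 0.31]}
boxplot_df = pd.DataFrame(cv_score_dict)         # columns = features, rows = CV partitions
boxplot_df = boxplot_df[['feat_b', 'feat_a']]    # what passing order=['feat_b', 'feat_a'] does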

def export_fsel_plots(self, algorithm, algorithmlabel, cv_score_dict, metric='Mean'):


# Get the mean or median score for each feature
if metric == 'Mean':
metric_fn = mean
elif metric == 'Median':
metric_fn = median
for v in cv_score_dict:
cv_score_dict[v] = median(cv_score_dict[v])
cv_score_dict[v] = metric_fn(cv_score_dict[v])
df_string = pd.DataFrame(cv_score_dict.items(), columns=['Feature', 'Importance'])\
.sort_values('Importance', ascending=False).head(10).to_string()
logging.info(df_string)
@@ -202,19 +240,19 @@ def report_ave_fs(self, algorithm, algorithmlabel,
algorithm_name = "MultiSURF"
elif algorithm == "MSS":
algorithm_name = "MultiSURFstar"
plt.xlabel(str(algorithm_name) + ' Median Score')
plt.xlabel(str(algorithm_name) + ' ' + metric + ' Score')
plt.yticks(np.arange(len(ns['Names'])), ns['Names'])
plt.title('Sorted Median ' + str(algorithm_name) + ' Scores')
plt.title('Sorted ' + metric + ' ' + str(algorithm_name) + ' Scores')
logging.info("Saved Feature Importance Plots at")
logging.info(self.full_path + "/feature_selection/" + algorithmlabel + "/TopAverageScores.png")
plt.savefig((self.full_path + "/feature_selection/" + algorithmlabel + "/TopAverageScores.png"),
logging.info(self.full_path + "/feature_selection/" + algorithmlabel + "/Top" + metric + "Scores.png")
plt.savefig((self.full_path + "/feature_selection/" + algorithmlabel + "/Top" + metric + "Scores.png"),
bbox_inches="tight")
if self.show_plots:
plt.show()
else:
plt.close('all')
# plt.cla() # not required
return selected_feature_lists, meta_feature_ranks
return ns
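
A rough sketch of how the two export helpers fit together (values are made up): export_fsel_plots collapses each feature's CV scores with mean or median via metric_fn, plots the top features, and returns the sorted table ns, whose 'Names' column then fixes the column order of the boxplots above.

from statistics import mean, median

cv_score_dict = {'feat_a': [0.10, 0.12, 0.08], 'feat_b': [0.30, 0.28, 0.31]}   # hypothetical
metric_fn = mean                                        # or median, per the 'metric' argument
reduced = {k: metric_fn(v) for k, v in cv_score_dict.items()}
# reduced is approximately {'feat_a': 0.10, 'feat_b': 0.297}; these per-feature values drive the bar plot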

def select_features(self, selected_feature_lists, max_features_to_keep, meta_feature_ranks):
"""
22 changes: 18 additions & 4 deletions streamline/featurefns/utils.py
@@ -1,26 +1,40 @@
import os
from streamline.featurefns.load_algorithms import load_class_from_folder

# Load all algorithm classes from the specified folder
SUPPORTED_ALGORITHM_OBJ = load_class_from_folder()

# Extract the model names from the loaded algorithm objects
SUPPORTED_ALGORITHM = [m.model_name for m in SUPPORTED_ALGORITHM_OBJ]

# Extract the small names (short names) from the loaded algorithm objects
SUPPORTED_ALGORITHM_SMALL = [m.small_name for m in SUPPORTED_ALGORITHM_OBJ]

# Create a dictionary mapping both model names and small names to their respective algorithm objects
ALGORITHM_DICT = dict(zip(SUPPORTED_ALGORITHM + SUPPORTED_ALGORITHM_SMALL,
SUPPORTED_ALGORITHM_OBJ + SUPPORTED_ALGORITHM_OBJ))
SUPPORTED_ALGORITHM_OBJ + SUPPORTED_ALGORITHM_OBJ))

# Create a dictionary mapping both model names and small names to themselves (for label checking)
LABELS = dict(zip(SUPPORTED_ALGORITHM + SUPPORTED_ALGORITHM_SMALL,
SUPPORTED_ALGORITHM + SUPPORTED_ALGORITHM))
SUPPORTED_ALGORITHM + SUPPORTED_ALGORITHM_SMALL))


def is_supported_algorithm(string):
"""
Check if the given string corresponds to a supported algorithm.
If it is supported, return the label.
If not, raise an exception.
"""
try:
return LABELS[string]
except KeyError:
raise Exception("Unknown Model")


def algorithm_str_to_obj(string):
assert is_supported_algorithm(string)
return ALGORITHM_DICT[string]
"""
Convert an algorithm name (or small name) to its corresponding algorithm object.
This function asserts that the given string is a supported algorithm.
"""
assert is_supported_algorithm(string) # Ensure the string is a supported algorithm
return ALGORITHM_DICT[string] # Return the corresponding algorithm object
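
A brief usage sketch of these helpers ('MI' is the short label used elsewhere in this commit, e.g. in test_classification.py, and is assumed here to be a registered small name):

from streamline.featurefns.utils import algorithm_str_to_obj, is_supported_algorithm

label = is_supported_algorithm("MI")    # returns the stored label, or raises Exception("Unknown Model")
algo_cls = algorithm_str_to_obj("MI")   # the corresponding algorithm class, e.g. Mutual Information
print(algo_cls.model_name)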
13 changes: 11 additions & 2 deletions streamline/modeling/load_models.py
@@ -3,6 +3,15 @@
from pathlib import Path

def load_class_from_folder(model_type="BinaryClassification"):
"""
Load classes from a specified folder based on the model type.
Args:
model_type (str): Type of model to load (BinaryClassification, MulticlassClassification, Regression).
Returns:
list: Sorted list of loaded classes based on their model_name attribute.
"""
folder_path, package_path = None, None

# Determine the folder path and package path based on the model type
@@ -18,9 +27,9 @@ def load_class_from_folder(model_type="BinaryClassification"):

classes = list()

# Iterate over Python files in the specified folder
# Iterate over Python files in the specified folder, excluding '__init__.py'
for py in [f[:-3] for f in os.listdir(folder_path) if f.endswith('.py') and f != '__init__.py']:
# Import the module dynamically
# Dynamically import the module
mod = __import__('.'.join([package_path, py]), fromlist=[py])
# Get all class objects defined in the module
classes_list = [getattr(mod, x) for x in dir(mod) if isinstance(getattr(mod, x), type)]
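
A side note on the dynamic import used by both loaders (an equivalent sketch, not a change made in this commit): __import__ with fromlist resolves the submodule the same way importlib.import_module does on the full dotted path.

import importlib

package_path, py = 'streamline.featurefns.algorithms', 'mutual_information'   # 'mutual_information' is a hypothetical module name
# Equivalent to: mod = __import__('.'.join([package_path, py]), fromlist=[py])
mod = importlib.import_module(package_path + '.' + py)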
4 changes: 2 additions & 2 deletions streamline/postanalysis/gererate_report.py
@@ -941,11 +941,11 @@ def job(self):
self.analysis_report.set_font(family='times', size=9)
self.analysis_report.image(
self.experiment_path + '/' + self.datasets[
k] + '/feature_selection/mutual_information/TopAverageScores.png',
k] + '/feature_selection/mutual_information/TopMeanScores.png',
5,
12, 100, 135) # Images adjusted to fit a width of 100 and length of 135
self.analysis_report.image(
self.experiment_path + '/' + self.datasets[k] + '/feature_selection/multisurf/TopAverageScores.png',
self.experiment_path + '/' + self.datasets[k] + '/feature_selection/multisurf/TopMeanScores.png',
105, 12,
100,
135)
18 changes: 11 additions & 7 deletions streamline/postanalysis/statistics.py
@@ -550,9 +550,10 @@ def primary_stats_regression(self, master_list=None):
results = {'Max Error': mes, 'Mean Absolute Error': maes, 'Mean Squared Error': mses,
'Median Absolute Error': mdaes, 'Explained Variance': evss, 'Pearson Correlation': corrs}
dr = pd.DataFrame(results)
dr.index = list(range(0, self.cv_partitions))
dr.index.name = 'CV #'
filepath = self.full_path + '/model_evaluation/' + self.abbrev[algorithm] + "_performance.csv"
dr.to_csv(filepath, header=True, index=False)
metric_dict[algorithm] = results
dr.to_csv(filepath, header=True, index=True)
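
A sketch of what the new indexing produces (metric values are made up): naming the index 'CV #' and writing with index=True makes the partition number the first column of each per-algorithm performance CSV.

import pandas as pd

results = {'Max Error': [1.92, 2.07], 'Mean Absolute Error': [0.41, 0.38]}   # made-up values for 2 CV partitions
dr = pd.DataFrame(results)
dr.index = list(range(0, 2))
dr.index.name = 'CV #'
dr.to_csv('example_performance.csv', header=True, index=True)   # first CSV column is now 'CV #'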

# Save Average FI Stats
if master_list is None:
@@ -702,8 +703,10 @@ def primary_stats_multiclass(self, master_list=None, rep_data=None):
'Precision (PPV)': s_pr, 'ROC AUC': aucs, 'PRC AUC': praucs,
'PRC APS': aveprecs}
dr = pd.DataFrame(results)
dr.index = list(range(0, self.cv_partitions))
dr.index.name = 'CV #'
filepath = self.full_path + '/model_evaluation/' + self.abbrev[algorithm] + "_performance.csv"
dr.to_csv(filepath, header=True, index=False)
dr.to_csv(filepath, header=True, index=True)
metric_dict[algorithm] = results

# Save Median FI Stats
@@ -866,9 +869,10 @@ def primary_stats_classification(self, master_list=None, rep_data=None):
'NPV': s_npv, 'LR+': s_lrp, 'LR-': s_lrm, 'ROC AUC': aucs, 'PRC AUC': praucs,
'PRC APS': aveprecs}
dr = pd.DataFrame(results)
dr.index = list(range(0, self.cv_partitions))
dr.index.name = 'CV #'
filepath = self.full_path + '/model_evaluation/' + self.abbrev[algorithm] + "_performance.csv"
dr.to_csv(filepath, header=True, index=False)
metric_dict[algorithm] = results
dr.to_csv(filepath, header=True, index=True)

# Save FI scores for all CV models
if master_list is None:
@@ -978,7 +982,7 @@ def do_model_prc(self, algorithm, precs, praucs, mean_recall, alg_result_table,

no_skill = len(test_y[test_y == 1]) / len(test_y) # Fraction of cases
# Plot no-skill line
plt.plot([0, 1], [no_skill, no_skill], color='black', linestyle='--', label='No-Skill', alpha=.8)
plt.plot([0, 1], [no_skill, no_skill], color='black', linestyle='--', label='No-Skill (cutoff = %0.3f)' % (no_skill), alpha=.8)
# Plot average line for all CVs
std_pr_auc = np.std(praucs)
plt.plot(mean_recall, mean_prec, color=self.colors[algorithm],
@@ -1062,7 +1066,7 @@ def do_plot_prc(self, result_table, rep_data=None, replicate=False):
no_skill = len(test_y[test_y == 1]) / len(test_y) # Fraction of cases

# Plot no-skill line
plt.plot([0, 1], [no_skill, no_skill], color='black', linestyle='--', label='No-Skill', alpha=.8)
plt.plot([0, 1], [no_skill, no_skill], color='black', linestyle='--', label='No-Skill (cutoff = %0.3f)' % (no_skill), alpha=.8)
# Specify plot axes,labels, and legend
plt.xticks(np.arange(0.0, 1.1, step=0.1))
plt.xlabel("Recall (Sensitivity)", fontsize=15)
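
For reference, a small sketch of the no-skill baseline that the updated labels now report (test_y values are illustrative): for a precision-recall curve, the no-skill precision equals the fraction of positive cases in the evaluation set.

import numpy as np

test_y = np.array([0, 0, 0, 1, 0, 1, 0, 0, 0, 1])            # illustrative labels, 3 of 10 positive
no_skill = len(test_y[test_y == 1]) / len(test_y)             # 0.3
print('No-Skill (cutoff = %0.3f)' % no_skill)                 # matches the new legend label format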
2 changes: 1 addition & 1 deletion streamline/tests/test_classification.py
@@ -15,7 +15,7 @@

# pytest.skip("Tested Already", allow_module_level=True)

algorithms, run_parallel, output_path = ["MI", "MS", "MSS"], False, "./tests/"
algorithms, run_parallel, output_path = ["MI", "MS"], False, "./tests/"
dataset_path, experiment_name = "./data/DemoData/", "demo"
model_algorithms = ["LR", "DT", "NB"]
rep_data_path = "./data/DemoRepData/"
