Commit

added items as per Next Priority Updates doc
raptor419 committed May 21, 2024
1 parent b60ca38 commit 7b04f70
Showing 7 changed files with 112 additions and 26 deletions.
25 changes: 23 additions & 2 deletions streamline/featurefns/load_algorithms.py
@@ -2,17 +2,38 @@
import logging
from pathlib import Path


def load_class_from_folder(path=None):
"""
Load classes from a specified folder. If no path is provided,
defaults to the 'featurefns/algorithms' directory in the parent folder.
Args:
path (str): The path to the folder from which to load the classes.
Returns:
list: Sorted list of loaded classes based on their model_name attribute.
"""
if path is None:
# Default path to 'featurefns/algorithms' directory in the parent folder of the current file
path = os.path.join(Path(__file__).parent.parent, 'featurefns/algorithms')

classes = list()

# Iterate over Python files in the specified directory, excluding '__init__.py'
for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']:
# Dynamically import the module
mod = __import__('.'.join(['streamline.featurefns.algorithms', py]), fromlist=[py])

# Retrieve all class objects defined in the module
classes_list = [getattr(mod, x) for x in dir(mod) if isinstance(getattr(mod, x), type)]

# Filter classes: include those in 'streamline' but exclude 'FeatureAlgorithm'
for cls in classes_list:
if ('streamline' in str(cls)) and not ('FeatureAlgorithm' in str(cls)):
classes.append(cls)

# Optional logging for debugging purposes
# logging.warning(classes)
return sorted(classes, key=lambda x: x.model_name)

# Return the classes sorted by their model_name attribute
return sorted(classes, key=lambda x: x.model_name)
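
As a usage note (a sketch, not part of this commit): the loader is normally consumed through the module-level constants in streamline/featurefns/utils.py below, but it can also be called directly.

from streamline.featurefns.load_algorithms import load_class_from_folder

feature_algorithms = load_class_from_folder()            # defaults to featurefns/algorithms
print([cls.model_name for cls in feature_algorithms])    # classes come back sorted by model_name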
54 changes: 46 additions & 8 deletions streamline/featurefns/selection.py
@@ -6,7 +6,7 @@
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statistics import median
from statistics import median, mean
from streamline.utils.job import Job
from streamline.featurefns.utils import SUPPORTED_ALGORITHM_OBJ
import seaborn as sns
@@ -164,11 +164,49 @@ def report_ave_fs(self, algorithm, algorithmlabel,
# stores sorted feature importance dictionaries for all algorithms and CVs
meta_feature_ranks[algorithm] = feature_name_ranks

# Generate barplot of average scores------------------------------------------------------------------------
# Generate boxplot and barplot of average scores------------------------------------------------------------------------
if self.export_scores:
ns = self.export_fsel_plots(algorithm, algorithmlabel, cv_score_dict.copy(), 'Mean')
ns = self.export_fsel_plots(algorithm, algorithmlabel, cv_score_dict.copy(), 'Median')
self.export_fsel_boxplots(algorithm, algorithmlabel, cv_score_dict.copy(), order=list(ns['Names']))
return selected_feature_lists, meta_feature_ranks

def export_fsel_boxplots(self, algorithm, algorithmlabel, cv_score_dict, order=None):
boxplot_df = pd.DataFrame(cv_score_dict)
if order is not None:
boxplot_df = boxplot_df[order]
boxplot_df.boxplot(fontsize=8, rot=90)
plt.xlabel('Features')
algorithm_name = ""
if algorithm == "MI":
algorithm_name = "Mutual Information"
elif algorithm == "MS":
algorithm_name = "MultiSURF"
elif algorithm == "MSS":
algorithm_name = "MultiSURFstar"
plt.ylabel(str(algorithm_name) + ' Score')
plt.xticks(np.arange(1, len(boxplot_df.columns) + 1), boxplot_df.columns, rotation=90)  # pandas boxplots place boxes at positions 1..n
plt.title(str(algorithm_name) + ' Score Boxplots')
logging.info("Saved Feature Importance Plots at")
logging.info(self.full_path + "/feature_selection/" + algorithmlabel + "/ScoresBoxplots.png")
plt.savefig((self.full_path + "/feature_selection/" + algorithmlabel + "/ScoresBoxplots.png"),
bbox_inches="tight")
if self.show_plots:
plt.show()
else:
plt.close('all')
plt.cla() # not required
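
For context, a minimal sketch of the input this method expects (feature names and scores are hypothetical): cv_score_dict maps each feature to its scores across CV partitions, so boxplot_df gets one column per feature and one row per partition, and the optional order argument simply re-orders those columns.

import pandas as pd

cv_score_dict = {'feat_a': [0.10, 0.12, 0.08],   # hypothetical scores, one per CV partition
                 'feat_b': [0.30, 0.28, 0.31]}
boxplot_df = pd.DataFrame(cv_score_dict)         # columns = features, rows = CV partitions
boxplot_df = boxplot_df[['feat_b', 'feat_a']]    # what passing order=['feat_b', 'feat_a'] does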

def export_fsel_plots(self, algorithm, algorithmlabel, cv_score_dict, metric='Mean'):


# Get the mean or median score for each feature
if metric == 'Mean':
metric_fn = mean
elif metric == 'Median':
metric_fn = median
for v in cv_score_dict:
cv_score_dict[v] = median(cv_score_dict[v])
cv_score_dict[v] = metric_fn(cv_score_dict[v])
df_string = pd.DataFrame(cv_score_dict.items(), columns=['Feature', 'Importance'])\
.sort_values('Importance', ascending=False).head(10).to_string()
logging.info(df_string)
@@ -202,19 +240,19 @@ def report_ave_fs(self, algorithm, algorithmlabel,
algorithm_name = "MultiSURF"
elif algorithm == "MSS":
algorithm_name = "MultiSURFstar"
plt.xlabel(str(algorithm_name) + ' Median Score')
plt.xlabel(str(algorithm_name) + ' ' + metric + ' Score')
plt.yticks(np.arange(len(ns['Names'])), ns['Names'])
plt.title('Sorted Median ' + str(algorithm_name) + ' Scores')
plt.title('Sorted ' + metric + ' ' + str(algorithm_name) + ' Scores')
logging.info("Saved Feature Importance Plots at")
logging.info(self.full_path + "/feature_selection/" + algorithmlabel + "/TopAverageScores.png")
plt.savefig((self.full_path + "/feature_selection/" + algorithmlabel + "/TopAverageScores.png"),
logging.info(self.full_path + "/feature_selection/" + algorithmlabel + "/Top" + metric + "Scores.png")
plt.savefig((self.full_path + "/feature_selection/" + algorithmlabel + "/Top" + metric + "Scores.png"),
bbox_inches="tight")
if self.show_plots:
plt.show()
else:
plt.close('all')
# plt.cla() # not required
return selected_feature_lists, meta_feature_ranks
return ns
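
A rough sketch of how the two export helpers fit together (values are made up): export_fsel_plots collapses each feature's CV scores with mean or median via metric_fn, plots the top features, and returns the sorted table ns, whose 'Names' column then fixes the column order of the boxplots above.

from statistics import mean, median

cv_score_dict = {'feat_a': [0.10, 0.12, 0.08], 'feat_b': [0.30, 0.28, 0.31]}   # hypothetical
metric_fn = mean                                        # or median, per the 'metric' argument
reduced = {k: metric_fn(v) for k, v in cv_score_dict.items()}
# reduced is approximately {'feat_a': 0.10, 'feat_b': 0.297}; these per-feature values drive the bar plot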

def select_features(self, selected_feature_lists, max_features_to_keep, meta_feature_ranks):
"""
22 changes: 18 additions & 4 deletions streamline/featurefns/utils.py
@@ -1,26 +1,40 @@
import os
from streamline.featurefns.load_algorithms import load_class_from_folder

# Load all algorithm classes from the specified folder
SUPPORTED_ALGORITHM_OBJ = load_class_from_folder()

# Extract the model names from the loaded algorithm objects
SUPPORTED_ALGORITHM = [m.model_name for m in SUPPORTED_ALGORITHM_OBJ]

# Extract the small names (short names) from the loaded algorithm objects
SUPPORTED_ALGORITHM_SMALL = [m.small_name for m in SUPPORTED_ALGORITHM_OBJ]

# Create a dictionary mapping both model names and small names to their respective algorithm objects
ALGORITHM_DICT = dict(zip(SUPPORTED_ALGORITHM + SUPPORTED_ALGORITHM_SMALL,
SUPPORTED_ALGORITHM_OBJ + SUPPORTED_ALGORITHM_OBJ))
SUPPORTED_ALGORITHM_OBJ + SUPPORTED_ALGORITHM_OBJ))

# Create a dictionary mapping both model names and small names to themselves (for label checking)
LABELS = dict(zip(SUPPORTED_ALGORITHM + SUPPORTED_ALGORITHM_SMALL,
SUPPORTED_ALGORITHM + SUPPORTED_ALGORITHM))
SUPPORTED_ALGORITHM + SUPPORTED_ALGORITHM_SMALL))


def is_supported_algorithm(string):
"""
Check if the given string corresponds to a supported algorithm.
If it is supported, return the label.
If not, raise an exception.
"""
try:
return LABELS[string]
except KeyError:
raise Exception("Unknown Model")


def algorithm_str_to_obj(string):
assert is_supported_algorithm(string)
return ALGORITHM_DICT[string]
"""
Convert an algorithm name (or small name) to its corresponding algorithm object.
This function asserts that the given string is a supported algorithm.
"""
assert is_supported_algorithm(string) # Ensure the string is a supported algorithm
return ALGORITHM_DICT[string] # Return the corresponding algorithm object
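
A brief usage sketch of these helpers ('MI' is the short label used elsewhere in this commit, e.g. in test_classification.py, and is assumed here to be a registered small name):

from streamline.featurefns.utils import algorithm_str_to_obj, is_supported_algorithm

label = is_supported_algorithm("MI")    # returns the stored label, or raises Exception("Unknown Model")
algo_cls = algorithm_str_to_obj("MI")   # the corresponding algorithm class, e.g. Mutual Information
print(algo_cls.model_name)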
13 changes: 11 additions & 2 deletions streamline/modeling/load_models.py
@@ -3,6 +3,15 @@
from pathlib import Path

def load_class_from_folder(model_type="BinaryClassification"):
"""
Load classes from a specified folder based on the model type.
Args:
model_type (str): Type of model to load (BinaryClassification, MulticlassClassification, Regression).
Returns:
list: Sorted list of loaded classes based on their model_name attribute.
"""
folder_path, package_path = None, None

# Determine the folder path and package path based on the model type
@@ -18,9 +27,9 @@ def load_class_from_folder(model_type="BinaryClassification"):

classes = list()

# Iterate over Python files in the specified folder
# Iterate over Python files in the specified folder, excluding '__init__.py'
for py in [f[:-3] for f in os.listdir(folder_path) if f.endswith('.py') and f != '__init__.py']:
# Import the module dynamically
# Dynamically import the module
mod = __import__('.'.join([package_path, py]), fromlist=[py])
# Get all class objects defined in the module
classes_list = [getattr(mod, x) for x in dir(mod) if isinstance(getattr(mod, x), type)]
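
A side note on the dynamic import used by both loaders (an equivalent sketch, not a change made in this commit): __import__ with fromlist resolves the submodule the same way importlib.import_module does on the full dotted path.

import importlib

package_path, py = 'streamline.featurefns.algorithms', 'mutual_information'   # 'mutual_information' is a hypothetical module name
# Equivalent to: mod = __import__('.'.join([package_path, py]), fromlist=[py])
mod = importlib.import_module(package_path + '.' + py)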
4 changes: 2 additions & 2 deletions streamline/postanalysis/gererate_report.py
@@ -941,11 +941,11 @@ def job(self):
self.analysis_report.set_font(family='times', size=9)
self.analysis_report.image(
self.experiment_path + '/' + self.datasets[
k] + '/feature_selection/mutual_information/TopAverageScores.png',
k] + '/feature_selection/mutual_information/TopMeanScores.png',
5,
12, 100, 135) # Images adjusted to fit a width of 100 and length of 135
self.analysis_report.image(
self.experiment_path + '/' + self.datasets[k] + '/feature_selection/multisurf/TopAverageScores.png',
self.experiment_path + '/' + self.datasets[k] + '/feature_selection/multisurf/TopMeanScores.png',
105, 12,
100,
135)
18 changes: 11 additions & 7 deletions streamline/postanalysis/statistics.py
@@ -550,9 +550,10 @@ def primary_stats_regression(self, master_list=None):
results = {'Max Error': mes, 'Mean Absolute Error': maes, 'Mean Squared Error': mses,
'Median Absolute Error': mdaes, 'Explained Variance': evss, 'Pearson Correlation': corrs}
dr = pd.DataFrame(results)
dr.index = list(range(0, self.cv_partitions))
dr.index.name = 'CV #'
filepath = self.full_path + '/model_evaluation/' + self.abbrev[algorithm] + "_performance.csv"
dr.to_csv(filepath, header=True, index=False)
metric_dict[algorithm] = results
dr.to_csv(filepath, header=True, index=True)
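
A sketch of what the new indexing produces (metric values are made up): naming the index 'CV #' and writing with index=True makes the partition number the first column of each per-algorithm performance CSV.

import pandas as pd

results = {'Max Error': [1.92, 2.07], 'Mean Absolute Error': [0.41, 0.38]}   # made-up values for 2 CV partitions
dr = pd.DataFrame(results)
dr.index = list(range(0, 2))
dr.index.name = 'CV #'
dr.to_csv('example_performance.csv', header=True, index=True)   # first CSV column is now 'CV #'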

# Save Average FI Stats
if master_list is None:
@@ -702,8 +703,10 @@ def primary_stats_multiclass(self, master_list=None, rep_data=None):
'Precision (PPV)': s_pr, 'ROC AUC': aucs, 'PRC AUC': praucs,
'PRC APS': aveprecs}
dr = pd.DataFrame(results)
dr.index = list(range(0, self.cv_partitions))
dr.index.name = 'CV #'
filepath = self.full_path + '/model_evaluation/' + self.abbrev[algorithm] + "_performance.csv"
dr.to_csv(filepath, header=True, index=False)
dr.to_csv(filepath, header=True, index=True)
metric_dict[algorithm] = results

# Save Median FI Stats
@@ -866,9 +869,10 @@ def primary_stats_classification(self, master_list=None, rep_data=None):
'NPV': s_npv, 'LR+': s_lrp, 'LR-': s_lrm, 'ROC AUC': aucs, 'PRC AUC': praucs,
'PRC APS': aveprecs}
dr = pd.DataFrame(results)
dr.index = list(range(0, self.cv_partitions))
dr.index.name = 'CV #'
filepath = self.full_path + '/model_evaluation/' + self.abbrev[algorithm] + "_performance.csv"
dr.to_csv(filepath, header=True, index=False)
metric_dict[algorithm] = results
dr.to_csv(filepath, header=True, index=True)

# Save FI scores for all CV models
if master_list is None:
@@ -978,7 +982,7 @@ def do_model_prc(self, algorithm, precs, praucs, mean_recall, alg_result_table,

no_skill = len(test_y[test_y == 1]) / len(test_y) # Fraction of cases
# Plot no-skill line
plt.plot([0, 1], [no_skill, no_skill], color='black', linestyle='--', label='No-Skill', alpha=.8)
plt.plot([0, 1], [no_skill, no_skill], color='black', linestyle='--', label='No-Skill (cutoff = %0.3f)' % (no_skill), alpha=.8)
# Plot average line for all CVs
std_pr_auc = np.std(praucs)
plt.plot(mean_recall, mean_prec, color=self.colors[algorithm],
@@ -1062,7 +1066,7 @@ def do_plot_prc(self, result_table, rep_data=None, replicate=False):
no_skill = len(test_y[test_y == 1]) / len(test_y) # Fraction of cases

# Plot no-skill line
plt.plot([0, 1], [no_skill, no_skill], color='black', linestyle='--', label='No-Skill', alpha=.8)
plt.plot([0, 1], [no_skill, no_skill], color='black', linestyle='--', label='No-Skill (cutoff = %0.3f)' % (no_skill), alpha=.8)
# Specify plot axes,labels, and legend
plt.xticks(np.arange(0.0, 1.1, step=0.1))
plt.xlabel("Recall (Sensitivity)", fontsize=15)
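
For reference, a small sketch of the no-skill baseline that the updated labels now report (test_y values are illustrative): for a precision-recall curve, the no-skill precision equals the fraction of positive cases in the evaluation set.

import numpy as np

test_y = np.array([0, 0, 0, 1, 0, 1, 0, 0, 0, 1])            # illustrative labels, 3 of 10 positive
no_skill = len(test_y[test_y == 1]) / len(test_y)             # 0.3
print('No-Skill (cutoff = %0.3f)' % no_skill)                 # matches the new legend label format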
2 changes: 1 addition & 1 deletion streamline/tests/test_classification.py
@@ -15,7 +15,7 @@

# pytest.skip("Tested Already", allow_module_level=True)

algorithms, run_parallel, output_path = ["MI", "MS", "MSS"], False, "./tests/"
algorithms, run_parallel, output_path = ["MI", "MS"], False, "./tests/"
dataset_path, experiment_name = "./data/DemoData/", "demo"
model_algorithms = ["LR", "DT", "NB"]
rep_data_path = "./data/DemoRepData/"
