Commit
Added comments to the basemodels, submodels, modelutils (“modeling” folder) and legacy phase of streamline (in "legacy" folder)
raptor419 committed May 17, 2024
1 parent cbb01d7 commit fb67832
Showing 18 changed files with 367 additions and 237 deletions.
13 changes: 11 additions & 2 deletions streamline/legacy/CompareJobSubmit.py
@@ -3,22 +3,31 @@
import pickle
from pathlib import Path

# Determine the directory where the script is located
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
# Add the grandparent directory to the system path to allow importing modules from there
sys.path.append(str(Path(SCRIPT_DIR).parent.parent))

# Import the CompareJob class from the specified module
from streamline.postanalysis.dataset_compare import CompareJob


def run_cluster(argv):
# The first argument is expected to be the path to a parameters file
param_path = argv[1]

# Load the parameters from the specified file using pickle
with open(param_path, "rb") as input_file:
params = pickle.load(input_file)

# Update the global namespace with the loaded parameters
globals().update(params)

# Instantiate the CompareJob class with the loaded parameters
job_obj = CompareJob(output_path, experiment_name, experiment_path,
outcome_label, outcome_type, instance_label, sig_cutoff, show_plots)
# Run the job
job_obj.run()


# If the script is run as the main module, execute the run_cluster function
if __name__ == "__main__":
sys.exit(run_cluster(sys.argv))
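
Note: each of these legacy *JobSubmit scripts follows the same calling convention: the run driver pickles a dictionary of job parameters and launches the script with the pickle's path as its only argument, and globals().update(params) then exposes each key as a module-level name. A minimal sketch of that convention, assuming hypothetical file names and parameter values (only the key names are taken from the CompareJob call above):

import pickle
import subprocess

# Hypothetical parameter dictionary; the keys mirror the names CompareJob expects.
params = {
    "output_path": "results",
    "experiment_name": "demo_experiment",
    "experiment_path": "results/demo_experiment",
    "outcome_label": "Class",
    "outcome_type": "Binary",
    "instance_label": None,
    "sig_cutoff": 0.05,
    "show_plots": False,
}

# Write the parameters where the job script can find them (path is illustrative).
with open("compare_job_params.pickle", "wb") as f:
    pickle.dump(params, f)

# The script loads the pickle, injects the keys into its globals, and runs the job.
subprocess.run(["python", "streamline/legacy/CompareJobSubmit.py",
                "compare_job_params.pickle"], check=True)
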
14 changes: 13 additions & 1 deletion streamline/legacy/DataJobSubmit.py
@@ -3,26 +3,38 @@
import pickle
from pathlib import Path

# Determine the directory where the script is located
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
# Add the grandparent directory of the script to the system path
# This allows importing modules from two levels up
sys.path.append(str(Path(SCRIPT_DIR).parent.parent))

# Import the ScaleAndImpute class from the streamline.dataprep.scale_and_impute module
from streamline.dataprep.scale_and_impute import ScaleAndImpute


def run_cluster(argv):
# Get the path to the parameter file from the command line arguments
param_path = argv[1]
# Open the parameter file in binary read mode
with open(param_path, "rb") as input_file:
# Load the parameters from the file using pickle
params = pickle.load(input_file)
# Update the global variables with the parameters from the file
globals().update(params)
# Construct the full output path for the experiment
full_path = output_path + "/" + experiment_name


# Create an instance of the ScaleAndImpute class with the loaded parameters
job_obj = ScaleAndImpute(cv_train_path, cv_test_path,
full_path,
scale_data, impute_data, multi_impute, overwrite_cv,
outcome_label, instance_label, random_state)
# Run the scaling and imputation process
job_obj.run()


if __name__ == "__main__":
# Execute the run_cluster function with command line arguments
# and exit the program with the return value of run_cluster
sys.exit(run_cluster(sys.argv))
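
Note: full_path above is built by string concatenation with a hard-coded "/" separator. An equivalent, platform-neutral form using pathlib (a sketch, not part of this commit) would be:

from pathlib import Path

output_path = "results"              # illustrative values; in the script these come
experiment_name = "demo_experiment"  # from the pickled parameter dictionary
# Join the experiment directory onto the output path without hard-coding separators.
full_path = str(Path(output_path) / experiment_name)
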
71 changes: 43 additions & 28 deletions streamline/legacy/EDAJobSubmit.py
@@ -3,68 +3,83 @@
import pickle
from pathlib import Path

# Determine the directory where the script is located
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
# Add the grandparent directory of the script to the system path
# This allows importing modules from two levels up
sys.path.append(str(Path(SCRIPT_DIR).parent.parent))

# Import Dataset and DataProcess classes from the streamline package
from streamline.utils.dataset import Dataset
from streamline.dataprep.data_process import DataProcess

# Define a custom dictionary class with dot notation access for attributes
class dotdict(dict):
def __getattr__(self, key):
return self[key]

def __setattr__(self, key, value):
self[key] = value
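
# Illustrative usage (not part of this script): dotdict forwards attribute access
# to dictionary keys, which lets the pickled parameter dict be passed to
# save_metadata below and read via self.<key>:
#     d = dotdict({'output_path': 'results'})
#     d.output_path == d['output_path']  # True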

# Standalone helper that saves data-processing metadata to a pickle file; self is
# expected to be a dotdict of the loaded parameters (see the save_metadata(dotdict(params))
# call in run_cluster below)
def save_metadata(self):
metadata = dict()
# Populate the metadata dictionary with relevant attributes
metadata['Data Path'] = self.data_path
metadata['Output Path'] = self.output_path
metadata['Experiment Name'] = self.experiment_name
metadata['Outcome Label'] = self.outcome_label
metadata['Outcome Type'] = self.outcome_type
metadata['Instance Label'] = self.instance_label
metadata['Match Label'] = self.match_label
metadata['Ignored Features'] = self.ignore_features
metadata['Specified Categorical Features'] = self.categorical_features
metadata['Specified Quantitative Features'] = self.quantitative_features
metadata['CV Partitions'] = self.n_splits
metadata['Partition Method'] = self.partition_method
metadata['Categorical Cutoff'] = self.categorical_cutoff
metadata['Statistical Significance Cutoff'] = self.sig_cutoff
metadata['Engineering Missingness Cutoff'] = self.featureeng_missingness
metadata['Cleaning Missingness Cutoff'] = self.cleaning_missingness
metadata['Correlation Removal Threshold'] = self.correlation_removal_threshold
metadata['List of Exploratory Analysis Ran'] = self.exploration_list
metadata['List of Exploratory Plots Saved'] = self.plot_list
metadata['Random Seed'] = self.random_state
metadata['Run From Notebook'] = self.show_plots
# Pickle the metadata for future use
pickle_out = open(self.output_path + '/' + self.experiment_name + '/' + "metadata.pickle", 'wb')
pickle.dump(metadata, pickle_out)
pickle_out.close()

# Main function to run clustering analysis
def run_cluster(argv):
# Get the path to the parameter file from the command line arguments
param_path = argv[1]
# Open the parameter file in binary read mode
with open(param_path, "rb") as input_file:
# Load the parameters from the file using pickle
params = pickle.load(input_file)
# Update the global variables with the parameters from the file
globals().update(params)
try:
# Try to create a Dataset object with the loaded parameters
dataset = Dataset(dataset_path, outcome_label, match_label, instance_label, outcome_type)
except Exception:
# If an exception occurs, create a dataset tuple and save the metadata
dataset = (dataset_path, outcome_label, match_label, instance_label, outcome_type)
save_metadata(dotdict(params))

# Create a DataProcess object with the dataset and other parameters
eda_obj = DataProcess(dataset, output_path + '/' + experiment_name,
ignore_features,
categorical_features, quantitative_features, exclude_eda_output,
categorical_cutoff, sig_cutoff, featureeng_missingness,
cleaning_missingness, correlation_removal_threshold, partition_method, n_splits,
random_state)
# Run the data processing task
eda_obj.run(top_features)



# If the script is executed directly, run the run_cluster function with command line arguments
if __name__ == "__main__":
sys.exit(run_cluster(sys.argv))
14 changes: 12 additions & 2 deletions streamline/legacy/FImpJobSubmit.py
@@ -3,23 +3,33 @@
import pickle
from pathlib import Path

# Determine the directory where the script is located
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
# Add the grandparent directory of the script to the system path
# This allows importing modules from two levels up
sys.path.append(str(Path(SCRIPT_DIR).parent.parent))

# Import the FeatureImportance class from the streamline.featurefns.importance module
from streamline.featurefns.importance import FeatureImportance


# Main function to run the feature importance analysis
def run_cluster(argv):
# Get the path to the parameter file from the command line arguments
param_path = argv[1]
# Open the parameter file in binary read mode
with open(param_path, "rb") as input_file:
# Load the parameters from the file using pickle
params = pickle.load(input_file)
# Update the global variables with the parameters from the file
globals().update(params)

# Create an instance of the FeatureImportance class with the loaded parameters
job_obj = FeatureImportance(cv_train_path, experiment_path, outcome_label,
instance_label, instance_subset, algorithm,
use_turf, turf_pct, random_state, n_jobs)
# Run the feature importance analysis
job_obj.run()


# If the script is executed directly, run the run_cluster function with command line arguments
if __name__ == "__main__":
sys.exit(run_cluster(sys.argv))
15 changes: 13 additions & 2 deletions streamline/legacy/FSelJobSubmit.py
@@ -3,24 +3,35 @@
import pickle
from pathlib import Path

# Determine the directory where the script is located
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
# Add the grandparent directory of the script to the system path
# This allows importing modules from two levels up
sys.path.append(str(Path(SCRIPT_DIR).parent.parent))

# Import the FeatureSelection class from the streamline.featurefns.selection module
from streamline.featurefns.selection import FeatureSelection


# Main function to run the feature selection process
def run_cluster(argv):
# Get the path to the parameter file from the command line arguments
param_path = argv[1]
# Open the parameter file in binary read mode
with open(param_path, "rb") as input_file:
# Load the parameters from the file using pickle
params = pickle.load(input_file)
# Update the global variables with the parameters from the file
globals().update(params)

# Create an instance of the FeatureSelection class with the loaded parameters
job_obj = FeatureSelection(full_path, n_datasets, algorithms,
outcome_label, instance_label, export_scores,
top_features, max_features_to_keep,
filter_poor_features, overwrite_cv)
# Run the feature selection process
job_obj.run()


# If the script is executed directly, run the run_cluster function with command line arguments
if __name__ == "__main__":
# Exit the script with the status code returned by run_cluster
sys.exit(run_cluster(sys.argv))
31 changes: 26 additions & 5 deletions streamline/legacy/ModelJobSubmit.py
@@ -3,26 +3,35 @@
import sys
from pathlib import Path

# Determine the directory where the script is located
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
# Add the grandparent directory of the script to the system path
# This allows importing modules from two levels up
sys.path.append(str(Path(SCRIPT_DIR).parent.parent))

# Import necessary classes and functions from the streamline package
from streamline.modeling.modeljob import ModelJob
from streamline.modeling.utils import get_fi_for_ExSTraCS


# Main function to run the model training and evaluation
def run_cluster(argv):
# Get the path to the parameter file from the command line arguments
param_path = argv[1]
# Open the parameter file in binary read mode
with open(param_path, "rb") as input_file:
# Load the parameters from the file using pickle
params = pickle.load(input_file)
# Update the global variables with the parameters from the file
globals().update(params)
print(params)
print(vars())

# Commented code for conditional imports based on outcome type with GlobalImport class
# Uncomment if needed for different outcome types
# if outcome_type == "Binary":
# with GlobalImport() as gi:
# from streamline.modeling.classification_utils import model_str_to_obj
# gi()

# elif outcome_type == "Continuous":
# if scoring_metric == 'balanced_accuracy':
# scoring_metric = 'explained_variance'
@@ -36,14 +45,17 @@ def run_cluster(argv):
# gi()
# else:
# raise Exception("Unknown Outcome Type:" + str(outcome_type))


# Load metadata from a previously saved pickle file
file = open(output_path + '/' + experiment_name + '/' + "metadata.pickle", 'rb')
metadata = pickle.load(file)
filter_poor_features = metadata['Filter Poor Features']
outcome_type = metadata['Outcome Type']
file.close()
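# An equivalent context-manager form of the metadata load above (illustrative,
# not part of this commit):
#     with open(output_path + '/' + experiment_name + '/' + "metadata.pickle", 'rb') as f:
#         metadata = pickle.load(f)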

dataset_directory_path = full_path.split('/')[-1]

# Import the appropriate model function based on the outcome type
if outcome_type == "Binary":
from streamline.modeling.classification_utils import model_str_to_obj
elif outcome_type == "Multiclass":
@@ -53,31 +65,37 @@
else:
raise Exception("Unknown Outcome Type:" + str(outcome_type))

# Create an instance of the ModelJob class with the loaded parameters
job_obj = ModelJob(full_path, output_path, experiment_name, cv_count, outcome_label,
instance_label, scoring_metric, metric_direction, n_trials,
timeout, training_subsample, uniform_fi, save_plots, random_state)


# Initialize the model based on the specified algorithm
if algorithm not in ['eLCS', 'XCS', 'ExSTraCS']:
# Standard model initialization
model = model_str_to_obj(algorithm)(cv_folds=3,
scoring_metric=scoring_metric,
metric_direction=metric_direction,
random_state=random_state,
cv=None, n_jobs=n_jobs)
else:
# Special handling for LCS algorithms
if algorithm == 'ExSTraCS':
# Get expert knowledge for ExSTraCS
expert_knowledge = get_fi_for_ExSTraCS(output_path, experiment_name,
dataset_directory_path,
outcome_label, instance_label, cv_count,
filter_poor_features)
if do_lcs_sweep:
# Initialize ExSTraCS with LCS sweep
model = model_str_to_obj(algorithm)(cv_folds=3,
scoring_metric=scoring_metric,
metric_direction=metric_direction,
random_state=random_state,
cv=None, n_jobs=n_jobs,
expert_knowledge=expert_knowledge)
else:
# Initialize ExSTraCS with specific parameters
model = model_str_to_obj(algorithm)(cv_folds=3,
scoring_metric=scoring_metric,
metric_direction=metric_direction,
@@ -87,6 +105,7 @@ def run_cluster(argv):
N=lcs_n, nu=lcs_nu,
expert_knowledge=expert_knowledge)
else:
# Initialize other LCS models
if do_lcs_sweep:
model = model_str_to_obj(algorithm)(cv_folds=3,
scoring_metric=scoring_metric,
@@ -101,8 +120,10 @@ def run_cluster(argv):
cv=None, n_jobs=n_jobs,
iterations=lcs_iterations,
N=lcs_n, nu=lcs_nu)

# Run the model job with the initialized model
job_obj.run(model)

# If the script is executed directly, run the run_cluster function with command line arguments
if __name__ == "__main__":
sys.exit(run_cluster(sys.argv))
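
Note: model_str_to_obj above resolves an algorithm name to a model wrapper class, which the script then instantiates with the tuning settings. A minimal, self-contained sketch of that name-to-class dispatch pattern (purely illustrative; the real mappings live in the streamline.modeling utility modules such as streamline.modeling.classification_utils, whose contents are not shown in this commit, and all names and values below are assumptions):

class _DummyModel:
    """Stand-in for a streamline model wrapper (illustrative only)."""
    def __init__(self, cv_folds=3, scoring_metric=None, metric_direction=None,
                 random_state=None, cv=None, n_jobs=None):
        self.cv_folds = cv_folds
        self.scoring_metric = scoring_metric
        self.metric_direction = metric_direction
        self.random_state = random_state
        self.n_jobs = n_jobs

# Hypothetical registry mapping algorithm names to wrapper classes.
MODEL_REGISTRY = {"Dummy": _DummyModel}

def model_str_to_obj(algorithm):
    # Resolve the algorithm name to its class, mirroring the
    # model_str_to_obj(algorithm)(cv_folds=3, ...) calls in the script above.
    try:
        return MODEL_REGISTRY[algorithm]
    except KeyError as err:
        raise Exception("Unknown Algorithm: " + str(algorithm)) from err

# Usage: look up the class by name, then instantiate it with the job's settings.
model = model_str_to_obj("Dummy")(cv_folds=3, scoring_metric="balanced_accuracy",
                                  metric_direction="maximize", random_state=42,
                                  cv=None, n_jobs=1)
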