Skip to content

Commit

Permalink
Corrected all minus report
Browse files Browse the repository at this point in the history
  • Loading branch information
raptor419 committed May 14, 2024
1 parent 8a912fc commit cbb01d7
Show file tree
Hide file tree
Showing 23 changed files with 344 additions and 317 deletions.
20 changes: 16 additions & 4 deletions streamline/dataprep/data_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,10 +67,19 @@ def __init__(self, dataset, experiment_path, ignore_features=None,
"""
super().__init__()
if type(dataset) != Dataset:
raise (Exception("dataset input is not of type Dataset"))
self.dataset = dataset
self.outcome_type = dataset.outcome_type
self.dataset_path = dataset.path
try:
assert(type(dataset) == tuple)
dataset_path, outcome_label, match_label, instance_label, outcome_type = dataset
self.dataset = Dataset(dataset_path, outcome_label, match_label, instance_label, outcome_type, load_data=False)
self.load_data = False
self.dataset_path = dataset.path
except Exception as e:
raise (Exception("dataset input is invalid " + str(e)))
else:
self.dataset = dataset
self.outcome_type = dataset.outcome_type
self.dataset_path = dataset.path
self.load_data = True
self.experiment_path = experiment_path
self.random_state = random_state

Expand Down Expand Up @@ -164,6 +173,9 @@ def run(self, top_features=20):
"""
self.job_start_time = time.time()
if not self.load_data:
self.dataset.load_data()
self.outcome_type = self.dataset.outcome_type

# Conduct Exploratory Analysis, Data Cleaning, and Feature Engineering
self.run_process(top_features)
Expand Down
12 changes: 4 additions & 8 deletions streamline/legacy/CompareJobSubmit.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,10 @@


def run_cluster(argv):
output_path = argv[1]
experiment_name = argv[2]
experiment_path = argv[3] if argv[3] != "None" else None
outcome_type = argv[4]
outcome_label = argv[5]
instance_label = argv[6] if argv[6] != "None" else None
sig_cutoff = float(argv[7])
show_plots = eval(argv[8])
param_path = argv[1]
with open(param_path, "rb") as input_file:
params = pickle.load(input_file)
globals().update(params)

job_obj = CompareJob(output_path, experiment_name, experiment_path,
outcome_label, outcome_type, instance_label, sig_cutoff, show_plots)
Expand Down
17 changes: 7 additions & 10 deletions streamline/legacy/DataJobSubmit.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
import sys
import pickle
from pathlib import Path

SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
Expand All @@ -9,16 +10,12 @@


def run_cluster(argv):
cv_train_path = argv[1]
cv_test_path = argv[2]
full_path = argv[3]
scale_data = eval(argv[4])
impute_data = eval(argv[5])
multi_impute = eval(argv[6])
overwrite_cv = eval(argv[7])
outcome_label = argv[8] if argv[8] != "None" else None
instance_label = argv[9] if argv[9] != "None" else None
random_state = int(argv[10]) if argv[10] != "None" else None
param_path = argv[1]
with open(param_path, "rb") as input_file:
params = pickle.load(input_file)
globals().update(params)
full_path = output_path + "/" + experiment_name


job_obj = ScaleAndImpute(cv_train_path, cv_test_path,
full_path,
Expand Down
44 changes: 42 additions & 2 deletions streamline/legacy/EDAJobSubmit.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,21 +9,61 @@
from streamline.utils.dataset import Dataset
from streamline.dataprep.data_process import DataProcess

class dotdict(dict):
    """Dictionary whose items can also be read and written as attributes.

    ``d.key`` returns ``d["key"]`` and ``d.key = value`` stores ``value``
    under ``d["key"]``.

    Raises:
        AttributeError: when attribute access names a key that is not present.
    """

    def __getattr__(self, key):
        # __getattr__ only runs after normal attribute lookup has failed, so
        # real dict methods (keys, items, ...) are never shadowed by items.
        try:
            return self[key]
        except KeyError:
            # The attribute protocol (hasattr, getattr with a default,
            # copy.deepcopy, pickle) expects AttributeError for a missing
            # attribute; letting KeyError escape makes those helpers crash.
            raise AttributeError(key) from None

    def __setattr__(self, key, value):
        # Attribute assignment is stored as a dict item, not in __dict__.
        self[key] = value

def save_metadata(self):
    """Pickle the experiment's run parameters to ``metadata.pickle``.

    Args:
        self: any object exposing the run parameters as attributes
            (``data_path``, ``output_path``, ``experiment_name``, ...).
            In this file it is called with ``dotdict(params)``.

    The file is written to ``<output_path>/<experiment_name>/metadata.pickle``;
    that directory must already exist.
    """
    metadata = {
        'Data Path': self.data_path,
        'Output Path': self.output_path,
        'Experiment Name': self.experiment_name,
        'Outcome Label': self.outcome_label,
        'Outcome Type': self.outcome_type,
        'Instance Label': self.instance_label,
        'Match Label': self.match_label,
        'Ignored Features': self.ignore_features,
        'Specified Categorical Features': self.categorical_features,
        'Specified Quantitative Features': self.quantitative_features,
        'CV Partitions': self.n_splits,
        'Partition Method': self.partition_method,
        'Categorical Cutoff': self.categorical_cutoff,
        'Statistical Significance Cutoff': self.sig_cutoff,
        'Engineering Missingness Cutoff': self.featureeng_missingness,
        'Cleaning Missingness Cutoff': self.cleaning_missingness,
        'Correlation Removal Threshold': self.correlation_removal_threshold,
        'List of Exploratory Analysis Ran': self.exploration_list,
        'List of Exploratory Plots Saved': self.plot_list,
        'Random Seed': self.random_state,
        # Stored under the 'Run From Notebook' key, but the value is show_plots.
        'Run From Notebook': self.show_plots,
    }
    # Pickle the metadata for future use. The context manager closes the
    # file even if pickle.dump raises, so the handle is never leaked.
    metadata_path = self.output_path + '/' + self.experiment_name + '/' + "metadata.pickle"
    with open(metadata_path, 'wb') as pickle_out:
        pickle.dump(metadata, pickle_out)


def run_cluster(argv):
param_path = argv[1]
with open(param_path, "rb") as input_file:
params = pickle.load(input_file)
globals().update(params)
try:
dataset = Dataset(dataset_path, outcome_label, match_label, instance_label, outcome_type)
except Exception:
dataset = (dataset_path, outcome_label, match_label, instance_label, outcome_type)
save_metadata(dotdict(params))


dataset = Dataset(dataset_path, outcome_label, match_label, instance_label, outcome_type)
eda_obj = DataProcess(dataset, output_path + '/' + experiment_name,
ignore_features,
categorical_features, quantitative_features, exclude_eda_output,
categorical_cutoff, sig_cutoff, featureeng_missingness,
cleaning_missingness, correlation_removal_threshold, partition_method, n_splits,
random_state)
eda_obj.run(top_features)



if __name__ == "__main__":
Expand Down
15 changes: 5 additions & 10 deletions streamline/legacy/FImpJobSubmit.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
import sys
import pickle
from pathlib import Path

SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
Expand All @@ -9,16 +10,10 @@


def run_cluster(argv):
cv_train_path = argv[1]
experiment_path = argv[2]
outcome_label = argv[3]
instance_label = argv[4] if argv[4] != "None" else None
instance_subset = None if argv[5] == "None" else eval(argv[5])
algorithm = argv[6]
use_turf = eval(argv[7])
turf_pct = eval(argv[8])
random_state = None if argv[9] == "None" else int(argv[9])
n_jobs = None if argv[10] == "None" else int(argv[10])
param_path = argv[1]
with open(param_path, "rb") as input_file:
params = pickle.load(input_file)
globals().update(params)

job_obj = FeatureImportance(cv_train_path, experiment_path, outcome_label,
instance_label, instance_subset, algorithm,
Expand Down
17 changes: 5 additions & 12 deletions streamline/legacy/FSelJobSubmit.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
import sys
import pickle
from pathlib import Path

SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
Expand All @@ -9,18 +10,10 @@


def run_cluster(argv):
full_path = argv[1]
n_datasets = int(argv[2])
MI, MS = "MI", "MS"
algorithms = None if argv[3] == "None" else eval(argv[3])
print(algorithms)
outcome_label = argv[4]
instance_label = argv[5] if argv[5] != "None" else None
export_scores = eval(argv[6])
top_features = int(argv[7])
max_features_to_keep = int(argv[8])
filter_poor_features = eval(argv[9])
overwrite_cv = eval(argv[10])
param_path = argv[1]
with open(param_path, "rb") as input_file:
params = pickle.load(input_file)
globals().update(params)

job_obj = FeatureSelection(full_path, n_datasets, algorithms,
outcome_label, instance_label, export_scores,
Expand Down
61 changes: 37 additions & 24 deletions streamline/legacy/ModelJobSubmit.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,43 +7,56 @@
sys.path.append(str(Path(SCRIPT_DIR).parent.parent))

from streamline.modeling.modeljob import ModelJob
from streamline.modeling.utils import model_str_to_obj
from streamline.modeling.utils import get_fi_for_ExSTraCS


def run_cluster(argv):
full_path = argv[1]
output_path = argv[2]
experiment_name = argv[3]
cv_count = int(argv[4])
outcome_label = argv[5]
outcome_type = argv[6]
instance_label = argv[7] if argv[7] != "None" else None
scoring_metric = argv[8]
metric_direction = argv[9]
n_trials = int(argv[10])
timeout = int(argv[11])
training_subsample = int(argv[12])
uniform_fi = eval(argv[13])
save_plot = eval(argv[14])
random_state = None if argv[15] == "None" else int(argv[15])
algorithm = argv[16]
n_jobs = None if argv[17] == "None" else int(argv[17])
do_lcs_sweep = eval(argv[18])
lcs_iterations = int(argv[19])
lcs_n = int(argv[20])
lcs_nu = int(argv[21])
param_path = argv[1]
with open(param_path, "rb") as input_file:
params = pickle.load(input_file)
globals().update(params)
print(params)
print(vars())

# if outcome_type == "Binary":
# with GlobalImport() as gi:
# from streamline.modeling.classification_utils import model_str_to_obj
# gi()

# elif outcome_type == "Continuous":
# if scoring_metric == 'balanced_accuracy':
# scoring_metric = 'explained_variance'
# with GlobalImport() as gi:
# from streamline.modeling.regression_utils import model_str_to_obj
# gi()
# elif outcome_type == "Multiclass":
# # logging.info("Using Multiclass Classification Models")
# with GlobalImport() as gi:
# from streamline.modeling.multiclass_utils import model_str_to_obj
# gi()
# else:
# raise Exception("Unknown Outcome Type:" + str(outcome_type))

file = open(output_path + '/' + experiment_name + '/' + "metadata.pickle", 'rb')
metadata = pickle.load(file)
filter_poor_features = metadata['Filter Poor Features']
outcome_type = metadata['Outcome Type']
file.close()

dataset_directory_path = full_path.split('/')[-1]

if outcome_type == "Binary":
from streamline.modeling.classification_utils import model_str_to_obj
elif outcome_type == "Multiclass":
from streamline.modeling.multiclass_utils import model_str_to_obj
elif outcome_type == "Continuous":
from streamline.modeling.regression_utils import model_str_to_obj
else:
raise Exception("Unknown Outcome Type:" + str(outcome_type))

job_obj = ModelJob(full_path, output_path, experiment_name, cv_count, outcome_label,
instance_label, scoring_metric, metric_direction, n_trials,
timeout, training_subsample, uniform_fi, save_plot, random_state)
timeout, training_subsample, uniform_fi, save_plots, random_state)


if algorithm not in ['eLCS', 'XCS', 'ExSTraCS']:
model = model_str_to_obj(algorithm)(cv_folds=3,
Expand Down
55 changes: 18 additions & 37 deletions streamline/legacy/RepJobSubmit.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,45 +10,26 @@


def run_cluster(argv):
dataset_filename = argv[1]
dataset_for_rep = argv[2]
full_path = argv[3]
outcome_label = argv[4]
outcome_type = argv[5]
instance_label = argv[6] if argv[6] != "None" else None
match_label = argv[7] if argv[7] != "None" else None
experiment_path = '/'.join(full_path.split('/')[:-1])
file = open(experiment_path + '/' + "algInfo.pickle", 'rb')
alg_info = pickle.load(file)
file.close()
temp_algo = []
for key in alg_info:
if alg_info[key][0]:
temp_algo.append(key)
algorithms = temp_algo
file = open(experiment_path + '/' + "metadata.pickle", 'rb')
metadata = pickle.load(file)
file.close()
ignore_features = metadata['Ignored Features']
exclude = None
len_cv = int(argv[9])
if argv != 'None':
exclude_options = argv[10].split(',')
exclude_options = [x.strip() for x in exclude_options]
else:
exclude_options = None
categorical_cutoff = int(argv[11]) if argv[11] != "None" else None
sig_cutoff = float(argv[12]) if argv[12] != "None" else None
scale_data = eval(argv[13])
impute_data = eval(argv[14])
multi_impute = eval(argv[15])
show_plots = eval(argv[16])
scoring_metric = argv[17]
random_state = eval(argv[18])
param_path = argv[1]
with open(param_path, "rb") as input_file:
params = pickle.load(input_file)
globals().update(params)
# file = open(experiment_path + '/' + "algInfo.pickle", 'rb')
# alg_info = pickle.load(file)
# file.close()
# temp_algo = []
# for key in alg_info:
# if alg_info[key][0]:
# temp_algo.append(key)
# algorithms = temp_algo
# file = open(experiment_path + '/' + "metadata.pickle", 'rb')
# metadata = pickle.load(file)
# file.close()
# ignore_features = metadata['Ignored Features']

job_obj = ReplicateJob(dataset_filename, dataset_for_rep, full_path, outcome_label, outcome_type, instance_label,
match_label, ignore_features, len_cv,
exclude_options,
match_label, ignore_features, cv_partitions,
exclude_plots,
categorical_cutoff, sig_cutoff, scale_data, impute_data,
multi_impute, show_plots, scoring_metric, random_state)
job_obj.run()
Expand Down
10 changes: 4 additions & 6 deletions streamline/legacy/ReportJobSubmit.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,10 @@


def run_cluster(argv):
output_path = argv[1]
experiment_name = argv[2]
experiment_path = None
training = eval(argv[3])
train_data_path = None if argv[4] == "None" else argv[4]
rep_data_path = None if argv[5] == "None" else argv[5]
param_path = argv[1]
with open(param_path, "rb") as input_file:
params = pickle.load(input_file)
globals().update(params)

job_obj = ReportJob(output_path, experiment_name, experiment_path,
training, train_data_path, rep_data_path)
Expand Down
23 changes: 5 additions & 18 deletions streamline/legacy/StatsJobSubmit.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,27 +10,14 @@


def run_cluster(argv):
full_path = argv[1]
experiment_path = '/'.join(full_path.split('/')[:-1])
outcome_label = argv[2]
outcome_type = argv[3]
instance_label = argv[4] if argv[4] != "None" else None
scoring_metric = argv[5]
len_cv = int(argv[6])
top_features = int(argv[7]) if argv[7] != "None" else None
sig_cutoff = float(argv[8]) if argv[8] != "None" else None
metric_weight = argv[9] if argv[9] != "None" else None
scale_data = eval(argv[10])
if argv[11] != 'None':
exclude_options = argv[11].split(',')
exclude_options = [x.strip() for x in exclude_options]
else:
exclude_options = None
show_plots = eval(argv[12])
param_path = argv[1]
with open(param_path, "rb") as input_file:
params = pickle.load(input_file)
globals().update(params)

job_obj = StatsJob(full_path, outcome_label, outcome_type, instance_label, scoring_metric,
len_cv, top_features, sig_cutoff, metric_weight, scale_data,
exclude_options,
exclude_plots,
show_plots)
job_obj.run()

Expand Down
Loading

0 comments on commit cbb01d7

Please sign in to comment.