Dict comprehensions, min(max()), "or", and more #98

Closed · wants to merge 2 commits
116 changes: 39 additions & 77 deletions tpot/tpot.py
@@ -49,6 +49,7 @@
from deap import tools
from deap import gp


class TPOT(object):
"""TPOT automatically creates and optimizes machine learning pipelines using genetic programming."""

@@ -146,11 +147,7 @@ def __init__(self, population_size=100, generations=100,
self._toolbox.register('mate', gp.cxOnePoint)
self._toolbox.register('expr_mut', gp.genFull, min_=0, max_=3)
self._toolbox.register('mutate', self._random_mutation_operator)

if not scoring_function:
self.scoring_function = self._balanced_accuracy
else:
self.scoring_function = scoring_function
self.scoring_function = scoring_function or self._balanced_accuracy
Contributor:
I'm unsure about changes like this because they make the code less explicit and harder to read. In fact, I'm not even sure this line works? I'm assuming it does since you submitted the change.
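The idiom does work: x or y evaluates to x when x is truthy and to y otherwise, so a scoring_function of None falls back to the default. The caveat is that any falsy value, not just None, triggers the fallback. A minimal sketch of the behavior, using a hypothetical pick_scoring helper:

def pick_scoring(scoring_function, default):
    # x or y returns y whenever x is falsy: None, '', 0, [], ...
    return scoring_function or default

assert pick_scoring(None, 'balanced_accuracy') == 'balanced_accuracy'
assert pick_scoring('f1', 'balanced_accuracy') == 'f1'
assert pick_scoring('', 'balanced_accuracy') == 'balanced_accuracy'  # '' is falsy, falls back too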


def fit(self, features, classes, feature_names=None):
"""Fits a machine learning pipeline that maximizes classification accuracy on the provided data
@@ -183,10 +180,8 @@ def fit(self, features, classes, feature_names=None):
training_testing_data = pd.DataFrame(data=features, columns=feature_names)
training_testing_data['class'] = classes

new_col_names = {}
for column in training_testing_data.columns.values:
if type(column) != str:
new_col_names[column] = str(column).zfill(10)
new_col_names = {column: str(column).zfill(10) for column
in training_testing_data.columns.values if type(column) != str}
Contributor:
Same readability concern as above wrt dict comprehensions. This seems harder to read, and I'm not sure we gain anything by it?
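For comparison, the loop and the comprehension build the same mapping; cols below is a hypothetical stand-in for training_testing_data.columns.values:

cols = [0, 1, 'class']

# loop form
new_col_names = {}
for column in cols:
    if type(column) != str:
        new_col_names[column] = str(column).zfill(10)

# comprehension form: one expression, same result
assert new_col_names == {column: str(column).zfill(10)
                         for column in cols if type(column) != str}

Both produce {0: '0000000000', 1: '0000000001'}.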

training_testing_data.rename(columns=new_col_names, inplace=True)

# Randomize the order of the columns so there is no potential bias introduced by the initial order
@@ -302,10 +297,8 @@ def predict(self, testing_features):
most_frequent_class = Counter(self._training_classes).most_common(1)[0][0]
training_testing_data.loc[:, 'guess'] = most_frequent_class

new_col_names = {}
for column in training_testing_data.columns.values:
if type(column) != str:
new_col_names[column] = str(column).zfill(10)
new_col_names = {column: str(column).zfill(10) for column
in training_testing_data.columns.values if type(column) != str}
training_testing_data.rename(columns=new_col_names, inplace=True)

# Transform the tree expression in a callable function
@@ -354,7 +347,6 @@ def score(self, testing_features, testing_classes):

return self._evaluate_individual(self._optimized_pipeline, training_testing_data)[1]


def export(self, output_file_name):
"""Exports the current optimized pipeline as Python code

@@ -411,8 +403,8 @@ def _decision_tree(self, input_df, max_features, max_depth):
max_features = 'auto'
elif max_features == 1:
max_features = None
elif max_features > len(input_df.columns) - 3:
max_features = len(input_df.columns) - 3
else:
max_features = min(max_features, len(input_df.columns) - 3)
Contributor:
👍 to changes like these where if branches are replaced with min and max. I would merge a PR with only those changes right now.
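The min(max()) pattern is the standard one-line clamp; a small sketch with the PR's n_estimators bounds (the clamp helper is illustrative, not part of the codebase):

def clamp(value, lo, hi):
    # max() raises value to the lower bound, min() caps it at the upper bound
    return min(max(value, lo), hi)

assert clamp(750, 1, 500) == 500  # above range, capped
assert clamp(-3, 1, 500) == 1     # below range, raised
assert clamp(42, 1, 500) == 42    # in range, unchanged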


if max_depth < 1:
max_depth = None
@@ -438,17 +430,14 @@ def _random_forest(self, input_df, n_estimators, max_features):
Also adds the classifiers's predictions as a 'SyntheticFeature' column.

"""
if n_estimators < 1:
n_estimators = 1
elif n_estimators > 500:
n_estimators = 500
n_estimators = min(max(n_estimators, 1), 500)

if max_features < 1:
max_features = 'auto'
elif max_features == 1:
max_features = None
elif max_features > len(input_df.columns) - 3:
max_features = len(input_df.columns) - 3
else:
max_features = min(max_features, len(input_df.columns) - 3)

return self._train_model_and_predict(input_df, RandomForestClassifier, n_estimators=n_estimators, max_features=max_features, random_state=42, n_jobs=-1)

@@ -496,7 +485,6 @@ def _svc(self, input_df, C):

return self._train_model_and_predict(input_df, SVC, C=C, random_state=42)


def _knnc(self, input_df, n_neighbors):
"""Fits a k-nearest neighbor classifier

@@ -547,11 +535,8 @@ def _xgradient_boosting(self, input_df, learning_rate, n_estimators, max_depth):
if learning_rate <= 0.:
learning_rate = 0.0001

if n_estimators < 1:
n_estimators = 1
elif n_estimators > 500:
n_estimators = 500

n_estimators = min(max(n_estimators, 1), 500)

if max_depth < 1:
max_depth = None

@@ -588,7 +573,7 @@ def _train_model_and_predict(self, input_df, model, **kwargs):

# Try to seed the random_state parameter if the model accepts it.
try:
clf = model(random_state=42,**kwargs)
clf = model(random_state=42, **kwargs)
clf.fit(training_features, training_classes)
except TypeError:
clf = model(**kwargs)
@@ -624,7 +609,8 @@ def _combine_dfs(input_df1, input_df2):
Returns a DataFrame containing the features of both input_df1 and input_df2

"""
return input_df1.join(input_df2[[column for column in input_df2.columns.values if column not in input_df1.columns.values]]).copy()
return input_df1.join(input_df2[[column for column in input_df2.columns.values
if column not in input_df1.columns.values]]).copy()

def _rfe(self, input_df, num_features, step):
"""Uses scikit-learn's Recursive Feature Elimination to learn the subset of features that have the highest weights according to the estimator
@@ -651,11 +637,7 @@ def _rfe(self, input_df, num_features, step):
step = 0.1
elif step >= 1.:
step = 0.99
if num_features < 1:
num_features = 1
elif num_features > len(training_features.columns):
num_features = len(training_features.columns)

num_features = min(max(num_features, 1), len(training_features.columns))
if len(training_features.columns.values) == 0:
return input_df.copy()

@@ -688,12 +670,8 @@ def _select_percentile(self, input_df, percentile):
training_features = input_df.loc[input_df['group'] == 'training'].drop(['class', 'group', 'guess'], axis=1)
training_class_vals = input_df.loc[input_df['group'] == 'training', 'class'].values

if percentile < 0:
percentile = 0
elif percentile > 100:
percentile = 100

if len(training_features.columns.values) == 0:
percentile = min(max(percentile, 0), 100)
if not len(training_features.columns.values):
return input_df.copy()
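A note on the emptiness guards in these hunks: columns.values is a NumPy array, and truth-testing an array with more than one element raises ValueError, which is why the guards keep an explicit len() check rather than a bare not. A minimal reproduction, assuming a recent NumPy:

import numpy as np

try:
    bool(np.array(['a', 'b']))  # more than one element
except ValueError:
    pass  # "The truth value of an array with more than one element is ambiguous"

assert not len(np.array([]))  # unambiguous emptiness test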

with warnings.catch_warnings():
@@ -723,6 +701,7 @@ def _select_kbest(self, input_df, k):
Returns a DataFrame containing the `k` best features

"""

training_features = input_df.loc[input_df['group'] == 'training'].drop(['class', 'group', 'guess'], axis=1)
training_class_vals = input_df.loc[input_df['group'] == 'training', 'class'].values

@@ -731,7 +710,7 @@
elif k >= len(training_features.columns):
k = 'all'

if len(training_features.columns.values) == 0:
if not len(training_features.columns.values):
return input_df.copy()

with warnings.catch_warnings():
@@ -765,13 +744,9 @@ def _select_fwe(self, input_df, alpha):
training_class_vals = input_df.loc[input_df['group'] == 'training', 'class'].values

# forcing 0.001 <= alpha <= 0.05
if alpha > 0.05:
alpha = 0.05
elif alpha <= 0.001:
alpha = 0.001

alpha = min(max(alpha, 0.001), 0.05)

if len(training_features.columns.values) == 0:
if not len(training_features.columns.values):
return input_df.copy()

with warnings.catch_warnings():
@@ -858,17 +833,18 @@ def _robust_scaler(self, input_df):
Returns a DataFrame containing the scaled features

"""
training_features = input_df.loc[input_df['group'] == 'training'].drop(['class', 'group', 'guess'], axis=1)
class_group_guess = ['class', 'group', 'guess']
training_features = input_df.loc[input_df['group'] == 'training'].drop(class_group_guess, axis=1)
Contributor:
👍 to this type of change.


if len(training_features.columns.values) == 0:
return input_df.copy()

# The scaler must be fit on only the training data
scaler = RobustScaler()
scaler.fit(training_features.values.astype(np.float64))
scaled_features = scaler.transform(input_df.drop(['class', 'group', 'guess'], axis=1).values.astype(np.float64))
scaled_features = scaler.transform(input_df.drop(class_group_guess, axis=1).values.astype(np.float64))

for col_num, column in enumerate(input_df.drop(['class', 'group', 'guess'], axis=1).columns.values):
for col_num, column in enumerate(input_df.drop(class_group_guess, axis=1).columns.values):
input_df.loc[:, column] = scaled_features[:, col_num]

return input_df.copy()
@@ -890,7 +866,7 @@ def _polynomial_features(self, input_df):

training_features = input_df.loc[input_df['group'] == 'training'].drop(['class', 'group', 'guess'], axis=1)

if len(training_features.columns.values) == 0:
if not len(training_features.columns.values):
return input_df.copy()
elif len(training_features.columns.values) > 700:
# Too many features to produce - skip this operator
Expand All @@ -906,10 +882,8 @@ def _polynomial_features(self, input_df):
modified_df['group'] = input_df['group'].values
modified_df['guess'] = input_df['guess'].values

new_col_names = {}
for column in modified_df.columns.values:
if type(column) != str:
new_col_names[column] = str(column).zfill(10)
new_col_names = {column: str(column).zfill(10) for column
in modified_df.columns.values if type(column) != str}
modified_df.rename(columns=new_col_names, inplace=True)

return modified_df.copy()
@@ -940,15 +914,10 @@ def _pca(self, input_df, n_components, iterated_power):
n_components = None

# Thresholding iterated_power to [1, 10]
if iterated_power < 1:
iterated_power = 1
elif iterated_power > 10:
iterated_power = 10


iterated_power = min(max(iterated_power, 1), 10)
training_features = input_df.loc[input_df['group'] == 'training'].drop(['class', 'group', 'guess'], axis=1)

if len(training_features.columns.values) == 0:
if not len(training_features.columns.values):
return input_df.copy()

# PCA must be fit on only the training data
Expand All @@ -961,10 +930,8 @@ def _pca(self, input_df, n_components, iterated_power):
modified_df['group'] = input_df['group'].values
modified_df['guess'] = input_df['guess'].values

new_col_names = {}
for column in modified_df.columns.values:
if type(column) != str:
new_col_names[column] = str(column).zfill(10)
new_col_names = {column: str(column).zfill(10) for column
in modified_df.columns.values if type(column) != str}
modified_df.rename(columns=new_col_names, inplace=True)

return modified_df.copy()
@@ -986,10 +953,7 @@ def _div(num1, num2):
Returns num1 / num2, or 0 if num2 == 0

"""
if num2 == 0:
return 0.

return float(num1) / float(num2)
return float(num1) / float(num2) if num2 else 0.
Contributor:
Maybe more explicit as return float(num1) / float(num2) if num2 != 0. else 0.?
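The two spellings agree for numeric input, since 0 and 0. are the only falsy numbers involved; a quick check with hypothetical helper names:

def div_truthy(num1, num2):
    return float(num1) / float(num2) if num2 else 0.

def div_explicit(num1, num2):
    return float(num1) / float(num2) if num2 != 0. else 0.

for a, b in [(1, 2), (3, 0), (3, 0.), (0, 5)]:
    assert div_truthy(a, b) == div_explicit(a, b)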


def _evaluate_individual(self, individual, training_testing_data):
"""Determines the `individual`'s fitness according to its performance on the provided data
@@ -1105,6 +1069,7 @@ def _random_mutation_operator(self, individual):
else:
return gp.mutShrink(individual)


def main():
"""Main function that is called when TPOT is run on the command line"""
from _version import __version__
@@ -1151,7 +1116,7 @@ def float_range(value):
value = float(value)
except:
raise argparse.ArgumentTypeError('invalid float value: \'{}\''.format(value))
if value < 0.0 or value > 1.0:
if not 0.0 <= value <= 1.0:
Contributor:
👎
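For readers unfamiliar with the idiom: Python chains comparisons, so the new guard is equivalent to the old two-sided check. A sketch, with an illustrative helper:

def in_unit_interval(value):
    # 0.0 <= value <= 1.0 evaluates as (0.0 <= value) and (value <= 1.0)
    return 0.0 <= value <= 1.0

assert in_unit_interval(0.5)
assert not in_unit_interval(1.5)
assert not in_unit_interval(-0.1)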

raise argparse.ArgumentTypeError('invalid float value: \'{}\''.format(value))
return value

@@ -1196,10 +1161,7 @@ def float_range(value):
if 'Class' in input_data.columns.values:
input_data.rename(columns={'Class': 'class'}, inplace=True)

if args.RANDOM_STATE > 0:
RANDOM_STATE = args.RANDOM_STATE
else:
RANDOM_STATE = None
RANDOM_STATE = args.RANDOM_STATE if args.RANDOM_STATE > 0 else None
Contributor:
👍


training_indices, testing_indices = next(iter(StratifiedShuffleSplit(input_data['class'].values,
n_iter=1,
@@ -1223,7 +1185,7 @@ def float_range(value):
print('\nTraining accuracy: {}'.format(tpot.score(training_features, training_classes)))
print('Testing accuracy: {}'.format(tpot.score(testing_features, testing_classes)))

if args.OUTPUT_FILE != '':
if args.OUTPUT_FILE:
Contributor:
👎
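The two guards agree for strings, since the empty string is the only falsy str; a quick illustration:

for s in ['', 'pipeline.py']:
    assert bool(s) == (s != '')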

tpot.export(args.OUTPUT_FILE)

