Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for pandas DataFrame as input in fit() #684

Merged
merged 2 commits into from
Apr 9, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions tests/tpot_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
from tpot.config.classifier_sparse import classifier_config_sparse

import numpy as np
import pandas as pd
from scipy import sparse
import inspect
import random
Expand Down Expand Up @@ -91,6 +92,16 @@ def closing(arg):
training_features_r, testing_features_r, training_target_r, testing_target_r = \
train_test_split(boston_data.data, boston_data.target, random_state=42)

# Set up pandas DataFrame for testing

input_data = pd.read_csv(
'tests/tests.csv',
sep=',',
dtype=np.float64,
)
pd_features = input_data.drop('class', axis=1)
pd_target = input_data['class']

# Set up the sparse matrix for testing
sparse_features = sparse.csr_matrix(training_features)
sparse_target = training_target
Expand Down Expand Up @@ -751,6 +762,23 @@ def test_fit_4():
assert not (tpot_obj._start_datetime is None)


def test_fit_5():
    """Assert that the TPOT fit function provides an optimized pipeline with pandas DataFrame"""
    # Keep the run tiny so the optimization loop finishes quickly.
    tpot_obj = TPOTClassifier(
        generations=1,
        population_size=1,
        offspring_size=2,
        random_state=42,
        verbosity=0,
    )

    tpot_obj.fit(pd_features, pd_target)

    # fit() must not mutate the caller's DataFrame into another type.
    assert isinstance(pd_features, pd.DataFrame)
    assert isinstance(tpot_obj._optimized_pipeline, creator.Individual)
    assert tpot_obj._start_datetime is not None


def test_memory():
"""Assert that the TPOT fit function runs normally with memory=\'auto\'."""
tpot_obj = TPOTClassifier(
Expand Down
46 changes: 23 additions & 23 deletions tpot/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -538,29 +538,8 @@ def fit(self, features, target, sample_weight=None, groups=None):
Returns a copy of the fitted TPOT object

"""
features = features.astype(np.float64)

# Resets the imputer to be fit for the new dataset
self._fitted_imputer = None
self._imputed = False
# If features is a sparse matrix, do not apply imputation
if sparse.issparse(features):
if self.config_dict_params in [None, "TPOT light", "TPOT MDR"]:
raise ValueError(
'Not all operators in {} supports sparse matrix. '
'Please use \"TPOT sparse\" for sparse matrix.'.format(self.config_dict_params)
)
elif self.config_dict_params != "TPOT sparse":
print(
'Warning: Since the input matrix is a sparse matrix, please makes sure all the operators in the '
'customized config dictionary supports sparse matriies.'
)
else:
if np.any(np.isnan(features)):
self._imputed = True
features = self._impute_values(features)

self._check_dataset(features, target)
features, target = self._check_dataset(features, target)

# Randomly collect a subsample of training samples for pipeline optimization process.
if self.subsample < 1.0:
Expand Down Expand Up @@ -1024,8 +1003,28 @@ def _check_dataset(self, features, target):
-------
None
"""
# Resets the imputer to be fit for the new dataset
self._fitted_imputer = None
self._imputed = False
# If features is a sparse matrix, do not apply imputation
if sparse.issparse(features):
if self.config_dict_params in [None, "TPOT light", "TPOT MDR"]:
raise ValueError(
'Not all operators in {} supports sparse matrix. '
'Please use \"TPOT sparse\" for sparse matrix.'.format(self.config_dict_params)
)
elif self.config_dict_params != "TPOT sparse":
print(
'Warning: Since the input matrix is a sparse matrix, please makes sure all the operators in the '
'customized config dictionary supports sparse matriies.'
)
else:
if np.any(np.isnan(features)):
self._imputed = True
features = self._impute_values(features)
try:
check_X_y(features, target, accept_sparse=True)
X, y = check_X_y(features, target, accept_sparse=True, dtype=np.float64)
return X, y
except (AssertionError, ValueError):
raise ValueError(
'Error: Input data is not in a valid format. Please confirm '
Expand All @@ -1034,6 +1033,7 @@ def _check_dataset(self, features, target):
'1-D array.'
)


def _compile_to_sklearn(self, expr):
"""Compile a DEAP pipeline into a sklearn pipeline.

Expand Down
5 changes: 3 additions & 2 deletions tpot/driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -499,10 +499,11 @@ def tpot_driver(args):
_print_args(args)

input_data = _read_data_file(args)
features = input_data.drop(args.TARGET_NAME, axis=1).values
features = input_data.drop(args.TARGET_NAME, axis=1)

training_features, testing_features, training_target, testing_target = \
train_test_split(features, input_data[args.TARGET_NAME].values, random_state=args.RANDOM_STATE)
train_test_split(features, input_data[args.TARGET_NAME], random_state=args.RANDOM_STATE)


tpot_type = TPOTClassifier if args.TPOT_MODE == 'classification' else TPOTRegressor

Expand Down