Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for pandas DataFrame as input in fit() #684

Merged
merged 2 commits into from
Apr 9, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions tests/tpot_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
from tpot.config.classifier_sparse import classifier_config_sparse

import numpy as np
import pandas as pd
from scipy import sparse
import inspect
import random
Expand Down Expand Up @@ -91,6 +92,16 @@ def closing(arg):
training_features_r, testing_features_r, training_target_r, testing_target_r = \
train_test_split(boston_data.data, boston_data.target, random_state=42)

# Set up pandas DataFrame for testing

input_data = pd.read_csv(
'tests/tests.csv',
sep=',',
dtype=np.float64,
)
pd_features = input_data.drop('class', axis=1)
pd_target = input_data['class']

# Set up the sparse matrix for testing
sparse_features = sparse.csr_matrix(training_features)
sparse_target = training_target
Expand Down Expand Up @@ -751,6 +762,23 @@ def test_fit_4():
assert not (tpot_obj._start_datetime is None)


def test_fit_5():
    """Assert that the TPOT fit function provides an optimized pipeline with pandas DataFrame"""
    # Keep the run tiny so the optimization loop finishes quickly.
    tpot_obj = TPOTClassifier(
        generations=1,
        population_size=1,
        offspring_size=2,
        random_state=42,
        verbosity=0,
    )

    tpot_obj.fit(pd_features, pd_target)

    # fit() must not mutate the caller's DataFrame into another type.
    assert isinstance(pd_features, pd.DataFrame)
    assert isinstance(tpot_obj._optimized_pipeline, creator.Individual)
    assert tpot_obj._start_datetime is not None


def test_memory():
"""Assert that the TPOT fit function runs normally with memory=\'auto\'."""
tpot_obj = TPOTClassifier(
Expand Down
46 changes: 23 additions & 23 deletions tpot/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -538,29 +538,8 @@ def fit(self, features, target, sample_weight=None, groups=None):
Returns a copy of the fitted TPOT object

"""
features = features.astype(np.float64)

# Resets the imputer to be fit for the new dataset
self._fitted_imputer = None
self._imputed = False
# If features is a sparse matrix, do not apply imputation
if sparse.issparse(features):
if self.config_dict_params in [None, "TPOT light", "TPOT MDR"]:
raise ValueError(
'Not all operators in {} supports sparse matrix. '
'Please use \"TPOT sparse\" for sparse matrix.'.format(self.config_dict_params)
)
elif self.config_dict_params != "TPOT sparse":
print(
'Warning: Since the input matrix is a sparse matrix, please makes sure all the operators in the '
'customized config dictionary supports sparse matriies.'
)
else:
if np.any(np.isnan(features)):
self._imputed = True
features = self._impute_values(features)

self._check_dataset(features, target)
features, target = self._check_dataset(features, target)

# Randomly collect a subsample of training samples for pipeline optimization process.
if self.subsample < 1.0:
Expand Down Expand Up @@ -1024,8 +1003,28 @@ def _check_dataset(self, features, target):
-------
None
"""
# Resets the imputer to be fit for the new dataset
self._fitted_imputer = None
self._imputed = False
# If features is a sparse matrix, do not apply imputation
if sparse.issparse(features):
if self.config_dict_params in [None, "TPOT light", "TPOT MDR"]:
raise ValueError(
'Not all operators in {} supports sparse matrix. '
'Please use \"TPOT sparse\" for sparse matrix.'.format(self.config_dict_params)
)
elif self.config_dict_params != "TPOT sparse":
print(
'Warning: Since the input matrix is a sparse matrix, please makes sure all the operators in the '
'customized config dictionary supports sparse matriies.'
)
else:
if np.any(np.isnan(features)):
self._imputed = True
features = self._impute_values(features)
try:
check_X_y(features, target, accept_sparse=True)
X, y = check_X_y(features, target, accept_sparse=True, dtype=np.float64)
return X, y
except (AssertionError, ValueError):
raise ValueError(
'Error: Input data is not in a valid format. Please confirm '
Expand All @@ -1034,6 +1033,7 @@ def _check_dataset(self, features, target):
'1-D array.'
)


def _compile_to_sklearn(self, expr):
"""Compile a DEAP pipeline into a sklearn pipeline.

Expand Down
5 changes: 3 additions & 2 deletions tpot/driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -499,10 +499,11 @@ def tpot_driver(args):
_print_args(args)

input_data = _read_data_file(args)
features = input_data.drop(args.TARGET_NAME, axis=1).values
features = input_data.drop(args.TARGET_NAME, axis=1)

training_features, testing_features, training_target, testing_target = \
train_test_split(features, input_data[args.TARGET_NAME].values, random_state=args.RANDOM_STATE)
train_test_split(features, input_data[args.TARGET_NAME], random_state=args.RANDOM_STATE)


tpot_type = TPOTClassifier if args.TPOT_MODE == 'classification' else TPOTRegressor

Expand Down