scikit-learn-contrib · dukebody · Mar 20, 2016 · Mar 20, 2016 · Mar 20, 2016
diff --git a/README.rst b/README.rst
@@ -102,6 +102,17 @@ Now that the transformation is trained, we confirm that it works on new data::
     >>> np.round(mapper.transform(sample), 2)
     array([[ 1.  ,  0.  ,  0.  ,  1.04]])
 
+After transformation, the ``feature_indices_`` attribute of the mapper
+indicates which columns of the resulting output array correspond to which
+input features. Input feature ``i`` is mapped to features from
+``feature_indices_[i]`` to ``feature_indices_[i+1]`` in transformed output.
+For example::
+
+    >>> mapper.feature_indices_[0], mapper.feature_indices_[1] # pet
+    (0, 3)
+    >>> mapper.feature_indices_[1], mapper.feature_indices_[2]  # children
+    (3, 4)
+
 Transform Multiple Columns
 **************************
 
@@ -195,6 +206,8 @@ Development
 
 * Deprecate custom cross-validation shim classes.
 * Require ``scikit-learn>=0.15.0``. Resolves #49.
+* Add ``feature_indices_`` attribute indicating the mapping between input and
+  output variables.
 
 
 1.1.0 (2015-12-06)

diff --git a/sklearn_pandas/dataframe_mapper.py b/sklearn_pandas/dataframe_mapper.py
@@ -29,21 +29,32 @@ def _build_transformer(transformers):
 
 class DataFrameMapper(BaseEstimator, TransformerMixin):
     """
-    Map Pandas data frame column subsets to their own
-    sklearn transformation.
+    Map pandas DataFrame column subsets via sklearn transforms to feature
+    arrays.
+
+    Parameters
+    ----------
+        features : list of tuples of the form (column_selector, transform)
+            A column selector may be a string (for selecting a single column
+            as a 1-d array) or a list of strings (for selecting one or more
+            columns as a 2-d array).
+            A transform is an object which supports sklearn's transform
+            interface, or a list of such objects.
+
+        sparse : bool, optional (default=False)
+            Return a sparse matrix if set True and any of the extracted
+            features are sparse.
+
+    Attributes
+    ----------
+        feature_indices_ : list of int, of length len(self.features) + 1
+            Indices of self.features in the extracted array.
+            Feature ``i`` in self.features is mapped to features from
+            ``feature_indices_[i]`` to ``feature_indices_[i+1]`` in transformed
+            output.
     """
 
     def __init__(self, features, sparse=False):
-        """
-        Params:
-
-        features    a list of pairs. The first element is the pandas column
-                    selector. This can be a string (for one column) or a list
-                    of strings. The second element is an object that supports
-                    sklearn's transform interface, or a list of such objects.
-        sparse      will return sparse matrix if set True and any of the
-                    extracted features is sparse. Defaults to False.
-        """
         if isinstance(features, list):
             features = [(columns, _build_transformer(transformers))
                         for (columns, transformers) in features]
@@ -104,14 +115,20 @@ def transform(self, X):
         X       the data to transform
         """
         extracted = []
+        self.feature_indices_ = [0]
+
         for columns, transformers in self.features:
             # columns could be a string or list of
             # strings; we don't care because pandas
             # will handle either.
             Xt = self._get_col_subset(X, columns)
             if transformers is not None:
                 Xt = transformers.transform(Xt)
-            extracted.append(_handle_feature(Xt))
+
+            feature = _handle_feature(Xt)
+            extracted.append(feature)
+            self.feature_indices_.append(self.feature_indices_[-1] +
+                                         feature.shape[1])
 
         # combine the feature outputs into one array.
         # at this point we lose track of which features
@@ -120,7 +137,7 @@ def transform(self, X):
 
         # If any of the extracted features is sparse, combine sparsely.
         # Otherwise, combine as normal arrays.
-        if any(sparse.issparse(fea) for fea in extracted):
+        if any(sparse.issparse(feature) for feature in extracted):
             stacked = sparse.hstack(extracted).tocsr()
             # return a sparse matrix only if the mapper was initialized
             # with sparse=True

diff --git a/tests/test_dataframe_mapper.py b/tests/test_dataframe_mapper.py
@@ -17,7 +17,7 @@
 from sklearn.pipeline import Pipeline
 from sklearn.svm import SVC
 from sklearn.feature_extraction.text import CountVectorizer
-from sklearn.preprocessing import Imputer, StandardScaler
+from sklearn.preprocessing import Imputer, StandardScaler, OneHotEncoder
 from sklearn.base import BaseEstimator, TransformerMixin
 import numpy as np
 from numpy.testing import assert_array_equal
@@ -147,6 +147,34 @@ def test_handle_feature_1dim():
     assert_array_equal(_handle_feature(array), np.array([[1], [2]]))
 
 
+def test_feature_indices_dense():
+    df = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 2, 7]})
+    mapper = DataFrameMapper([
+        (['a'], OneHotEncoder()),
+        ('b', None)
+    ])
+    transformed = mapper.fit_transform(df)
+
+    indices = mapper.feature_indices_
+    assert len(indices) == len(mapper.features) + 1
+    assert (transformed[:, indices[0]:indices[1]] ==
+            OneHotEncoder(sparse=False).fit_transform(df[['a']])).all()
+    assert (transformed[:, indices[1]:indices[2]] == df[['b']].values).all()
+
+
+def test_feature_indices_sparse(simple_dataframe):
+    mapper = DataFrameMapper([
+        (['a'], OneHotEncoder())
+    ], sparse=True)
+    transformed = mapper.fit_transform(simple_dataframe)
+
+    indices = mapper.feature_indices_
+    assert len(indices) == len(mapper.features) + 1
+    # compare equality by checking that all elements in the difference are 0
+    assert (transformed[:, indices[0]:indices[1]] -
+            OneHotEncoder().fit_transform(simple_dataframe[['a']])).nnz == 0
+
+
 def test_build_transformers():
     """
     When a list of transformers is passed, return a pipeline with