Skip to content

Add feature_indices_ attribute to mapper #56

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,17 @@ Now that the transformation is trained, we confirm that it works on new data::
>>> np.round(mapper.transform(sample), 2)
array([[ 1. , 0. , 0. , 1.04]])

After transformation, the ``feature_indices_`` attribute of the mapper
indicates which columns of the resulting output array correspond to which
input features. Input feature ``i`` is mapped to the output columns from
``feature_indices_[i]`` (inclusive) to ``feature_indices_[i+1]`` (exclusive).
For example:

>>> mapper.feature_indices_[0], mapper.feature_indices_[1] # pet
(0, 3)
>>> mapper.feature_indices_[1], mapper.feature_indices_[2] # children
(3, 4)

Transform Multiple Columns
**************************

Expand Down Expand Up @@ -195,6 +206,8 @@ Development

* Deprecate custom cross-validation shim classes.
* Require ``scikit-learn>=0.15.0``. Resolves #49.
* Add ``feature_indices_`` attribute indicating the mapping between input and
output variables.


1.1.0 (2015-12-06)
Expand Down
45 changes: 31 additions & 14 deletions sklearn_pandas/dataframe_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,21 +29,32 @@ def _build_transformer(transformers):

class DataFrameMapper(BaseEstimator, TransformerMixin):
"""
Map Pandas data frame column subsets to their own
sklearn transformation.
Map pandas DataFrame column subsets via sklearn transforms to feature
arrays.

Parameters
----------
features : list of tuples of the form (column_selector, transform)
A column selector may be a string (for selecting a single column
as a 1-d array) or a list of strings (for selecting one or more
columns as a 2-d array).
A transform is an object which supports sklearn's transform
interface, or a list of such objects.

sparse : bool, optional (default=False)
Return a sparse matrix if set True and any of the extracted
features are sparse.

Attributes
----------
feature_indices_ : array of shape (len(self.features) + 1,)
Indices of self.features in the extracted array.
Feature ``i`` in self.features is mapped to features from
``feature_indices_[i]`` to ``feature_indices_[i+1]`` in transformed
output.
"""

def __init__(self, features, sparse=False):
"""
Params:

features a list of pairs. The first element is the pandas column
selector. This can be a string (for one column) or a list
of strings. The second element is an object that supports
sklearn's transform interface, or a list of such objects.
sparse will return sparse matrix if set True and any of the
extracted features is sparse. Defaults to False.
"""
if isinstance(features, list):
features = [(columns, _build_transformer(transformers))
for (columns, transformers) in features]
Expand Down Expand Up @@ -104,14 +115,20 @@ def transform(self, X):
X the data to transform
"""
extracted = []
self.feature_indices_ = [0]

for columns, transformers in self.features:
# columns could be a string or list of
# strings; we don't care because pandas
# will handle either.
Xt = self._get_col_subset(X, columns)
if transformers is not None:
Xt = transformers.transform(Xt)
extracted.append(_handle_feature(Xt))

feature = _handle_feature(Xt)
extracted.append(feature)
self.feature_indices_.append(self.feature_indices_[-1] +
feature.shape[1])

# combine the feature outputs into one array.
# at this point we lose track of which features
Expand All @@ -120,7 +137,7 @@ def transform(self, X):

# If any of the extracted features is sparse, combine sparsely.
# Otherwise, combine as normal arrays.
if any(sparse.issparse(fea) for fea in extracted):
if any(sparse.issparse(feature) for feature in extracted):
stacked = sparse.hstack(extracted).tocsr()
# return a sparse matrix only if the mapper was initialized
# with sparse=True
Expand Down
30 changes: 29 additions & 1 deletion tests/test_dataframe_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.preprocessing import Imputer, StandardScaler, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
from numpy.testing import assert_array_equal
Expand Down Expand Up @@ -147,6 +147,34 @@ def test_handle_feature_1dim():
assert_array_equal(_handle_feature(array), np.array([[1], [2]]))


def test_feature_indices_dense():
    """
    Check that ``feature_indices_`` delimits each feature's columns in a
    dense transformed array.

    Column 'a' is one-hot encoded and column 'b' is passed through
    untouched; each slice of the output bounded by consecutive entries of
    ``feature_indices_`` must equal the corresponding stand-alone result.
    """
    frame = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 2, 7]})
    feature_defs = [(['a'], OneHotEncoder()), ('b', None)]
    mapper = DataFrameMapper(feature_defs)
    output = mapper.fit_transform(frame)

    bounds = mapper.feature_indices_
    # one boundary per feature, plus the leading start offset
    assert len(bounds) == len(mapper.features) + 1

    # columns produced from 'a' must match a dense one-hot encoding of 'a'
    encoded_a = output[:, bounds[0]:bounds[1]]
    expected_a = OneHotEncoder(sparse=False).fit_transform(frame[['a']])
    assert (encoded_a == expected_a).all()

    # columns produced from 'b' must be the raw 'b' values
    passthrough_b = output[:, bounds[1]:bounds[2]]
    expected_b = frame[['b']].values
    assert (passthrough_b == expected_b).all()


def test_feature_indices_sparse(simple_dataframe):
    """
    Check that ``feature_indices_`` delimits each feature's columns when
    the mapper is built with ``sparse=True``.

    ``simple_dataframe`` is supplied by the caller (presumably a pytest
    fixture defined elsewhere in this module -- confirm); only its 'a'
    column is used here.
    """
    feature_defs = [(['a'], OneHotEncoder())]
    mapper = DataFrameMapper(feature_defs, sparse=True)
    output = mapper.fit_transform(simple_dataframe)

    bounds = mapper.feature_indices_
    # one boundary per feature, plus the leading start offset
    assert len(bounds) == len(mapper.features) + 1

    # compare equality by checking that the difference between the mapper
    # output slice and a stand-alone encoding has no stored non-zero entries
    encoded_a = output[:, bounds[0]:bounds[1]]
    expected_a = OneHotEncoder().fit_transform(simple_dataframe[['a']])
    difference = encoded_a - expected_a
    assert difference.nnz == 0


def test_build_transformers():
"""
When a list of transformers is passed, return a pipeline with
Expand Down