Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support pandas SparseArray. #5431

Merged
merged 2 commits into from
Mar 20, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 9 additions & 5 deletions python-package/xgboost/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,12 +246,13 @@ def _has_cuda_array_interface(data):
def _maybe_pandas_data(data, feature_names, feature_types,
meta=None, meta_type=None):
"""Extract internal data from pd.DataFrame for DMatrix data"""

if not (PANDAS_INSTALLED and isinstance(data, DataFrame)):
return data, feature_names, feature_types
from pandas.api.types import is_sparse

data_dtypes = data.dtypes
if not all(dtype.name in PANDAS_DTYPE_MAPPER for dtype in data_dtypes):
if not all(dtype.name in PANDAS_DTYPE_MAPPER or is_sparse(dtype)
for dtype in data_dtypes):
bad_fields = [
str(data.columns[i]) for i, dtype in enumerate(data_dtypes)
if dtype.name not in PANDAS_DTYPE_MAPPER
Expand All @@ -272,9 +273,12 @@ def _maybe_pandas_data(data, feature_names, feature_types,
feature_names = data.columns.format()

if feature_types is None and meta is None:
feature_types = [
PANDAS_DTYPE_MAPPER[dtype.name] for dtype in data_dtypes
]
feature_types = []
for dtype in data_dtypes:
if is_sparse(dtype):
feature_types.append(PANDAS_DTYPE_MAPPER[dtype.subtype.name])
else:
feature_types.append(PANDAS_DTYPE_MAPPER[dtype.name])

if meta and len(data.columns) > 1:
raise ValueError(
Expand Down
16 changes: 16 additions & 0 deletions tests/python/test_with_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,22 @@ def test_pandas(self):
assert dm.num_row() == 2
assert dm.num_col() == 6

def test_pandas_sparse(self):
import pandas as pd
rows = 100
X = pd.DataFrame(
{"A": pd.SparseArray(np.random.randint(0, 10, size=rows)),
"B": pd.SparseArray(np.random.randn(rows)),
"C": pd.SparseArray(np.random.permutation(
[True, False] * (rows // 2)))}
)
y = pd.Series(pd.SparseArray(np.random.randn(rows)))
dtrain = xgb.DMatrix(X, y)
booster = xgb.train({}, dtrain, num_boost_round=4)
predt_sparse = booster.predict(xgb.DMatrix(X))
predt_dense = booster.predict(xgb.DMatrix(X.sparse.to_dense()))
np.testing.assert_allclose(predt_sparse, predt_dense)

def test_pandas_label(self):
# label must be a single column
df = pd.DataFrame({'A': ['X', 'Y', 'Z'], 'B': [1, 2, 3]})
Expand Down