diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 856da41c156d..e27876ef9169 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -246,12 +246,13 @@ def _has_cuda_array_interface(data): def _maybe_pandas_data(data, feature_names, feature_types, meta=None, meta_type=None): """Extract internal data from pd.DataFrame for DMatrix data""" - if not (PANDAS_INSTALLED and isinstance(data, DataFrame)): return data, feature_names, feature_types + from pandas.api.types import is_sparse data_dtypes = data.dtypes - if not all(dtype.name in PANDAS_DTYPE_MAPPER for dtype in data_dtypes): + if not all(dtype.name in PANDAS_DTYPE_MAPPER or is_sparse(dtype) + for dtype in data_dtypes): bad_fields = [ str(data.columns[i]) for i, dtype in enumerate(data_dtypes) if dtype.name not in PANDAS_DTYPE_MAPPER @@ -272,9 +273,12 @@ def _maybe_pandas_data(data, feature_names, feature_types, feature_names = data.columns.format() if feature_types is None and meta is None: - feature_types = [ - PANDAS_DTYPE_MAPPER[dtype.name] for dtype in data_dtypes - ] + feature_types = [] + for dtype in data_dtypes: + if is_sparse(dtype): + feature_types.append(PANDAS_DTYPE_MAPPER[dtype.subtype.name]) + else: + feature_types.append(PANDAS_DTYPE_MAPPER[dtype.name]) if meta and len(data.columns) > 1: raise ValueError( diff --git a/tests/python/test_with_pandas.py b/tests/python/test_with_pandas.py index 4da606269e1c..4d20ee72ca42 100644 --- a/tests/python/test_with_pandas.py +++ b/tests/python/test_with_pandas.py @@ -109,6 +109,22 @@ def test_pandas(self): assert dm.num_row() == 2 assert dm.num_col() == 6 + def test_pandas_sparse(self): + import pandas as pd + rows = 100 + X = pd.DataFrame( + {"A": pd.SparseArray(np.random.randint(0, 10, size=rows)), + "B": pd.SparseArray(np.random.randn(rows)), + "C": pd.SparseArray(np.random.permutation( + [True, False] * (rows // 2)))} + ) + y = pd.Series(pd.SparseArray(np.random.randn(rows))) + dtrain = xgb.DMatrix(X, y) + booster = xgb.train({}, dtrain, num_boost_round=4) + predt_sparse = booster.predict(xgb.DMatrix(X)) + predt_dense = booster.predict(xgb.DMatrix(X.sparse.to_dense())) + np.testing.assert_allclose(predt_sparse, predt_dense) + def test_pandas_label(self): # label must be a single column df = pd.DataFrame({'A': ['X', 'Y', 'Z'], 'B': [1, 2, 3]})