Skip to content

Commit

Permalink
[dask] prediction with categorical data. (#7708)
Browse files: browse the repository at this point in the history
  • Loading branch information
trivialfis authored Mar 9, 2022
1 parent 68b6d6b commit a62a3d9
Show file tree
Hide file tree
Showing 4 changed files with 40 additions and 19 deletions.
30 changes: 19 additions & 11 deletions python-package/xgboost/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,22 @@ def _check_call(ret: int) -> None:
raise XGBoostError(py_str(_LIB.XGBGetLastError()))


def _has_categorical(booster: "Booster", data: Any) -> bool:
    """Tell whether categorical handling should be enabled for a prediction input.

    Parameters
    ----------
    booster :
        Model whose ``feature_types`` attribute is inspected.
    data :
        Prediction input.

    Returns
    -------
    ``True`` only when ``data`` is recognised by ``_is_pandas_df`` or
    ``_is_cudf_df`` *and* ``booster.feature_types`` contains at least one ``"c"``
    entry.  Any other input kind, or a booster without feature types, yields
    ``False``.
    """
    from .data import _is_pandas_df, _is_cudf_df

    # Only data frame inputs are considered; everything else is rejected before
    # the booster is consulted at all.
    is_dataframe = _is_pandas_df(data) or _is_cudf_df(data)
    if not is_dataframe:
        return False

    feature_types = booster.feature_types
    if feature_types is None:
        return False

    return any(kind == "c" for kind in feature_types)


def build_info() -> dict:
"""Build information of XGBoost. The returned value format is not stable. Also, please
note that build time dependency is not the same as runtime dependency. For instance,
Expand Down Expand Up @@ -2046,17 +2062,9 @@ def inplace_predict(
f"got {data.shape[1]}"
)

from .data import _is_pandas_df, _transform_pandas_df
from .data import _is_pandas_df, _transform_pandas_df, _is_cudf_df
from .data import _array_interface
if (
_is_pandas_df(data)
or lazy_isinstance(data, "cudf.core.dataframe", "DataFrame")
):
ft = self.feature_types
if ft is None:
enable_categorical = False
else:
enable_categorical = any(f == "c" for f in ft)
enable_categorical = _has_categorical(self, data)
if _is_pandas_df(data):
data, _, _ = _transform_pandas_df(data, enable_categorical)

Expand Down Expand Up @@ -2111,7 +2119,7 @@ def inplace_predict(
)
)
return _prediction_output(shape, dims, preds, True)
if lazy_isinstance(data, "cudf.core.dataframe", "DataFrame"):
if _is_cudf_df(data):
from .data import _cudf_array_interfaces, _transform_cudf_df
data, cat_codes, _, _ = _transform_cudf_df(
data, None, None, enable_categorical
Expand Down
8 changes: 6 additions & 2 deletions python-package/xgboost/dask.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@

from .core import DMatrix, DeviceQuantileDMatrix, Booster, _expect, DataIter
from .core import Objective, Metric
from .core import _deprecate_positional_args
from .core import _deprecate_positional_args, _has_categorical
from .data import FeatNamesT
from .training import train as worker_train
from .tracker import RabitTracker, get_host_ip
Expand Down Expand Up @@ -1241,7 +1241,11 @@ def mapped_predict(
booster: Booster, partition: Any, is_df: bool, columns: List[int], _: Any
) -> Any:
with config.config_context(**global_config):
m = DMatrix(data=partition, missing=missing)
m = DMatrix(
data=partition,
missing=missing,
enable_categorical=_has_categorical(booster, partition)
)
predt = booster.predict(
data=m,
output_margin=output_margin,
Expand Down
8 changes: 2 additions & 6 deletions python-package/xgboost/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -466,12 +466,8 @@ def _from_dt_df(
return handle, feature_names, feature_types


def _is_cudf_df(data):
try:
import cudf
except ImportError:
return False
return hasattr(cudf, 'DataFrame') and isinstance(data, cudf.DataFrame)
def _is_cudf_df(data) -> bool:
    """Return whether ``data`` is a ``DataFrame`` from ``cudf.core.dataframe``.

    The check is delegated to ``lazy_isinstance``, which receives the module
    path and the class name as strings.
    """
    module_path = "cudf.core.dataframe"
    class_name = "DataFrame"
    return lazy_isinstance(data, module_path, class_name)


def _cudf_array_interfaces(data, cat_codes: list) -> bytes:
Expand Down
13 changes: 13 additions & 0 deletions tests/python/test_with_dask.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,10 +288,23 @@ def check_model_output(model: xgb.dask.Booster) -> None:
reg.fit(X, y, eval_set=[(X, y)])
assert tm.non_increasing(reg.evals_result()["validation_0"]["rmse"])

booster = reg.get_booster()
predt = xgb.dask.predict(client, booster, X).compute().values
inpredt = xgb.dask.inplace_predict(client, booster, X).compute().values

if hasattr(predt, "get"):
predt = predt.get()
if hasattr(inpredt, "get"):
inpredt = inpredt.get()

np.testing.assert_allclose(predt, inpredt)


def test_categorical(client: "Client") -> None:
    """Call ``run_categorical`` once with ``"approx"`` and once with ``"hist"``.

    Both runs reuse the same inputs, which are built up front by two calls to
    ``make_categorical``.
    """
    common_args = (10000, 30, 13)
    X, y = make_categorical(client, *common_args)
    # Same arguments plus a trailing ``True``; presumably this requests the
    # one-hot encoded variant -- confirm against ``make_categorical``.
    X_onehot, _ = make_categorical(client, *common_args, True)

    for method in ("approx", "hist"):
        run_categorical(client, method, X, X_onehot, y)


def test_dask_predict_shape_infer(client: "Client") -> None:
Expand Down

0 comments on commit a62a3d9

Please sign in to comment.