Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[dask] prediction with categorical data. #7708

Merged
merged 5 commits into from
Mar 9, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 19 additions & 11 deletions python-package/xgboost/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,22 @@ def _check_call(ret: int) -> None:
raise XGBoostError(py_str(_LIB.XGBGetLastError()))


def _has_categorical(booster: "Booster", data: Any) -> bool:
    """Decide whether categorical handling should be enabled for prediction.

    The answer is ``True`` only when both conditions hold:

    * ``data`` is recognised as a pandas or cuDF dataframe, and
    * ``booster.feature_types`` is set and at least one entry equals ``"c"``.

    Parameters
    ----------
    booster :
        Model whose ``feature_types`` attribute is inspected.  It is only
        read when ``data`` is a dataframe.
    data :
        Prediction input of any supported type.

    Returns
    -------
    bool
        ``True`` if the dataframe input should be treated as containing
        categorical features, ``False`` otherwise.

    """
    # Imported inside the function, as in the original code.
    from .data import _is_pandas_df, _is_cudf_df

    # Inputs that are not dataframes never enable categorical handling here.
    if not (_is_pandas_df(data) or _is_cudf_df(data)):
        return False

    feature_types = booster.feature_types
    if feature_types is None:
        return False

    return any(kind == "c" for kind in feature_types)


def build_info() -> dict:
"""Build information of XGBoost. The returned value format is not stable. Also, please
note that build time dependency is not the same as runtime dependency. For instance,
Expand Down Expand Up @@ -2046,17 +2062,9 @@ def inplace_predict(
f"got {data.shape[1]}"
)

from .data import _is_pandas_df, _transform_pandas_df
from .data import _is_pandas_df, _transform_pandas_df, _is_cudf_df
from .data import _array_interface
if (
_is_pandas_df(data)
or lazy_isinstance(data, "cudf.core.dataframe", "DataFrame")
):
ft = self.feature_types
if ft is None:
enable_categorical = False
else:
enable_categorical = any(f == "c" for f in ft)
enable_categorical = _has_categorical(self, data)
if _is_pandas_df(data):
data, _, _ = _transform_pandas_df(data, enable_categorical)

Expand Down Expand Up @@ -2111,7 +2119,7 @@ def inplace_predict(
)
)
return _prediction_output(shape, dims, preds, True)
if lazy_isinstance(data, "cudf.core.dataframe", "DataFrame"):
if _is_cudf_df(data):
from .data import _cudf_array_interfaces, _transform_cudf_df
data, cat_codes, _, _ = _transform_cudf_df(
data, None, None, enable_categorical
Expand Down
8 changes: 6 additions & 2 deletions python-package/xgboost/dask.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@

from .core import DMatrix, DeviceQuantileDMatrix, Booster, _expect, DataIter
from .core import Objective, Metric
from .core import _deprecate_positional_args
from .core import _deprecate_positional_args, _has_categorical
from .data import FeatNamesT
from .training import train as worker_train
from .tracker import RabitTracker, get_host_ip
Expand Down Expand Up @@ -1241,7 +1241,11 @@ def mapped_predict(
booster: Booster, partition: Any, is_df: bool, columns: List[int], _: Any
) -> Any:
with config.config_context(**global_config):
m = DMatrix(data=partition, missing=missing)
m = DMatrix(
data=partition,
missing=missing,
enable_categorical=_has_categorical(booster, partition)
)
predt = booster.predict(
data=m,
output_margin=output_margin,
Expand Down
8 changes: 2 additions & 6 deletions python-package/xgboost/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -466,12 +466,8 @@ def _from_dt_df(
return handle, feature_names, feature_types


def _is_cudf_df(data):
try:
import cudf
except ImportError:
return False
return hasattr(cudf, 'DataFrame') and isinstance(data, cudf.DataFrame)
def _is_cudf_df(data) -> bool:
    """Return whether ``data`` is a cuDF dataframe.

    The check is delegated to ``lazy_isinstance`` using the module path
    ``cudf.core.dataframe`` and the class name ``DataFrame``.
    """
    module_path, class_name = "cudf.core.dataframe", "DataFrame"
    return lazy_isinstance(data, module_path, class_name)


def _cudf_array_interfaces(data, cat_codes: list) -> bytes:
Expand Down
13 changes: 13 additions & 0 deletions tests/python/test_with_dask.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,10 +288,23 @@ def check_model_output(model: xgb.dask.Booster) -> None:
reg.fit(X, y, eval_set=[(X, y)])
assert tm.non_increasing(reg.evals_result()["validation_0"]["rmse"])

booster = reg.get_booster()
predt = xgb.dask.predict(client, booster, X).compute().values
inpredt = xgb.dask.inplace_predict(client, booster, X).compute().values

if hasattr(predt, "get"):
predt = predt.get()
if hasattr(inpredt, "get"):
inpredt = inpredt.get()

np.testing.assert_allclose(predt, inpredt)


def test_categorical(client: "Client") -> None:
    """Run the categorical checks with the ``approx`` and ``hist`` tree methods."""
    X, y = make_categorical(client, 10000, 30, 13)
    # Same generator call with a trailing ``True``; presumably the one-hot
    # encoded counterpart of ``X`` (verify against ``make_categorical``).
    X_onehot, _ = make_categorical(client, 10000, 30, 13, True)

    for tree_method in ("approx", "hist"):
        run_categorical(client, tree_method, X, X_onehot, y)


def test_dask_predict_shape_infer(client: "Client") -> None:
Expand Down