1 change: 1 addition & 0 deletions ops/script/lint_python.py
@@ -108,6 +108,7 @@ class LintersPaths:
"tests/python/test_interaction_constraints.py",
"tests/python-gpu/test_gpu_callbacks.py",
"tests/python-gpu/test_gpu_data_iterator.py",
"tests/python-gpu/test_gpu_ordinal.py",
"tests/python-gpu/load_pickle.py",
"tests/python-gpu/test_gpu_training_continuation.py",
"tests/python-gpu/test_gpu_plotting.py",
4 changes: 3 additions & 1 deletion python-package/xgboost/callback.py
@@ -15,6 +15,7 @@
Optional,
Sequence,
Tuple,
TypeAlias,
TypeVar,
Union,
cast,
@@ -54,7 +55,8 @@ class TrainingCallback(ABC):

"""

EvalsLog = Dict[str, Dict[str, _ScoreList]] # pylint: disable=invalid-name
# pylint: disable=invalid-name
EvalsLog: TypeAlias = Dict[str, Dict[str, _ScoreList]]

def __init__(self) -> None:
pass
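A minimal sketch (not taken from this PR) of how the `EvalsLog` alias is typically consumed in a custom callback; the callback class name and the printed format are assumptions for illustration:

import xgboost as xgb


class PrintLastScore(xgb.callback.TrainingCallback):
    """Print the newest score of every dataset/metric pair after each round."""

    def after_iteration(
        self,
        model: xgb.Booster,
        epoch: int,
        evals_log: "xgb.callback.TrainingCallback.EvalsLog",
    ) -> bool:
        # EvalsLog maps dataset name -> metric name -> list of recorded scores,
        # e.g. {"Train": {"rmse": [0.9, 0.7, ...]}}.
        for data, metrics in evals_log.items():
            for metric, scores in metrics.items():
                print(f"[{epoch}] {data}-{metric}: {scores[-1]}")
        return False  # returning True requests early stopping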
4 changes: 2 additions & 2 deletions python-package/xgboost/core.py
@@ -1289,8 +1289,8 @@ def get_quantile_cut(self) -> Tuple[np.ndarray, np.ndarray]:
return indptr, data

def get_categories(self) -> Optional[Dict[str, "pa.DictionaryArray"]]:
"""Get the categories in the dataset. Return `None` if there's no categorical
features.
"""Get the categories in the dataset using `pyarrow`. Returns `None` if there's
no categorical features.

.. warning::

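A rough usage sketch for `get_categories`, under the assumption that the returned mapping is keyed by feature name and that `pyarrow` is installed; the DataFrame contents are illustrative:

import pandas as pd
import xgboost as xgb

df = pd.DataFrame(
    {
        "f0": [0.5, 1.5, 2.5],
        "f1": pd.Categorical(["a", "b", "a"], categories=["a", "b", "c"]),
    }
)
Xy = xgb.DMatrix(df, enable_categorical=True)

cats = Xy.get_categories()  # None when the data has no categorical features
if cats is not None:
    # Categories are exposed as pyarrow dictionary arrays.
    print(cats["f1"])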
130 changes: 86 additions & 44 deletions python-package/xgboost/testing/ordinal.py
@@ -9,6 +9,7 @@
import numpy as np
import pytest

from ..callback import TrainingCallback
from ..compat import import_cupy
from ..core import DMatrix, ExtMemQuantileDMatrix, QuantileDMatrix
from ..data import _lazy_load_cudf_is_cat
@@ -429,16 +430,70 @@ def run_cat_leaf(device: Literal["cpu", "cuda"]) -> None:
)


# pylint: disable=too-many-locals
def make_recoded(device: Literal["cpu", "cuda"]) -> Tuple:
"""Synthesize a test dataset with changed encoding."""
Df, _ = get_df_impl(device)

import pandas as pd

# Test a large number of columns. XGBoost has specializations for slim datasets;
# make sure we cover all the cases.
n_features = 4096
n_samples = 1024

# Old and new share the same categories, with 0 ("a") and 1 ("b") swapping positions.
old_cats = ["a", "b", "c", "d"]
new_cats = ["b", "a", "c", "d"]
mapping = {0: 1, 1: 0}

rng = np.random.default_rng(2025)

col_numeric = rng.uniform(0, 1, size=(n_samples, n_features // 2))
col_categorical = rng.integers(
low=0, high=4, size=(n_samples, n_features // 2), dtype=np.int32
)

df = {} # avoid fragmentation warning from pandas
for c in range(n_features):
if c % 2 == 0:
col = col_numeric[:, c // 2]
else:
codes = col_categorical[:, c // 2]
col = pd.Categorical.from_codes(
categories=old_cats,
codes=codes,
)
df[f"f{c}"] = col

enc = Df(df)
y = rng.normal(size=n_samples)

reenc = enc.copy()
for c in range(n_features):
if c % 2 == 0:
continue

name = f"f{c}"
codes_ser = reenc[name].cat.codes
if hasattr(codes_ser, "to_pandas"): # cudf
codes_ser = codes_ser.to_pandas()
new_codes = codes_ser.replace(mapping)
reenc[name] = pd.Categorical.from_codes(categories=new_cats, codes=new_codes)
reenc = Df(reenc)
assert (reenc.iloc[:, 1].cat.codes != enc.iloc[:, 1].cat.codes).any()
return enc, reenc, y, col_numeric, col_categorical
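# A standalone illustration (not used by the tests) of the "changed encoding"
# this helper produces: both frames hold the same category values, but the
# physical integer codes differ because "a" and "b" swap positions.
#
#   import pandas as pd
#   old = pd.Categorical.from_codes(codes=[0, 1, 2], categories=["a", "b", "c", "d"])
#   new = pd.Categorical.from_codes(codes=[1, 0, 2], categories=["b", "a", "c", "d"])
#   assert list(old) == list(new)              # same values: ["a", "b", "c"]
#   assert list(old.codes) != list(new.codes)  # different integer codes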


def run_specified_cat( # pylint: disable=too-many-locals
device: Literal["cpu", "cuda"],
) -> None:
"""Run with manually specified category encoding."""
import pandas as pd

# Same between old and new, wiht 0 ("a") and 1 ("b") exchanged their position.
# Old and new share the same categories, with 0 ("a") and 1 ("b") swapping positions.
old_cats = ["a", "b", "c", "d"]
new_cats = ["b", "a", "c", "d"]
mapping = {0: 1, 1: 0}

col0 = np.arange(0, 9)
col1 = pd.Categorical.from_codes(
@@ -468,57 +523,23 @@ def run_specified_cat( # pylint: disable=too-many-locals
predt2 = booster.inplace_predict(df1)
assert_allclose(device, predt0, predt2)

# Test large column numbers. XGBoost makes some specializations for slim datasets,
# make sure we cover all the cases.
n_features = 4096
n_samples = 1024

col_numeric = rng.uniform(0, 1, size=(n_samples, n_features // 2))
col_categorical = rng.integers(
low=0, high=4, size=(n_samples, n_features // 2), dtype=np.int32
)

df = {} # avoid fragmentation warning from pandas
for c in range(n_features):
if c % 2 == 0:
col = col_numeric[:, c // 2]
else:
codes = col_categorical[:, c // 2]
col = pd.Categorical.from_codes(
categories=old_cats,
codes=codes,
)
df[f"f{c}"] = col
enc, reenc, y, col_numeric, col_categorical = make_recoded(device)

df = Df(df)
y = rng.normal(size=n_samples)

Xy = DMatrix(df, y, enable_categorical=True)
Xy = DMatrix(enc, y, enable_categorical=True)
booster = train({"device": device}, Xy)

predt0 = booster.predict(Xy)
predt1 = booster.inplace_predict(df)
predt1 = booster.inplace_predict(enc)
assert_allclose(device, predt0, predt1)

for c in range(n_features):
if c % 2 == 0:
continue

name = f"f{c}"
codes_ser = df[name].cat.codes
if hasattr(codes_ser, "to_pandas"): # cudf
codes_ser = codes_ser.to_pandas()
new_codes = codes_ser.replace(mapping)
df[name] = pd.Categorical.from_codes(categories=new_cats, codes=new_codes)

df = Df(df)
Xy = DMatrix(df, y, enable_categorical=True)
Xy = DMatrix(reenc, y, enable_categorical=True)
predt2 = booster.predict(Xy)
assert_allclose(device, predt0, predt2)

array = np.empty(shape=(n_samples, n_features))
array[:, np.arange(0, n_features) % 2 == 0] = col_numeric
array[:, np.arange(0, n_features) % 2 != 0] = col_categorical
array = np.empty(shape=(reenc.shape[0], reenc.shape[1]))

array[:, enc.dtypes == "category"] = col_categorical
array[:, enc.dtypes != "category"] = col_numeric

if device == "cuda":
import cupy as cp
@@ -527,3 +548,24 @@

predt3 = booster.inplace_predict(array)
assert_allclose(device, predt0, predt3)


def run_validation(device: Literal["cpu", "cuda"]) -> None:
"""CHeck the validation dataset is using the correct encoding."""
enc, reenc, y, _, _ = make_recoded(device)

Xy = DMatrix(enc, y, enable_categorical=True)
Xy_valid = DMatrix(reenc, y, enable_categorical=True)

evals_result: TrainingCallback.EvalsLog = {}
train(
{"device": device},
Xy,
evals=[(Xy, "Train"), (Xy_valid, "Valid")],
evals_result=evals_result,
)

# Evaluation dataset should have the exact same performance as the training dataset.
assert_allclose(
device, evals_result["Train"]["rmse"], evals_result["Valid"]["rmse"]
)
2 changes: 1 addition & 1 deletion src/data/adapter.h
@@ -525,7 +525,7 @@ template <typename CategoricalIndex, bool allow_mask>
/**
* @brief Adapter for columnar format (arrow).
*
* Supports for both numeric values and categorical values.
* Supports both numeric values and categorical values.
*/
class ColumnarAdapter : public detail::SingleBatchDataIter<ColumnarAdapterBatch> {
std::vector<ArrayInterface<1>> columns_;
9 changes: 7 additions & 2 deletions tests/python-gpu/test_gpu_ordinal.py
@@ -18,6 +18,7 @@
run_cat_shap,
run_cat_thread_safety,
run_specified_cat,
run_validation,
)

pytestmark = pytest.mark.skipif(**tm.no_multiple(tm.no_arrow(), tm.no_cudf()))
@@ -60,7 +61,7 @@ def test_mixed_devices() -> None:
n_features = 4
X, y = make_categorical(n_samples, n_features, 7, onehot=False, device="cpu")

def run_cpu_gpu(DMatrixT: Type):
def run_cpu_gpu(DMatrixT: Type) -> bool:
Xy = DMatrixT(X, y, enable_categorical=True)
booster = xgb.train({"tree_method": "hist", "device": "cuda"}, Xy)
predt0 = booster.inplace_predict(X)
@@ -83,7 +84,7 @@ def run_cpu_gpu(DMatrixT: Type):

X, y = make_categorical(n_samples, n_features, 7, onehot=False, device="cuda")

def run_gpu_cpu(DMatrixT: Type):
def run_gpu_cpu(DMatrixT: Type) -> bool:
Xy = DMatrixT(X, y, enable_categorical=True)
booster = xgb.train({"tree_method": "hist", "device": "cpu"}, Xy)
predt0 = booster.inplace_predict(X).get()
@@ -104,3 +105,7 @@ def run_gpu_cpu(DMatrixT: Type):

def test_spcified_cat() -> None:
run_specified_cat("cuda")


def test_validation() -> None:
run_validation("cuda")
5 changes: 5 additions & 0 deletions tests/python/test_ordinal.py
@@ -11,6 +11,7 @@
run_cat_shap,
run_cat_thread_safety,
run_specified_cat,
run_validation,
)

pytestmark = pytest.mark.skipif(**tm.no_multiple(tm.no_arrow(), tm.no_pandas()))
@@ -50,3 +51,7 @@ def test_cat_leaf() -> None:

def test_spcified_cat() -> None:
run_specified_cat("cpu")


def test_validation() -> None:
run_validation("cpu")