Skip to content

Commit 70e47c0

Browse files
authored
[enc] Add tests for re-coding validation datasets. (#11561)
1 parent ad29598 commit 70e47c0

File tree

7 files changed

+105
-50
lines changed

7 files changed

+105
-50
lines changed

ops/script/lint_python.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,7 @@ class LintersPaths:
108108
"tests/python/test_interaction_constraints.py",
109109
"tests/python-gpu/test_gpu_callbacks.py",
110110
"tests/python-gpu/test_gpu_data_iterator.py",
111+
"tests/python-gpu/test_gpu_ordinal.py",
111112
"tests/python-gpu/load_pickle.py",
112113
"tests/python-gpu/test_gpu_training_continuation.py",
113114
"tests/python-gpu/test_gpu_plotting.py",

python-package/xgboost/callback.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
Optional,
1616
Sequence,
1717
Tuple,
18+
TypeAlias,
1819
TypeVar,
1920
Union,
2021
cast,
@@ -54,7 +55,8 @@ class TrainingCallback(ABC):
5455
5556
"""
5657

57-
EvalsLog = Dict[str, Dict[str, _ScoreList]] # pylint: disable=invalid-name
58+
# pylint: disable=invalid-name
59+
EvalsLog: TypeAlias = Dict[str, Dict[str, _ScoreList]]
5860

5961
def __init__(self) -> None:
6062
pass

python-package/xgboost/core.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1289,8 +1289,8 @@ def get_quantile_cut(self) -> Tuple[np.ndarray, np.ndarray]:
12891289
return indptr, data
12901290

12911291
def get_categories(self) -> Optional[Dict[str, "pa.DictionaryArray"]]:
1292-
"""Get the categories in the dataset. Return `None` if there's no categorical
1293-
features.
1292+
"""Get the categories in the dataset using `pyarrow`. Returns `None` if there
1293+
are no categorical features.
12941294
12951295
.. warning::
12961296

python-package/xgboost/testing/ordinal.py

Lines changed: 86 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import numpy as np
1010
import pytest
1111

12+
from ..callback import TrainingCallback
1213
from ..compat import import_cupy
1314
from ..core import DMatrix, ExtMemQuantileDMatrix, QuantileDMatrix
1415
from ..data import _lazy_load_cudf_is_cat
@@ -429,16 +430,70 @@ def run_cat_leaf(device: Literal["cpu", "cuda"]) -> None:
429430
)
430431

431432

433+
# pylint: disable=too-many-locals
434+
def make_recoded(device: Literal["cpu", "cuda"]) -> Tuple:
435+
"""Synthesize a test dataset with changed encoding."""
436+
Df, _ = get_df_impl(device)
437+
438+
import pandas as pd
439+
440+
# Test large column numbers. XGBoost makes some specializations for slim datasets,
441+
# make sure we cover all the cases.
442+
n_features = 4096
443+
n_samples = 1024
444+
445+
# Same categories in old and new, with the positions of 0 ("a") and 1 ("b") swapped.
446+
old_cats = ["a", "b", "c", "d"]
447+
new_cats = ["b", "a", "c", "d"]
448+
mapping = {0: 1, 1: 0}
449+
450+
rng = np.random.default_rng(2025)
451+
452+
col_numeric = rng.uniform(0, 1, size=(n_samples, n_features // 2))
453+
col_categorical = rng.integers(
454+
low=0, high=4, size=(n_samples, n_features // 2), dtype=np.int32
455+
)
456+
457+
df = {} # avoid fragmentation warning from pandas
458+
for c in range(n_features):
459+
if c % 2 == 0:
460+
col = col_numeric[:, c // 2]
461+
else:
462+
codes = col_categorical[:, c // 2]
463+
col = pd.Categorical.from_codes(
464+
categories=old_cats,
465+
codes=codes,
466+
)
467+
df[f"f{c}"] = col
468+
469+
enc = Df(df)
470+
y = rng.normal(size=n_samples)
471+
472+
reenc = enc.copy()
473+
for c in range(n_features):
474+
if c % 2 == 0:
475+
continue
476+
477+
name = f"f{c}"
478+
codes_ser = reenc[name].cat.codes
479+
if hasattr(codes_ser, "to_pandas"): # cudf
480+
codes_ser = codes_ser.to_pandas()
481+
new_codes = codes_ser.replace(mapping)
482+
reenc[name] = pd.Categorical.from_codes(categories=new_cats, codes=new_codes)
483+
reenc = Df(reenc)
484+
assert (reenc.iloc[:, 1].cat.codes != enc.iloc[:, 1].cat.codes).any()
485+
return enc, reenc, y, col_numeric, col_categorical
486+
487+
432488
def run_specified_cat( # pylint: disable=too-many-locals
433489
device: Literal["cpu", "cuda"],
434490
) -> None:
435491
"""Run with manually specified category encoding."""
436492
import pandas as pd
437493

438-
# Same between old and new, wiht 0 ("a") and 1 ("b") exchanged their position.
494+
# Same categories in old and new, with the positions of 0 ("a") and 1 ("b") swapped.
439495
old_cats = ["a", "b", "c", "d"]
440496
new_cats = ["b", "a", "c", "d"]
441-
mapping = {0: 1, 1: 0}
442497

443498
col0 = np.arange(0, 9)
444499
col1 = pd.Categorical.from_codes(
@@ -468,57 +523,23 @@ def run_specified_cat( # pylint: disable=too-many-locals
468523
predt2 = booster.inplace_predict(df1)
469524
assert_allclose(device, predt0, predt2)
470525

471-
# Test large column numbers. XGBoost makes some specializations for slim datasets,
472-
# make sure we cover all the cases.
473-
n_features = 4096
474-
n_samples = 1024
475-
476-
col_numeric = rng.uniform(0, 1, size=(n_samples, n_features // 2))
477-
col_categorical = rng.integers(
478-
low=0, high=4, size=(n_samples, n_features // 2), dtype=np.int32
479-
)
480-
481-
df = {} # avoid fragmentation warning from pandas
482-
for c in range(n_features):
483-
if c % 2 == 0:
484-
col = col_numeric[:, c // 2]
485-
else:
486-
codes = col_categorical[:, c // 2]
487-
col = pd.Categorical.from_codes(
488-
categories=old_cats,
489-
codes=codes,
490-
)
491-
df[f"f{c}"] = col
526+
enc, reenc, y, col_numeric, col_categorical = make_recoded(device)
492527

493-
df = Df(df)
494-
y = rng.normal(size=n_samples)
495-
496-
Xy = DMatrix(df, y, enable_categorical=True)
528+
Xy = DMatrix(enc, y, enable_categorical=True)
497529
booster = train({"device": device}, Xy)
498530

499531
predt0 = booster.predict(Xy)
500-
predt1 = booster.inplace_predict(df)
532+
predt1 = booster.inplace_predict(enc)
501533
assert_allclose(device, predt0, predt1)
502534

503-
for c in range(n_features):
504-
if c % 2 == 0:
505-
continue
506-
507-
name = f"f{c}"
508-
codes_ser = df[name].cat.codes
509-
if hasattr(codes_ser, "to_pandas"): # cudf
510-
codes_ser = codes_ser.to_pandas()
511-
new_codes = codes_ser.replace(mapping)
512-
df[name] = pd.Categorical.from_codes(categories=new_cats, codes=new_codes)
513-
514-
df = Df(df)
515-
Xy = DMatrix(df, y, enable_categorical=True)
535+
Xy = DMatrix(reenc, y, enable_categorical=True)
516536
predt2 = booster.predict(Xy)
517537
assert_allclose(device, predt0, predt2)
518538

519-
array = np.empty(shape=(n_samples, n_features))
520-
array[:, np.arange(0, n_features) % 2 == 0] = col_numeric
521-
array[:, np.arange(0, n_features) % 2 != 0] = col_categorical
539+
array = np.empty(shape=(reenc.shape[0], reenc.shape[1]))
540+
541+
array[:, enc.dtypes == "category"] = col_categorical
542+
array[:, enc.dtypes != "category"] = col_numeric
522543

523544
if device == "cuda":
524545
import cupy as cp
@@ -527,3 +548,24 @@ def run_specified_cat( # pylint: disable=too-many-locals
527548

528549
predt3 = booster.inplace_predict(array)
529550
assert_allclose(device, predt0, predt3)
551+
552+
553+
def run_validation(device: Literal["cpu", "cuda"]) -> None:
554+
"""Check that the validation dataset is using the correct encoding."""
555+
enc, reenc, y, _, _ = make_recoded(device)
556+
557+
Xy = DMatrix(enc, y, enable_categorical=True)
558+
Xy_valid = DMatrix(reenc, y, enable_categorical=True)
559+
560+
evals_result: TrainingCallback.EvalsLog = {}
561+
train(
562+
{"device": device},
563+
Xy,
564+
evals=[(Xy, "Train"), (Xy_valid, "Valid")],
565+
evals_result=evals_result,
566+
)
567+
568+
# Evaluation dataset should have the exact same performance as the training dataset.
569+
assert_allclose(
570+
device, evals_result["Train"]["rmse"], evals_result["Valid"]["rmse"]
571+
)

src/data/adapter.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -525,7 +525,7 @@ template <typename CategoricalIndex, bool allow_mask>
525525
/**
526526
* @brief Adapter for columnar format (arrow).
527527
*
528-
* Supports for both numeric values and categorical values.
528+
* Supports both numeric values and categorical values.
529529
*/
530530
class ColumnarAdapter : public detail::SingleBatchDataIter<ColumnarAdapterBatch> {
531531
std::vector<ArrayInterface<1>> columns_;

tests/python-gpu/test_gpu_ordinal.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
run_cat_shap,
1919
run_cat_thread_safety,
2020
run_specified_cat,
21+
run_validation,
2122
)
2223

2324
pytestmark = pytest.mark.skipif(**tm.no_multiple(tm.no_arrow(), tm.no_cudf()))
@@ -60,7 +61,7 @@ def test_mixed_devices() -> None:
6061
n_features = 4
6162
X, y = make_categorical(n_samples, n_features, 7, onehot=False, device="cpu")
6263

63-
def run_cpu_gpu(DMatrixT: Type):
64+
def run_cpu_gpu(DMatrixT: Type) -> bool:
6465
Xy = DMatrixT(X, y, enable_categorical=True)
6566
booster = xgb.train({"tree_method": "hist", "device": "cuda"}, Xy)
6667
predt0 = booster.inplace_predict(X)
@@ -83,7 +84,7 @@ def run_cpu_gpu(DMatrixT: Type):
8384

8485
X, y = make_categorical(n_samples, n_features, 7, onehot=False, device="cuda")
8586

86-
def run_gpu_cpu(DMatrixT: Type):
87+
def run_gpu_cpu(DMatrixT: Type) -> bool:
8788
Xy = DMatrixT(X, y, enable_categorical=True)
8889
booster = xgb.train({"tree_method": "hist", "device": "cpu"}, Xy)
8990
predt0 = booster.inplace_predict(X).get()
@@ -104,3 +105,7 @@ def run_gpu_cpu(DMatrixT: Type):
104105

105106
def test_spcified_cat() -> None:
106107
run_specified_cat("cuda")
108+
109+
110+
def test_validation() -> None:
111+
run_validation("cuda")

tests/python/test_ordinal.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
run_cat_shap,
1212
run_cat_thread_safety,
1313
run_specified_cat,
14+
run_validation,
1415
)
1516

1617
pytestmark = pytest.mark.skipif(**tm.no_multiple(tm.no_arrow(), tm.no_pandas()))
@@ -50,3 +51,7 @@ def test_cat_leaf() -> None:
5051

5152
def test_spcified_cat() -> None:
5253
run_specified_cat("cpu")
54+
55+
56+
def test_validation() -> None:
57+
run_validation("cpu")

0 commit comments

Comments
 (0)