99import numpy as np
1010import pytest
1111
12+ from ..callback import TrainingCallback
1213from ..compat import import_cupy
1314from ..core import DMatrix , ExtMemQuantileDMatrix , QuantileDMatrix
1415from ..data import _lazy_load_cudf_is_cat
@@ -429,16 +430,70 @@ def run_cat_leaf(device: Literal["cpu", "cuda"]) -> None:
429430 )
430431
431432
# pylint: disable=too-many-locals
def make_recoded(device: Literal["cpu", "cuda"]) -> Tuple:
    """Create a wide mixed-type dataset plus a copy with a different encoding.

    Even-numbered columns are numeric, odd-numbered columns are categorical.
    The second frame holds the same category values as the first one, but the
    categories "a" and "b" trade places in the category list and the codes 0
    and 1 are swapped accordingly.

    Parameters
    ----------
    device :
        Forwarded to ``get_df_impl`` to pick the dataframe constructor.

    Returns
    -------
    A tuple ``(enc, reenc, y, col_numeric, col_categorical)``: the frame with
    the original encoding, the frame with the swapped encoding, the labels, and
    the raw numeric values and raw category codes used to build ``enc``.

    """
    Df, _ = get_df_impl(device)

    import pandas as pd

    # A large number of columns is used on purpose: XGBoost has specializations
    # for slim datasets and the wide case needs to be covered as well.
    n_samples, n_features = 1024, 4096
    half = n_features // 2

    # Identical category sets; only the positions of "a" (0) and "b" (1) differ.
    old_cats = ["a", "b", "c", "d"]
    new_cats = ["b", "a", "c", "d"]
    mapping = {0: 1, 1: 0}

    rng = np.random.default_rng(2025)

    col_numeric = rng.uniform(0, 1, size=(n_samples, half))
    col_categorical = rng.integers(
        low=0, high=4, size=(n_samples, half), dtype=np.int32
    )

    # Collect the columns in a dict first, inserting them one by one into a
    # dataframe triggers a fragmentation warning from pandas.
    columns = {}
    for idx in range(n_features):
        pos, is_categorical = divmod(idx, 2)
        if is_categorical:
            columns[f"f{idx}"] = pd.Categorical.from_codes(
                categories=old_cats,
                codes=col_categorical[:, pos],
            )
        else:
            columns[f"f{idx}"] = col_numeric[:, pos]

    enc = Df(columns)
    y = rng.normal(size=n_samples)

    reenc = enc.copy()
    # Only the odd-numbered (categorical) columns need to be re-coded.
    for idx in range(1, n_features, 2):
        name = f"f{idx}"
        codes_ser = reenc[name].cat.codes
        if hasattr(codes_ser, "to_pandas"):  # cudf
            codes_ser = codes_ser.to_pandas()
        swapped = codes_ser.replace(mapping)
        reenc[name] = pd.Categorical.from_codes(categories=new_cats, codes=swapped)
    reenc = Df(reenc)

    # Sanity check, the first categorical column must really have different codes.
    old_codes = enc.iloc[:, 1].cat.codes
    fresh_codes = reenc.iloc[:, 1].cat.codes
    assert (fresh_codes != old_codes).any()
    return enc, reenc, y, col_numeric, col_categorical
486+
487+
432488def run_specified_cat ( # pylint: disable=too-many-locals
433489 device : Literal ["cpu" , "cuda" ],
434490) -> None :
435491 """Run with manually specified category encoding."""
436492 import pandas as pd
437493
438- # Same between old and new, wiht 0 ("a") and 1 ("b") exchanged their position.
494+ # Same between old and new, with 0 ("a") and 1 ("b") exchanged their position.
439495 old_cats = ["a" , "b" , "c" , "d" ]
440496 new_cats = ["b" , "a" , "c" , "d" ]
441- mapping = {0 : 1 , 1 : 0 }
442497
443498 col0 = np .arange (0 , 9 )
444499 col1 = pd .Categorical .from_codes (
@@ -468,57 +523,23 @@ def run_specified_cat( # pylint: disable=too-many-locals
468523 predt2 = booster .inplace_predict (df1 )
469524 assert_allclose (device , predt0 , predt2 )
470525
471- # Test large column numbers. XGBoost makes some specializations for slim datasets,
472- # make sure we cover all the cases.
473- n_features = 4096
474- n_samples = 1024
475-
476- col_numeric = rng .uniform (0 , 1 , size = (n_samples , n_features // 2 ))
477- col_categorical = rng .integers (
478- low = 0 , high = 4 , size = (n_samples , n_features // 2 ), dtype = np .int32
479- )
480-
481- df = {} # avoid fragmentation warning from pandas
482- for c in range (n_features ):
483- if c % 2 == 0 :
484- col = col_numeric [:, c // 2 ]
485- else :
486- codes = col_categorical [:, c // 2 ]
487- col = pd .Categorical .from_codes (
488- categories = old_cats ,
489- codes = codes ,
490- )
491- df [f"f{ c } " ] = col
526+ enc , reenc , y , col_numeric , col_categorical = make_recoded (device )
492527
493- df = Df (df )
494- y = rng .normal (size = n_samples )
495-
496- Xy = DMatrix (df , y , enable_categorical = True )
528+ Xy = DMatrix (enc , y , enable_categorical = True )
497529 booster = train ({"device" : device }, Xy )
498530
499531 predt0 = booster .predict (Xy )
500- predt1 = booster .inplace_predict (df )
532+ predt1 = booster .inplace_predict (enc )
501533 assert_allclose (device , predt0 , predt1 )
502534
503- for c in range (n_features ):
504- if c % 2 == 0 :
505- continue
506-
507- name = f"f{ c } "
508- codes_ser = df [name ].cat .codes
509- if hasattr (codes_ser , "to_pandas" ): # cudf
510- codes_ser = codes_ser .to_pandas ()
511- new_codes = codes_ser .replace (mapping )
512- df [name ] = pd .Categorical .from_codes (categories = new_cats , codes = new_codes )
513-
514- df = Df (df )
515- Xy = DMatrix (df , y , enable_categorical = True )
535+ Xy = DMatrix (reenc , y , enable_categorical = True )
516536 predt2 = booster .predict (Xy )
517537 assert_allclose (device , predt0 , predt2 )
518538
519- array = np .empty (shape = (n_samples , n_features ))
520- array [:, np .arange (0 , n_features ) % 2 == 0 ] = col_numeric
521- array [:, np .arange (0 , n_features ) % 2 != 0 ] = col_categorical
539+ array = np .empty (shape = (reenc .shape [0 ], reenc .shape [1 ]))
540+
541+ array [:, enc .dtypes == "category" ] = col_categorical
542+ array [:, enc .dtypes != "category" ] = col_numeric
522543
523544 if device == "cuda" :
524545 import cupy as cp
@@ -527,3 +548,24 @@ def run_specified_cat( # pylint: disable=too-many-locals
527548
528549 predt3 = booster .inplace_predict (array )
529550 assert_allclose (device , predt0 , predt3 )
551+
552+
def run_validation(device: Literal["cpu", "cuda"]) -> None:
    """Check the validation dataset is using the correct encoding.

    A model is trained on the frame with the original encoding while the
    re-encoded frame from ``make_recoded`` is supplied as an evaluation set. The
    recorded "rmse" history of both sets is then compared.

    Parameters
    ----------
    device :
        Passed to ``make_recoded``, used as the ``device`` training parameter and
        forwarded to ``assert_allclose``.

    """
    enc, reenc, y, *_ = make_recoded(device)

    dtrain = DMatrix(enc, y, enable_categorical=True)
    dvalid = DMatrix(reenc, y, enable_categorical=True)

    history: TrainingCallback.EvalsLog = {}
    watchlist = [(dtrain, "Train"), (dvalid, "Valid")]
    train({"device": device}, dtrain, evals=watchlist, evals_result=history)

    # Evaluation dataset should have the exact same performance as the training
    # dataset.
    train_rmse = history["Train"]["rmse"]
    valid_rmse = history["Valid"]["rmse"]
    assert_allclose(device, train_rmse, valid_rmse)
0 commit comments