Skip to content

Commit 24e19e7

Browse files
authored
Add datagen for testing string-based categorical data. (#11114)
1 parent dc092ae commit 24e19e7

File tree

8 files changed

+176
-108
lines changed

8 files changed

+176
-108
lines changed

python-package/xgboost/data.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -995,11 +995,7 @@ def _from_uri(
995995
_warn_unused_missing(data, missing)
996996
handle = ctypes.c_void_p()
997997
data = os.fspath(os.path.expanduser(data))
998-
args = {
999-
"uri": str(data),
1000-
"data_split_mode": int(data_split_mode),
1001-
}
1002-
config = bytes(json.dumps(args), "utf-8")
998+
config = make_jcargs(uri=str(data), data_split_mode=int(data_split_mode))
1003999
_check_call(_LIB.XGDMatrixCreateFromURI(config, ctypes.byref(handle)))
10041000
return handle, feature_names, feature_types
10051001

python-package/xgboost/testing/__init__.py

Lines changed: 4 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -37,20 +37,19 @@
3737
import xgboost as xgb
3838
from xgboost import RabitTracker
3939
from xgboost.core import ArrayLike
40-
from xgboost.data import is_pd_cat_dtype
4140
from xgboost.sklearn import SklObjective
42-
from xgboost.testing.data import (
41+
42+
from .._typing import PathLike
43+
from .data import (
4344
get_california_housing,
4445
get_cancer,
4546
get_digits,
4647
get_sparse,
4748
make_batches,
49+
make_categorical,
4850
make_sparse_regression,
49-
memory,
5051
)
5152

52-
from .._typing import PathLike
53-
5453
hypothesis = pytest.importorskip("hypothesis")
5554

5655
# pylint:disable=wrong-import-position,wrong-import-order
@@ -377,81 +376,6 @@ def __repr__(self) -> str:
377376
return self.name
378377

379378

380-
# pylint: disable=too-many-arguments,too-many-locals
381-
@memory.cache
382-
def make_categorical(
383-
n_samples: int,
384-
n_features: int,
385-
n_categories: int,
386-
*,
387-
onehot: bool,
388-
sparsity: float = 0.0,
389-
cat_ratio: float = 1.0,
390-
shuffle: bool = False,
391-
random_state: int = 1994,
392-
) -> Tuple[ArrayLike, np.ndarray]:
393-
"""Generate categorical features for test.
394-
395-
Parameters
396-
----------
397-
n_categories:
398-
Number of categories for categorical features.
399-
onehot:
400-
Should we apply one-hot encoding to the data?
401-
sparsity:
402-
The ratio of the amount of missing values over the number of all entries.
403-
cat_ratio:
404-
The ratio of features that are categorical.
405-
shuffle:
406-
Whether we should shuffle the columns.
407-
408-
Returns
409-
-------
410-
X, y
411-
"""
412-
import pandas as pd
413-
414-
rng = np.random.RandomState(random_state)
415-
416-
pd_dict = {}
417-
for i in range(n_features + 1):
418-
c = rng.randint(low=0, high=n_categories, size=n_samples)
419-
pd_dict[str(i)] = pd.Series(c, dtype=np.int64)
420-
421-
df = pd.DataFrame(pd_dict)
422-
label = df.iloc[:, 0]
423-
df = df.iloc[:, 1:]
424-
for i in range(0, n_features):
425-
label += df.iloc[:, i]
426-
label += 1
427-
428-
categories = np.arange(0, n_categories)
429-
for col in df.columns:
430-
if rng.binomial(1, cat_ratio, size=1)[0] == 1:
431-
df[col] = df[col].astype("category")
432-
df[col] = df[col].cat.set_categories(categories)
433-
434-
if sparsity > 0.0:
435-
for i in range(n_features):
436-
index = rng.randint(
437-
low=0, high=n_samples - 1, size=int(n_samples * sparsity)
438-
)
439-
df.iloc[index, i] = np.nan
440-
if is_pd_cat_dtype(df.dtypes.iloc[i]):
441-
assert n_categories == np.unique(df.dtypes.iloc[i].categories).size
442-
443-
assert df.shape[1] == n_features
444-
if onehot:
445-
df = pd.get_dummies(df)
446-
447-
if shuffle:
448-
columns = list(df.columns)
449-
rng.shuffle(columns)
450-
df = df[columns]
451-
452-
return df, label
453-
454-
455379
def make_ltr(
456380
n_samples: int,
457381
n_features: int,

python-package/xgboost/testing/data.py

Lines changed: 119 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
1-
# pylint: disable=invalid-name
1+
# pylint: disable=invalid-name, too-many-lines
22
"""Utilities for data generation."""
33
import multiprocessing
44
import os
5+
import string
56
import zipfile
67
from concurrent.futures import ThreadPoolExecutor
78
from dataclasses import dataclass
@@ -14,6 +15,7 @@
1415
List,
1516
NamedTuple,
1617
Optional,
18+
Set,
1719
Tuple,
1820
Type,
1921
Union,
@@ -26,8 +28,10 @@
2628
from numpy.random import Generator as RNG
2729
from scipy import sparse
2830

29-
import xgboost
30-
from xgboost.data import pandas_pyarrow_mapper
31+
from ..core import DMatrix, QuantileDMatrix
32+
from ..data import is_pd_cat_dtype, pandas_pyarrow_mapper
33+
from ..sklearn import ArrayLike, XGBRanker
34+
from ..training import train as train_fn
3135

3236
if TYPE_CHECKING:
3337
from ..compat import DataFrame as DataFrameT
@@ -42,7 +46,7 @@ def np_dtypes(
4246
n_samples: int, n_features: int
4347
) -> Generator[Tuple[np.ndarray, np.ndarray], None, None]:
4448
"""Enumerate all supported dtypes from numpy."""
45-
import pandas as pd
49+
pd = pytest.importorskip("pandas")
4650

4751
rng = np.random.RandomState(1994)
4852
# Integer and float.
@@ -99,7 +103,7 @@ def np_dtypes(
99103

100104
def pd_dtypes() -> Generator:
101105
"""Enumerate all supported pandas extension types."""
102-
import pandas as pd
106+
pd = pytest.importorskip("pandas")
103107

104108
# Integer
105109
dtypes = [
@@ -162,8 +166,8 @@ def pd_dtypes() -> Generator:
162166

163167
def pd_arrow_dtypes() -> Generator:
164168
"""Pandas DataFrame with pyarrow backed type."""
165-
import pandas as pd
166-
import pyarrow as pa
169+
pd = pytest.importorskip("pandas")
170+
pa = pytest.importorskip("pyarrow")
167171

168172
# Integer
169173
dtypes = pandas_pyarrow_mapper
@@ -225,10 +229,10 @@ def check_inf(rng: RNG) -> None:
225229
X[5, 2] = np.inf
226230

227231
with pytest.raises(ValueError, match="Input data contains `inf`"):
228-
xgboost.QuantileDMatrix(X, y)
232+
QuantileDMatrix(X, y)
229233

230234
with pytest.raises(ValueError, match="Input data contains `inf`"):
231-
xgboost.DMatrix(X, y)
235+
DMatrix(X, y)
232236

233237

234238
@memory.cache
@@ -288,8 +292,10 @@ def get_ames_housing() -> Tuple[DataFrameT, np.ndarray]:
288292
Number of categorical features: 10
289293
Number of numerical features: 10
290294
"""
291-
pytest.importorskip("pandas")
292-
import pandas as pd
295+
if TYPE_CHECKING:
296+
import pandas as pd
297+
else:
298+
pd = pytest.importorskip("pandas")
293299

294300
rng = np.random.default_rng(1994)
295301
n_samples = 1460
@@ -664,7 +670,7 @@ def init_rank_score(
664670
y_train = y_train[sorted_idx]
665671
qid_train = qid_train[sorted_idx]
666672

667-
ltr = xgboost.XGBRanker(objective="rank:ndcg", tree_method="hist")
673+
ltr = XGBRanker(objective="rank:ndcg", tree_method="hist")
668674
ltr.fit(X_train, y_train, qid=qid_train)
669675

670676
# Use the original order of the data.
@@ -799,9 +805,7 @@ def sort_ltr_samples(
799805
return data
800806

801807

802-
def run_base_margin_info(
803-
DType: Callable, DMatrixT: Type[xgboost.DMatrix], device: str
804-
) -> None:
808+
def run_base_margin_info(DType: Callable, DMatrixT: Type[DMatrix], device: str) -> None:
805809
"""Run tests for base margin."""
806810
rng = np.random.default_rng()
807811
X = DType(rng.normal(0, 1.0, size=100).astype(np.float32).reshape(50, 2))
@@ -814,7 +818,7 @@ def run_base_margin_info(
814818
Xy = DMatrixT(X, y, base_margin=base_margin)
815819
# Error at train, caused by check in predictor.
816820
with pytest.raises(ValueError, match=r".*base_margin.*"):
817-
xgboost.train({"tree_method": "hist", "device": device}, Xy)
821+
train_fn({"tree_method": "hist", "device": device}, Xy)
818822

819823
if not hasattr(X, "iloc"):
820824
# column major matrix
@@ -932,3 +936,102 @@ def random_csc(t_id: int) -> sparse.csc_matrix:
932936
return arr, y
933937

934938
return csr, y
939+
940+
941+
def unique_random_strings(n_strings: int, seed: int) -> List[str]:
    """Generate n unique random strings in a reproducible order.

    Parameters
    ----------
    n_strings :
        Number of distinct strings to generate. A non-positive value yields an empty
        list.
    seed :
        Seed for the random number generator.

    Returns
    -------
    A list with ``n_strings`` distinct strings, each made of 8 ASCII letters, in the
    order they were drawn.

    """
    name_len = 8  # hardcoded, should be more than enough
    # The alphabet never changes between draws, build it once.
    alphabet = list(string.ascii_letters)
    rng = np.random.default_rng(seed)

    # The set only answers "was this name drawn already?", the list preserves the draw
    # order. Returning ``list(a_set)`` would order the names by their string hash,
    # which is randomised per interpreter process, so the same seed could produce a
    # different ordering in every run.
    seen: Set[str] = set()
    ordered: List[str] = []
    while len(ordered) < n_strings:
        random_str = "".join(rng.choice(alphabet, size=name_len, replace=True))
        if random_str not in seen:
            seen.add(random_str)
            ordered.append(random_str)

    return ordered
954+
955+
956+
# pylint: disable=too-many-arguments,too-many-locals,too-many-branches
957+
def make_categorical(
    n_samples: int,
    n_features: int,
    n_categories: int,
    *,
    onehot: bool,
    sparsity: float = 0.0,
    cat_ratio: float = 1.0,
    shuffle: bool = False,
    random_state: int = 1994,
    cat_dtype: np.typing.DTypeLike = np.int64,
) -> Tuple[ArrayLike, np.ndarray]:
    """Generate categorical features for test.

    Parameters
    ----------
    n_samples:
        Number of rows.
    n_features:
        Number of columns before the optional one-hot encoding.
    n_categories:
        Number of categories for categorical features. Non-categorical columns hold
        integers drawn from the same ``[0, n_categories)`` range.
    onehot:
        Should we apply one-hot encoding to the data?
    sparsity:
        The ratio of the amount of missing values over the number of all entries.
        Row indices are drawn with replacement, so the realised ratio can be lower.
    cat_ratio:
        The ratio of features that are categorical. Each column is made categorical
        with this probability.
    shuffle:
        Whether we should shuffle the columns.
    random_state:
        Seed for the random number generator that draws the data.
    cat_dtype :
        The dtype for categorical features, might be string or numeric. Only whether
        it is a string dtype is checked: string dtypes get random string category
        names, anything else gets the integers ``0 .. n_categories - 1``.

    Returns
    -------
    X, y
        ``X`` is a pandas DataFrame. ``y`` is one plus the row-wise sum of the category
        codes (categorical columns) and the integer values (numeric columns), computed
        before any missing value is inserted.

    """
    pd = pytest.importorskip("pandas")

    rng = np.random.RandomState(random_state)
    # Does not depend on the column, evaluate it once.
    use_str_categories = np.issubdtype(cat_dtype, np.str_)

    df = pd.DataFrame()
    for i in range(n_features):
        choice = rng.binomial(1, cat_ratio, size=1)[0]
        if choice == 1:
            if use_str_categories:
                # The names are seeded by the column index, not by `random_state`.
                categories = np.array(unique_random_strings(n_categories, i))
                c = rng.choice(categories, size=n_samples, replace=True)
            else:
                categories = np.arange(0, n_categories)
                c = rng.randint(low=0, high=n_categories, size=n_samples)

            df[str(i)] = pd.Series(c, dtype="category")
            # Register every category, including those that were never drawn.
            df[str(i)] = df[str(i)].cat.set_categories(categories)
        else:
            num = rng.randint(low=0, high=n_categories, size=n_samples)
            df[str(i)] = pd.Series(num, dtype=num.dtype)

    label = np.zeros(shape=(n_samples,))
    for col in df.columns:
        if isinstance(df[col].dtype, pd.CategoricalDtype):
            label += df[col].cat.codes
        else:
            label += df[col]
    label += 1

    if sparsity > 0.0:
        n_missing = int(n_samples * sparsity)
        for i in range(n_features):
            # `high` is exclusive, use `n_samples` so that the last row can be chosen
            # as well (and a single-row frame does not raise).
            index = rng.randint(low=0, high=n_samples, size=n_missing)
            is_cat = is_pd_cat_dtype(df.dtypes.iloc[i])
            if not is_cat and n_missing > 0:
                # Integer columns cannot hold NaN. Cast explicitly instead of relying
                # on the silent upcast in setitem that pandas has deprecated.
                df[df.columns[i]] = df.iloc[:, i].astype(np.float64)
            df.iloc[index, i] = np.nan
            if is_cat:
                # Inserting missing values must not change the set of categories.
                assert n_categories == np.unique(df.dtypes.iloc[i].categories).size

    assert df.shape[1] == n_features
    if onehot:
        df = pd.get_dummies(df)

    if shuffle:
        columns = list(df.columns)
        rng.shuffle(columns)
        df = df[columns]

    return df, label

python-package/xgboost/testing/quantile_dmatrix.py

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
"""QuantileDMatrix related tests."""
22

33
import numpy as np
4+
import pytest
45
from sklearn.model_selection import train_test_split
56

67
import xgboost as xgb
78

8-
from .data import make_batches
9+
from .data import make_batches, make_categorical
910

1011

1112
def check_ref_quantile_cut(device: str) -> None:
@@ -33,3 +34,35 @@ def check_ref_quantile_cut(device: str) -> None:
3334
Xy_valid = xgb.QuantileDMatrix(X_valid, y_valid)
3435
cut_valid = Xy_valid.get_quantile_cut()
3536
assert not np.allclose(cut_train[1], cut_valid[1])
37+
38+
39+
def check_categorical_strings(device: str) -> None:
    """Check the quantile cuts produced for string-valued categorical columns.

    A data frame with a mix of string categorical and numeric columns is generated and
    passed to :py:class:`xgboost.QuantileDMatrix`. For every categorical column the
    cut values must be the integer codes ``0 .. n_categories - 1``.

    Parameters
    ----------
    device :
        ``"cpu"`` builds the input with pandas, any other value builds it with cuDF.
        The test is skipped when the required library is not installed.

    """
    # Both libraries are used through the same name for the rest of the test.
    frame_lib = pytest.importorskip("pandas" if device == "cpu" else "cudf")

    n_features = 8
    n_categories = 32
    X, y = make_categorical(
        1024,
        n_features,
        n_categories,
        onehot=False,
        cat_dtype=np.str_,
        cat_ratio=0.5,
        shuffle=True,
    )
    X = frame_lib.DataFrame(X)

    Xy = xgb.QuantileDMatrix(X, y, enable_categorical=True)
    assert Xy.num_col() == n_features

    cuts = Xy.get_quantile_cut()
    indptr, values = cuts[0], cuts[1]
    expected_codes = np.arange(0, n_categories)
    # Consecutive entries of `indptr` delimit the cut values of one feature.
    for f_idx, (beg, end) in enumerate(zip(indptr[:-1], indptr[1:])):
        if not isinstance(X[X.columns[f_idx]].dtype, frame_lib.CategoricalDtype):
            continue
        np.testing.assert_allclose(values[beg:end], expected_codes)

0 commit comments

Comments
 (0)