Implement max_cat_threshold for CPU. #7957

Merged · 5 commits · Jun 4, 2022 · Changes from 4 commits
21 changes: 18 additions & 3 deletions doc/parameter.rst
@@ -235,21 +235,36 @@ Parameters for Tree Booster
list is a group of indices of features that are allowed to interact with each other.
See :doc:`/tutorials/feature_interaction_constraint` for more information.

Additional parameters for ``hist``, ``gpu_hist`` and ``approx`` tree method
===========================================================================
.. _cat-param:

Parameters for Categorical Feature
==================================

These parameters are only used for training with categorical data. See
:doc:`/tutorials/categorical` for more information.

* ``max_cat_to_onehot``

.. versionadded:: 1.6

.. note:: The support for this parameter is experimental.
.. note:: This parameter is experimental. ``exact`` tree method is not supported yet.

- A threshold for deciding whether XGBoost should use one-hot encoding based split for
categorical data. When the number of categories is smaller than the threshold, one-hot
encoding is chosen; otherwise the categories are partitioned into children nodes. Only
relevant for regression and binary classification. Also, the ``exact`` tree method is
not supported.

* ``max_cat_threshold``

.. versionadded:: 2.0

.. note:: This parameter is experimental. ``exact`` and ``gpu_hist`` tree methods are
not supported yet.

- Maximum number of categories considered for each split. Used only by partition-based
splits for preventing over-fitting.

Additional parameters for Dart Booster (``booster=dart``)
=========================================================

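For readers skimming the doc change, a minimal Python sketch of passing both parameters may be useful. The data below is synthetic, the values shown are simply the defaults from this PR, and ``max_cat_threshold`` assumes a build that already contains this change.

```python
import numpy as np
import pandas as pd
import xgboost as xgb

rng = np.random.default_rng(0)
# One categorical and one numerical feature; purely illustrative data.
X = pd.DataFrame(
    {
        "cat_feature": pd.Series(rng.integers(0, 100, size=1_000)).astype("category"),
        "num_feature": rng.normal(size=1_000),
    }
)
y = rng.normal(size=1_000)

# enable_categorical lets XGBoost consume the pandas category dtype directly.
dtrain = xgb.DMatrix(X, y, enable_categorical=True)
params = {
    "tree_method": "hist",
    "max_cat_to_onehot": 4,   # features with fewer categories use one-hot splits
    "max_cat_threshold": 64,  # cap on categories considered per partition-based split
}
booster = xgb.train(params, dtrain, num_boost_round=10)
```

With ``max_cat_to_onehot`` at its default, low-cardinality features are one-hot split while larger ones use partition-based splits capped by ``max_cat_threshold``.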
2 changes: 1 addition & 1 deletion doc/tutorials/categorical.rst
@@ -85,7 +85,7 @@ group the categories that output similar leaf values. During split finding, we first sort
the gradient histogram to prepare the contiguous partitions then enumerate the splits
according to these sorted values. One of the related parameters for XGBoost is
``max_cat_to_onehot``, which controls whether one-hot encoding or partitioning should be
used for each feature, see :doc:`/parameter` for details.
used for each feature, see :ref:`cat-param` for details.


**********************
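To make the sorted-partition enumeration described above concrete, here is a standalone NumPy sketch. It is not XGBoost code: the gain is the usual second-order score with only an L2 term, and all names are invented for illustration.

```python
import numpy as np


def best_partition(grad: np.ndarray, hess: np.ndarray, reg_lambda: float = 1.0):
    """Toy enumeration of partition-based categorical splits.

    grad and hess hold per-category sums of gradients and hessians (one histogram
    bin per category). Categories are sorted by grad/hess, then every prefix of
    the sorted order is tried as the "left" partition.
    """
    order = np.argsort(grad / hess)
    g_sorted, h_sorted = grad[order], hess[order]
    g_total, h_total = grad.sum(), hess.sum()

    def score(g, h):
        return g * g / (h + reg_lambda)

    best_gain, best_k = -np.inf, -1
    g_left = h_left = 0.0
    for k in range(len(order) - 1):  # the full set is never a valid partition
        g_left += g_sorted[k]
        h_left += h_sorted[k]
        gain = (
            score(g_left, h_left)
            + score(g_total - g_left, h_total - h_left)
            - score(g_total, h_total)
        )
        if gain > best_gain:
            best_gain, best_k = gain, k
    left_categories = set(order[: best_k + 1].tolist())
    return best_gain, left_categories


# Example: four categories with their accumulated gradient statistics.
grad = np.array([1.5, -2.0, 0.3, -0.7])
hess = np.array([1.0, 1.2, 0.8, 1.1])
print(best_partition(grad, hess))
```

Because each prefix of the sorted order is one candidate partition, only ``n_categories - 1`` candidates need to be scanned rather than all possible groupings.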
2 changes: 1 addition & 1 deletion src/common/categorical.h
@@ -54,7 +54,7 @@ inline XGBOOST_DEVICE bool InvalidCat(float cat) {
*/
template <bool validate = true>
inline XGBOOST_DEVICE bool Decision(common::Span<uint32_t const> cats, float cat, bool dft_left) {
CLBitField32 const s_cats(cats);
KCatBitField const s_cats(cats);
// FIXME: Size() is not accurate since it represents the size of bit set instead of
// actual number of categories.
if (XGBOOST_EXPECT(validate && (InvalidCat(cat) || cat >= s_cats.Size()), false)) {
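As a rough Python analogue of the ``Decision`` helper above (an approximation rather than the exact C++ semantics), the split decision reduces to a bit-set membership test with a fallback for missing or out-of-range categories:

```python
import math


def go_left(cat_bits: set, n_categories: int, cat: float, default_left: bool) -> bool:
    """Sketch of the split decision for one categorical value."""
    # Missing, negative, or out-of-range categories follow the learned default direction.
    if math.isnan(cat) or cat < 0 or cat >= n_categories:
        return default_left
    # Assumption in this sketch: categories whose bit is set are routed to the right child.
    return int(cat) not in cat_bits
```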
15 changes: 8 additions & 7 deletions src/tree/hist/evaluate_splits.h
@@ -144,23 +144,24 @@ class HistEvaluator {

auto const &cut_ptr = cut.Ptrs();
auto const &parent = snode_[nidx];
bst_bin_t n_bins{static_cast<bst_bin_t>(cut_ptr[fidx + 1] - cut_ptr[fidx])};
bst_bin_t n_bins_feature{static_cast<bst_bin_t>(cut_ptr[fidx + 1] - cut_ptr[fidx])};
auto n_bins = std::min(param_.max_cat_threshold, n_bins_feature);

// statistics on both sides of split
GradStats left_sum;
GradStats right_sum;
// best split so far
SplitEntry best;

auto f_hist = hist.subspan(cut_ptr[fidx], n_bins);
auto f_hist = hist.subspan(cut_ptr[fidx], n_bins_feature);
bst_bin_t ibegin, iend;
bst_bin_t f_begin = cut_ptr[fidx];
if (d_step > 0) {
ibegin = f_begin;
iend = ibegin + n_bins - 1;
Collaborator: Are you sure about removing the -1 here?

Member Author: Good catch, this avoids splitting with only missing value gradient. Fixed.

iend = ibegin + n_bins;
} else {
ibegin = static_cast<bst_bin_t>(cut_ptr[fidx + 1]) - 1;
iend = f_begin;
iend = ibegin - n_bins;
}

bst_bin_t best_thresh{-1};
@@ -177,7 +178,7 @@
auto loss_chg =
evaluator.CalcSplitGain(param_, nidx, fidx, GradStats{left_sum}, GradStats{right_sum}) -
parent.root_gain;
// We don't have a numeric split point, nan hare is a dummy split.
// We don't have a numeric split point, nan here is a dummy split.
if (best.Update(loss_chg, fidx, std::numeric_limits<float>::quiet_NaN(), d_step == 1, true,
left_sum, right_sum)) {
best_thresh = i;
@@ -186,10 +187,10 @@
}

if (best_thresh != -1) {
auto n = common::CatBitField::ComputeStorageSize(n_bins + 1);
auto n = common::CatBitField::ComputeStorageSize(n_bins_feature + 1);
best.cat_bits = decltype(best.cat_bits)(n, 0);
common::CatBitField cat_bits{best.cat_bits};
bst_bin_t partition = d_step == 1 ? (best_thresh - ibegin + 1) : best_thresh - iend;
bst_bin_t partition = d_step == 1 ? (best_thresh - ibegin + 1) : (best_thresh - f_begin);
std::for_each(sorted_idx.begin(), sorted_idx.begin() + partition,
[&](size_t c) { cat_bits.Set(c); });
}
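A compact Python rendering of the new bound and of the bit-set construction may make the review thread above easier to follow; the names mirror the C++, but this is only a sketch of the forward (``d_step > 0``) scan, and the exact loop bounds follow the diff above.

```python
def capped_bin_count(n_bins_feature: int, max_cat_threshold: int) -> int:
    """Number of sorted categories actually scanned for one feature."""
    return min(max_cat_threshold, n_bins_feature)


def prefix_to_cat_bits(sorted_idx: list, best_thresh: int, ibegin: int,
                       n_bins_feature: int) -> list:
    """Translate the winning prefix of the sorted categories into a bit set.

    The bit set is sized for the feature's full category range, which is why the
    C++ uses ComputeStorageSize(n_bins_feature + 1) rather than the capped count.
    """
    cat_bits = [False] * (n_bins_feature + 1)
    partition = best_thresh - ibegin + 1  # categories placed on one side (forward scan)
    for c in sorted_idx[:partition]:
        cat_bits[c] = True
    return cat_bits
```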
8 changes: 8 additions & 0 deletions src/tree/param.h
@@ -40,6 +40,8 @@ struct TrainParam : public XGBoostParameter<TrainParam> {

uint32_t max_cat_to_onehot{4};

bst_bin_t max_cat_threshold{64};

//----- the rest parameters are less important ----
// minimum amount of hessian(weight) allowed in a child
float min_child_weight;
@@ -113,6 +115,12 @@ struct TrainParam : public XGBoostParameter<TrainParam> {
.set_default(4)
.set_lower_bound(1)
.describe("Maximum number of categories to use one-hot encoding based split.");
DMLC_DECLARE_FIELD(max_cat_threshold)
.set_default(64)
.set_lower_bound(1)
.describe(
"Maximum number of categories considered for split. Used only by partition-based"
"splits.");
DMLC_DECLARE_FIELD(min_child_weight)
.set_lower_bound(0.0f)
.set_default(1.0f)
6 changes: 3 additions & 3 deletions tests/python-gpu/test_gpu_updaters.py
@@ -74,8 +74,8 @@ def test_sparse(self, dataset):
strategies.integers(1, 2), strategies.integers(4, 7))
@settings(deadline=None, print_blob=True)
@pytest.mark.skipif(**tm.no_pandas())
def test_categorical(self, rows, cols, rounds, cats):
self.cputest.run_categorical_basic(rows, cols, rounds, cats, "gpu_hist")
def test_categorical_ohe(self, rows, cols, rounds, cats):
self.cputest.run_categorical_ohe(rows, cols, rounds, cats, "gpu_hist")

@given(
strategies.integers(10, 400),
@@ -96,7 +96,7 @@ def test_categorical_32_cat(self):
cols = 10
cats = 32
rounds = 4
self.cputest.run_categorical_basic(rows, cols, rounds, cats, "gpu_hist")
self.cputest.run_categorical_ohe(rows, cols, rounds, cats, "gpu_hist")

@pytest.mark.skipif(**tm.no_cupy())
def test_invalid_category(self):
62 changes: 58 additions & 4 deletions tests/python/test_updaters.py
@@ -31,6 +31,14 @@
x['max_depth'] > 0 or x['grow_policy'] == 'lossguide'))


cat_parameter_strategy = strategies.fixed_dictionaries(
{
"max_cat_to_onehot": strategies.integers(1, 128),
"max_cat_threshold": strategies.integers(1, 128),
}
)


def train_result(param, dmat, num_rounds):
result = {}
xgb.train(param, dmat, num_rounds, [(dmat, 'train')], verbose_eval=False,
@@ -253,7 +261,7 @@ def run(max_cat_to_onehot: int):
# Test with partition-based split
run(self.USE_PART)

def run_categorical_basic(self, rows, cols, rounds, cats, tree_method):
def run_categorical_ohe(self, rows, cols, rounds, cats, tree_method):
onehot, label = tm.make_categorical(rows, cols, cats, True)
cat, _ = tm.make_categorical(rows, cols, cats, False)

@@ -328,9 +336,55 @@ def run_categorical_basic(self, rows, cols, rounds, cats, tree_method):
strategies.integers(1, 2), strategies.integers(4, 7))
@settings(deadline=None, print_blob=True)
@pytest.mark.skipif(**tm.no_pandas())
def test_categorical(self, rows, cols, rounds, cats):
self.run_categorical_basic(rows, cols, rounds, cats, "approx")
self.run_categorical_basic(rows, cols, rounds, cats, "hist")
def test_categorical_ohe(self, rows, cols, rounds, cats):
self.run_categorical_ohe(rows, cols, rounds, cats, "approx")
self.run_categorical_ohe(rows, cols, rounds, cats, "hist")

@given(
tm.categorical_dataset_strategy,
exact_parameter_strategy,
hist_parameter_strategy,
cat_parameter_strategy,
strategies.integers(4, 32),
strategies.sampled_from(["hist", "approx"]),
)
@settings(deadline=None, print_blob=True)
@pytest.mark.skipif(**tm.no_pandas())
def test_categorical(
self,
dataset: tm.TestDataset,
exact_parameters: Dict[str, Any],
hist_parameters: Dict[str, Any],
cat_parameters: Dict[str, Any],
n_rounds: int,
tree_method: str,
) -> None:
cat_parameters.update(exact_parameters)
cat_parameters.update(hist_parameters)
cat_parameters["tree_method"] = tree_method

results = train_result(cat_parameters, dataset.get_dmat(), n_rounds)
assert tm.non_increasing(results["train"]["rmse"])

@given(
hist_parameter_strategy,
cat_parameter_strategy,
strategies.sampled_from(["hist", "approx"]),
)
@settings(deadline=None, print_blob=True)
def test_categorical_ames_housing(
self,
hist_parameters: Dict[str, Any],
cat_parameters: Dict[str, Any],
tree_method: str,
) -> None:
cat_parameters.update(hist_parameters)
dataset = tm.TestDataset(
"ames_housing", tm.get_ames_housing, "reg:squarederror", "rmse"
)
cat_parameters["tree_method"] = tree_method
results = train_result(cat_parameters, dataset.get_dmat(), 16)
assert tm.non_increasing(results["train"]["rmse"])

@given(
strategies.integers(10, 400),
82 changes: 80 additions & 2 deletions tests/python/testing.py
@@ -214,7 +214,9 @@ def set_params(self, params_in):
return params_in

def get_dmat(self):
return xgb.DMatrix(self.X, self.y, self.w, base_margin=self.margin)
return xgb.DMatrix(
self.X, self.y, self.w, base_margin=self.margin, enable_categorical=True
)

def get_device_dmat(self):
w = None if self.w is None else cp.array(self.w)
@@ -277,6 +279,48 @@ def get_sparse():
return X, y


@memory.cache
def get_ames_housing():
"""
Number of samples: 1460
Number of features: 20
Number of categorical features: 10
Number of numerical features: 10
"""
from sklearn.datasets import fetch_openml
X, y = fetch_openml(data_id=42165, as_frame=True, return_X_y=True)

categorical_columns_subset: list[str] = [
"BldgType", # 5 cats, no nan
"GarageFinish", # 3 cats, nan
"LotConfig", # 5 cats, no nan
"Functional", # 7 cats, no nan
"MasVnrType", # 4 cats, nan
"HouseStyle", # 8 cats, no nan
"FireplaceQu", # 5 cats, nan
"ExterCond", # 5 cats, no nan
"ExterQual", # 4 cats, no nan
"PoolQC", # 3 cats, nan
]

numerical_columns_subset: list[str] = [
"3SsnPorch",
"Fireplaces",
"BsmtHalfBath",
"HalfBath",
"GarageCars",
"TotRmsAbvGrd",
"BsmtFinSF1",
"BsmtFinSF2",
"GrLivArea",
"ScreenPorch",
]

X = X[categorical_columns_subset + numerical_columns_subset]
X[categorical_columns_subset] = X[categorical_columns_subset].astype("category")
return X, y


@memory.cache
def get_mq2008(dpath):
from sklearn.datasets import load_svmlight_files
@@ -329,14 +373,48 @@ def make_categorical(
for i in range(n_features):
index = rng.randint(low=0, high=n_samples-1, size=int(n_samples * sparsity))
df.iloc[index, i] = np.NaN
assert df.iloc[:, i].isnull().values.any()
assert n_categories == np.unique(df.dtypes[i].categories).size

if onehot:
return pd.get_dummies(df), label
return df, label


def _cat_sampled_from():
@strategies.composite
def _make_cat(draw):
n_samples = draw(strategies.integers(2, 512))
n_features = draw(strategies.integers(1, 4))
n_cats = draw(strategies.integers(1, 128))
sparsity = draw(
strategies.floats(
min_value=0,
max_value=1,
allow_nan=False,
allow_infinity=False,
allow_subnormal=False,
)
)
return n_samples, n_features, n_cats, sparsity

def _build(args):
n_samples = args[0]
n_features = args[1]
n_cats = args[2]
sparsity = args[3]
return TestDataset(
f"{n_samples}x{n_features}-{n_cats}-{sparsity}",
lambda: make_categorical(n_samples, n_features, n_cats, False, sparsity),
"reg:squarederror",
"rmse",
)

return _make_cat().map(_build)


categorical_dataset_strategy = _cat_sampled_from()


@memory.cache
def make_sparse_regression(
n_samples: int, n_features: int, sparsity: float, as_dense: bool