From ac08cea16ec921a3f3407b4436381d8ed62e1493 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Mon, 4 Nov 2024 17:39:49 +0800
Subject: [PATCH] [EM] Add tests for irregular data shapes.

Add `check_uneven_sizes`, which feeds batches of 512, 256 and 1024 rows
(16 columns each) through `IteratorForTest` and checks that both `DMatrix`
and `ExtMemQuantileDMatrix` report the summed row count and 16 columns.
Run it for "cpu" and "cuda"; only the CUDA variant builds cupy inputs, so
only that test is skipped when cupy is unavailable.

Log the size of the generated Ellpack page next to the estimated size of
the input batch.

---
 python-package/xgboost/testing/data_iter.py | 20 +++++++++++++++++++-
 src/data/ellpack_page_source.cu             |  5 +++--
 tests/python-gpu/test_gpu_data_iterator.py  |  7 ++++++-
 tests/python/test_data_iterator.py          |  6 +++++-
 4 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/python-package/xgboost/testing/data_iter.py b/python-package/xgboost/testing/data_iter.py
index 924282dda872..bd612e2c3e84 100644
--- a/python-package/xgboost/testing/data_iter.py
+++ b/python-package/xgboost/testing/data_iter.py
@@ -6,7 +6,7 @@
 
 from xgboost import testing as tm
 
-from ..core import DataIter, ExtMemQuantileDMatrix, QuantileDMatrix
+from ..core import DataIter, DMatrix, ExtMemQuantileDMatrix, QuantileDMatrix
 
 
 def run_mixed_sparsity(device: str) -> None:
@@ -78,6 +78,24 @@ def reset(self) -> None:
         ExtMemQuantileDMatrix(it, enable_categorical=True)
 
 
+def check_uneven_sizes(device: str) -> None:
+    """Tests for having irregular data shapes."""
+    batches = [
+        tm.make_regression(n_samples, 16, use_cupy=device == "cuda")
+        for n_samples in [512, 256, 1024]
+    ]
+    unzip = list(zip(*batches))
+    it = tm.IteratorForTest(unzip[0], unzip[1], None, cache="cache", on_host=True)
+
+    Xy = DMatrix(it)
+    assert Xy.num_col() == 16
+    assert Xy.num_row() == sum(x.shape[0] for x in unzip[0])
+
+    Xy = ExtMemQuantileDMatrix(it)
+    assert Xy.num_col() == 16
+    assert Xy.num_row() == sum(x.shape[0] for x in unzip[0])
+
+
 class CatIter(DataIter):  # pylint: disable=too-many-instance-attributes
     """An iterator for testing categorical features."""
 
diff --git a/src/data/ellpack_page_source.cu b/src/data/ellpack_page_source.cu
index 4901f900a7d5..5fb6fc925111 100644
--- a/src/data/ellpack_page_source.cu
+++ b/src/data/ellpack_page_source.cu
@@ -404,8 +404,9 @@ void ExtEllpackPageSourceImpl::Fetch() {
                           this->GetCuts()};
       this->info_->Extend(proxy_->Info(), false, true);
     });
-    // The size of ellpack is logged in write cache.
-    LOG(INFO) << "Estimated batch size:"
+    LOG(INFO) << "Generated an Ellpack page with size: "
+              << common::HumanMemUnit(this->page_->Impl()->MemCostBytes())
+              << " from a batch with estimated size: "
               << cuda_impl::Dispatch(proxy_, [](auto const& adapter) {
                    return common::HumanMemUnit(adapter->SizeBytes());
                  });
diff --git a/tests/python-gpu/test_gpu_data_iterator.py b/tests/python-gpu/test_gpu_data_iterator.py
index 83bf44ccdef5..b3e7254244b6 100644
--- a/tests/python-gpu/test_gpu_data_iterator.py
+++ b/tests/python-gpu/test_gpu_data_iterator.py
@@ -7,7 +7,7 @@
 import xgboost as xgb
 from xgboost import testing as tm
 from xgboost.testing import no_cupy
-from xgboost.testing.data_iter import check_invalid_cat_batches
+from xgboost.testing.data_iter import check_invalid_cat_batches, check_uneven_sizes
 from xgboost.testing.updater import (
     check_categorical_missing,
     check_categorical_ohe,
@@ -231,3 +231,8 @@ def test_categorical_ohe(tree_method: str) -> None:
 @pytest.mark.skipif(**tm.no_cupy())
 def test_invalid_cat_batches() -> None:
     check_invalid_cat_batches("cuda")
+
+
+@pytest.mark.skipif(**tm.no_cupy())
+def test_uneven_sizes() -> None:
+    check_uneven_sizes("cuda")
diff --git a/tests/python/test_data_iterator.py b/tests/python/test_data_iterator.py
index c15ae77c1e19..545b849b4bdb 100644
--- a/tests/python/test_data_iterator.py
+++ b/tests/python/test_data_iterator.py
@@ -12,7 +12,7 @@
 from xgboost import testing as tm
 from xgboost.data import SingleBatchInternalIter as SingleBatch
 from xgboost.testing import IteratorForTest, make_batches, non_increasing
-from xgboost.testing.data_iter import check_invalid_cat_batches
+from xgboost.testing.data_iter import check_invalid_cat_batches, check_uneven_sizes
 from xgboost.testing.updater import (
     check_categorical_missing,
     check_categorical_ohe,
@@ -375,3 +375,7 @@ def test_categorical_ohe(tree_method: str) -> None:
 
 def test_invalid_cat_batches() -> None:
     check_invalid_cat_batches("cpu")
+
+
+def test_uneven_sizes() -> None:
+    check_uneven_sizes("cpu")