[EM] Add tests for irregular data shapes. (#10980)
- More tests.
- Recommend the arena allocator in the documentation.
trivialfis authored Nov 5, 2024
1 parent 3b8f432 commit ccc5f05
Showing 8 changed files with 64 additions and 18 deletions.
3 changes: 2 additions & 1 deletion demo/dask/forward_logging.py
@@ -1,4 +1,5 @@
"""Example of forwarding evaluation logs to the client
"""
Example of forwarding evaluation logs to the client
===================================================
The example runs on GPU. Two classes are defined to show how to use Dask builtins to
22 changes: 17 additions & 5 deletions demo/guide-python/distributed_extmem_basic.py
@@ -13,6 +13,7 @@
If `device` is `cuda`, the following are also needed:
- cupy
- python-cuda
- rmm
"""
@@ -104,11 +105,22 @@ def setup_rmm() -> None:
    if not xgboost.build_info()["USE_RMM"]:
        return

    # The combination of pool and async is by design. As XGBoost needs to allocate large
    # pages repeatedly, it's not easy to handle fragmentation. We can use more experiments
    # here.
    mr = rmm.mr.PoolMemoryResource(rmm.mr.CudaAsyncMemoryResource())
    rmm.mr.set_current_device_resource(mr)
    try:
        from cuda import cudart
        from rmm.mr import ArenaMemoryResource

        status, free, total = cudart.cudaMemGetInfo()
        if status != cudart.cudaError_t.cudaSuccess:
            raise RuntimeError(cudart.cudaGetErrorString(status))

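        # Size the arena to ~90% of the device's total memory so that the large,
        # repeated page allocations are served from a single arena, which keeps
        # fragmentation in check.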
        mr = rmm.mr.CudaMemoryResource()
        mr = ArenaMemoryResource(mr, arena_size=int(total * 0.9))
    except ImportError:
        # The combination of pool and async is by design. As XGBoost needs to allocate
        # large pages repeatedly, it's not easy to handle fragmentation. We can use more
        # experiments here.
        mr = rmm.mr.PoolMemoryResource(rmm.mr.CudaAsyncMemoryResource())
    rmm.mr.set_current_device_resource(mr)
    # Set the allocator for cupy as well.
    cp.cuda.set_allocator(rmm_cupy_allocator)

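A small usage note, as a hedged sketch rather than part of the commit: after calling the setup_rmm helper above, the active RMM resource can be inspected to see whether the arena path or the pool/async fallback was taken (assuming rmm is installed and setup_rmm is in scope).

import rmm

setup_rmm()
# The class name of the active resource tells us which branch was taken:
# ArenaMemoryResource when cuda-python and the arena allocator are available,
# PoolMemoryResource otherwise.
print(type(rmm.mr.get_current_device_resource()).__name__)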
1 change: 1 addition & 0 deletions doc/conf.py
@@ -294,6 +294,7 @@ def is_readthedocs_build():
"dask": ("https://docs.dask.org/en/stable/", None),
"distributed": ("https://distributed.dask.org/en/stable/", None),
"pyspark": ("https://spark.apache.org/docs/latest/api/python/", None),
"rmm": ("https://docs.rapids.ai/api/rmm/nightly/", None),
}


18 changes: 11 additions & 7 deletions doc/tutorials/external_memory.rst
@@ -138,6 +138,8 @@ the GPU. Following is a snippet from :ref:`sphx_glr_python_examples_external_mem
# It's important to use RMM for GPU-based external memory to improve performance.
# If XGBoost is not built with RMM support, a warning will be raised.
# We use the pool memory resource here; you can also try the `ArenaMemoryResource` for
# improved memory fragmentation handling.
mr = rmm.mr.PoolMemoryResource(rmm.mr.CudaAsyncMemoryResource())
rmm.mr.set_current_device_resource(mr)
# Set the allocator for cupy as well.
@@ -278,13 +280,15 @@ determines the time it takes to run inference, even if a C2C link is available.
Xy_valid = xgboost.ExtMemQuantileDMatrix(it_valid, max_bin=n_bins, ref=Xy_train)
In addition, the GPU implementation relies on an asynchronous memory pool, which is
subject to memory fragmentation even if the ``CudaAsyncMemoryResource`` is used. You might
want to start the training with a fresh pool instead of starting training right after the
ETL process. If you run into out-of-memory errors and you are convinced that the pool is
not full yet (pool memory usage can be profiled with ``nsight-system``), consider tuning
the RMM memory resource like using ``rmm.mr.CudaAsyncMemoryResource`` in conjunction with
``rmm.mr.BinningMemoryResource(mr, 21, 25)`` instead of the
``rmm.mr.PoolMemoryResource(mr)`` shown in the example.
subject to memory fragmentation even if the :py:class:`~rmm.mr.CudaAsyncMemoryResource` is
used. You might want to start training with a fresh pool instead of starting it right
after the ETL process. If you run into out-of-memory errors and you are convinced that
the pool is not full yet (pool memory usage can be profiled with ``nsight-systems``),
consider tuning the RMM memory resource, for example by using
:py:class:`~rmm.mr.CudaAsyncMemoryResource` in conjunction with
:py:class:`BinningMemoryResource(mr, 21, 25) <rmm.mr.BinningMemoryResource>` instead of
the :py:class:`~rmm.mr.PoolMemoryResource`. Alternatively, the
:py:class:`~rmm.mr.ArenaMemoryResource` is also an excellent option.
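For concreteness, a minimal sketch of the two alternatives mentioned above (not part of the tutorial's code; it assumes ``rmm`` and ``cupy`` are installed):

import cupy as cp
import rmm
from rmm.allocators.cupy import rmm_cupy_allocator

# Option 1: the async allocator fronted by a binning resource (bin sizes 2^21 to 2^25 bytes).
mr = rmm.mr.BinningMemoryResource(rmm.mr.CudaAsyncMemoryResource(), 21, 25)

# Option 2: an arena allocator on top of a plain CUDA resource.
# mr = rmm.mr.ArenaMemoryResource(rmm.mr.CudaMemoryResource())

rmm.mr.set_current_device_resource(mr)
# Route CuPy allocations through RMM as well, as in the snippet earlier in this tutorial.
cp.cuda.set_allocator(rmm_cupy_allocator)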

During CPU benchmarking, we used an NVMe connected to a PCIe-4 slot. Other types of
storage can be too slow for practical usage. However, your system will likely perform some
20 changes: 19 additions & 1 deletion python-package/xgboost/testing/data_iter.py
@@ -6,7 +6,7 @@

from xgboost import testing as tm

from ..core import DataIter, ExtMemQuantileDMatrix, QuantileDMatrix
from ..core import DataIter, DMatrix, ExtMemQuantileDMatrix, QuantileDMatrix


def run_mixed_sparsity(device: str) -> None:
@@ -78,6 +78,24 @@ def reset(self) -> None:
ExtMemQuantileDMatrix(it, enable_categorical=True)


def check_uneven_sizes(device: str) -> None:
    """Tests for having irregular data shapes."""
    batches = [
        tm.make_regression(n_samples, 16, use_cupy=device == "cuda")
        for n_samples in [512, 256, 1024]
    ]
    unzip = list(zip(*batches))
    it = tm.IteratorForTest(unzip[0], unzip[1], None, cache="cache", on_host=True)

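    # Both the external-memory DMatrix and the ExtMemQuantileDMatrix should report the
    # same column count and the total row count across the unevenly sized batches.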
    Xy = DMatrix(it)
    assert Xy.num_col() == 16
    assert Xy.num_row() == sum(x.shape[0] for x in unzip[0])

    Xy = ExtMemQuantileDMatrix(it)
    assert Xy.num_col() == 16
    assert Xy.num_row() == sum(x.shape[0] for x in unzip[0])


class CatIter(DataIter): # pylint: disable=too-many-instance-attributes
"""An iterator for testing categorical features."""

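For the user-facing picture, a minimal, hedged sketch (not part of the commit) of a custom DataIter that yields batches with differing row counts, which is the situation the new check_uneven_sizes helper exercises through tm.IteratorForTest:

import numpy as np
import xgboost


class UnevenIter(xgboost.DataIter):
    """Yield three batches whose row counts differ (512, 256, 1024)."""

    def __init__(self) -> None:
        rng = np.random.default_rng(2024)
        self._batches = [
            (rng.normal(size=(n, 16)), rng.normal(size=n)) for n in (512, 256, 1024)
        ]
        self._it = 0
        super().__init__(cache_prefix="cache")

    def next(self, input_data) -> bool:
        if self._it == len(self._batches):
            return False  # no more batches
        X, y = self._batches[self._it]
        input_data(data=X, label=y)
        self._it += 1
        return True

    def reset(self) -> None:
        self._it = 0


Xy = xgboost.DMatrix(UnevenIter())
assert Xy.num_col() == 16
assert Xy.num_row() == 512 + 256 + 1024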
5 changes: 3 additions & 2 deletions src/data/ellpack_page_source.cu
@@ -404,8 +404,9 @@ void ExtEllpackPageSourceImpl<F>::Fetch() {
this->GetCuts()};
this->info_->Extend(proxy_->Info(), false, true);
});
// The size of ellpack is logged in write cache.
LOG(INFO) << "Estimated batch size:"
LOG(INFO) << "Generated an Ellpack page with size: "
<< common::HumanMemUnit(this->page_->Impl()->MemCostBytes())
<< " from an batch with estimated size: "
<< cuda_impl::Dispatch<false>(proxy_, [](auto const& adapter) {
return common::HumanMemUnit(adapter->SizeBytes());
});
6 changes: 5 additions & 1 deletion tests/python-gpu/test_gpu_data_iterator.py
@@ -7,7 +7,7 @@
import xgboost as xgb
from xgboost import testing as tm
from xgboost.testing import no_cupy
from xgboost.testing.data_iter import check_invalid_cat_batches
from xgboost.testing.data_iter import check_invalid_cat_batches, check_uneven_sizes
from xgboost.testing.updater import (
    check_categorical_missing,
    check_categorical_ohe,
@@ -231,3 +231,7 @@ def test_categorical_ohe(tree_method: str) -> None:
@pytest.mark.skipif(**tm.no_cupy())
def test_invalid_cat_batches() -> None:
    check_invalid_cat_batches("cuda")


def test_uneven_sizes() -> None:
    check_uneven_sizes("cuda")
7 changes: 6 additions & 1 deletion tests/python/test_data_iterator.py
@@ -12,7 +12,7 @@
from xgboost import testing as tm
from xgboost.data import SingleBatchInternalIter as SingleBatch
from xgboost.testing import IteratorForTest, make_batches, non_increasing
from xgboost.testing.data_iter import check_invalid_cat_batches
from xgboost.testing.data_iter import check_invalid_cat_batches, check_uneven_sizes
from xgboost.testing.updater import (
    check_categorical_missing,
    check_categorical_ohe,
@@ -375,3 +375,8 @@ def test_categorical_ohe(tree_method: str) -> None:

def test_invalid_cat_batches() -> None:
    check_invalid_cat_batches("cpu")


@pytest.mark.skipif(**tm.no_cupy())
def test_uneven_sizes() -> None:
    check_uneven_sizes("cpu")
