From 6b48d3ab38bbf9b208505ae94eb3c515b5f04cea Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Mon, 4 Nov 2024 18:01:07 +0800
Subject: [PATCH] Add documents.

---
 demo/dask/forward_logging.py                  |  3 ++-
 demo/guide-python/distributed_extmem_basic.py | 22 +++++++++++++++++-----
 doc/conf.py                                   |  1 +
 doc/tutorials/external_memory.rst             | 18 +++++++++++-------
 4 files changed, 31 insertions(+), 13 deletions(-)

diff --git a/demo/dask/forward_logging.py b/demo/dask/forward_logging.py
index d49d8c1cbfe6..37189e8a429a 100644
--- a/demo/dask/forward_logging.py
+++ b/demo/dask/forward_logging.py
@@ -1,4 +1,5 @@
-"""Example of forwarding evaluation logs to the client
+"""
+Example of forwarding evaluation logs to the client
 ===================================================
 
 The example runs on GPU. Two classes are defined to show how to use Dask builtins to
diff --git a/demo/guide-python/distributed_extmem_basic.py b/demo/guide-python/distributed_extmem_basic.py
index 2ee9b33f6684..0a735286ae85 100644
--- a/demo/guide-python/distributed_extmem_basic.py
+++ b/demo/guide-python/distributed_extmem_basic.py
@@ -13,6 +13,7 @@
 If `device` is `cuda`, following are also needed:
 
 - cupy
+- python-cuda
 - rmm
 
 """
@@ -25,6 +26,7 @@
 from typing import Callable, List, Tuple
 
 import numpy as np
+from cuda import cudart
 from loky import get_reusable_executor
 from sklearn.datasets import make_regression
 
@@ -104,11 +106,21 @@ def setup_rmm() -> None:
     if not xgboost.build_info()["USE_RMM"]:
         return
 
-    # The combination of pool and async is by design. As XGBoost needs to allocate large
-    # pages repeatly, it's not easy to handle fragmentation. We can use more experiments
-    # here.
-    mr = rmm.mr.PoolMemoryResource(rmm.mr.CudaAsyncMemoryResource())
-    rmm.mr.set_current_device_resource(mr)
+    try:
+        from rmm.mr import ArenaMemoryResource
+
+        status, free, total = cudart.cudaMemGetInfo()
+        if status != cudart.cudaError_t.cudaSuccess:
+            raise RuntimeError(cudart.cudaGetErrorString(status))
+
+        mr = rmm.mr.CudaMemoryResource()
+        mr = ArenaMemoryResource(mr, arena_size=int(total * 0.9))
+    except ImportError:
+        # The combination of pool and async is by design. As XGBoost needs to allocate
+        # large pages repeatedly, it's not easy to handle fragmentation. More
+        # experimentation is possible here.
+        mr = rmm.mr.PoolMemoryResource(rmm.mr.CudaAsyncMemoryResource())
+    rmm.mr.set_current_device_resource(mr)
     # Set the allocator for cupy as well.
     cp.cuda.set_allocator(rmm_cupy_allocator)
 
diff --git a/doc/conf.py b/doc/conf.py
index a2546cbbc336..89dc0f4eaee2 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -294,6 +294,7 @@ def is_readthedocs_build():
     "dask": ("https://docs.dask.org/en/stable/", None),
     "distributed": ("https://distributed.dask.org/en/stable/", None),
     "pyspark": ("https://spark.apache.org/docs/latest/api/python/", None),
+    "rmm": ("https://docs.rapids.ai/api/rmm/nightly/", None),
 }
 
 
diff --git a/doc/tutorials/external_memory.rst b/doc/tutorials/external_memory.rst
index c0fa7fd98769..9e641f8a9ca3 100644
--- a/doc/tutorials/external_memory.rst
+++ b/doc/tutorials/external_memory.rst
@@ -138,6 +138,8 @@ the GPU. Following is a snippet from :ref:`sphx_glr_python_examples_external_mem
 
   # It's important to use RMM for GPU-based external memory to improve performance.
  # If XGBoost is not built with RMM support, a warning will be raised.
+  # We use the pool memory resource here; you can also try the `ArenaMemoryResource` for
+  # improved handling of memory fragmentation.
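+  #
+  # A rough sketch of the arena-based alternative (assuming the optional `python-cuda`
+  # package and `rmm.mr.ArenaMemoryResource` are available, as in the demo above):
+  #
+  #   from cuda import cudart
+  #
+  #   status, free, total = cudart.cudaMemGetInfo()
+  #   if status != cudart.cudaError_t.cudaSuccess:
+  #       raise RuntimeError(cudart.cudaGetErrorString(status))
+  #   mr = rmm.mr.ArenaMemoryResource(
+  #       rmm.mr.CudaMemoryResource(), arena_size=int(total * 0.9)
+  #   )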
   mr = rmm.mr.PoolMemoryResource(rmm.mr.CudaAsyncMemoryResource())
   rmm.mr.set_current_device_resource(mr)
   # Set the allocator for cupy as well.
@@ -278,13 +280,15 @@ determines the time it takes to run inference, even if a C2C link is available.
   Xy_valid = xgboost.ExtMemQuantileDMatrix(it_valid, max_bin=n_bins, ref=Xy_train)
 
 In addition, since the GPU implementation relies on asynchronous memory pool, which is
-subject to memory fragmentation even if the ``CudaAsyncMemoryResource`` is used. You might
-want to start the training with a fresh pool instead of starting training right after the
-ETL process. If you run into out-of-memory errors and you are convinced that the pool is
-not full yet (pool memory usage can be profiled with ``nsight-system``), consider tuning
-the RMM memory resource like using ``rmm.mr.CudaAsyncMemoryResource`` in conjunction with
-``rmm.mr.BinningMemoryResource(mr, 21, 25)`` instead of the
-``rmm.mr.PoolMemoryResource(mr)`` shown in the example.
+subject to memory fragmentation even if the :py:class:`CudaAsyncMemoryResource` is
+used, you might want to start the training with a fresh pool instead of starting training
+right after the ETL process. If you run into out-of-memory errors and you are convinced
+that the pool is not full yet (pool memory usage can be profiled with ``nsight-system``),
+consider tuning the RMM memory resource, for example by using
+:py:class:`rmm.mr.CudaAsyncMemoryResource` in conjunction with
+:py:class:`rmm.mr.BinningMemoryResource(mr, 21, 25)` instead of the
+:py:class:`rmm.mr.PoolMemoryResource(mr)`. Alternatively, the
+:py:class:`rmm.mr.ArenaMemoryResource` is also an excellent option.
 
 During CPU benchmarking, we used an NVMe connected to a PCIe-4 slot. Other types of
 storage can be too slow for practical usage. However, your system will likely perform some