From 6b48d3ab38bbf9b208505ae94eb3c515b5f04cea Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Mon, 4 Nov 2024 18:01:07 +0800
Subject: [PATCH] Add documents.

---
 demo/dask/forward_logging.py                  |  3 ++-
 demo/guide-python/distributed_extmem_basic.py | 22 +++++++++++++++++-----
 doc/conf.py                                   |  1 +
 doc/tutorials/external_memory.rst             | 18 +++++++++++-------
 4 files changed, 31 insertions(+), 13 deletions(-)

diff --git a/demo/dask/forward_logging.py b/demo/dask/forward_logging.py
index d49d8c1cbfe6..37189e8a429a 100644
--- a/demo/dask/forward_logging.py
+++ b/demo/dask/forward_logging.py
@@ -1,4 +1,5 @@
-"""Example of forwarding evaluation logs to the client
+"""
+Example of forwarding evaluation logs to the client
 ===================================================
 
 The example runs on GPU. Two classes are defined to show how to use Dask builtins to
diff --git a/demo/guide-python/distributed_extmem_basic.py b/demo/guide-python/distributed_extmem_basic.py
index 2ee9b33f6684..0a735286ae85 100644
--- a/demo/guide-python/distributed_extmem_basic.py
+++ b/demo/guide-python/distributed_extmem_basic.py
@@ -13,6 +13,7 @@
 If `device` is `cuda`, following are also needed:
 
 - cupy
+- python-cuda
 - rmm
 
 """
@@ -25,6 +26,7 @@
 from typing import Callable, List, Tuple
 
 import numpy as np
+from cuda import cudart
 from loky import get_reusable_executor
 from sklearn.datasets import make_regression
 
@@ -104,11 +106,21 @@ def setup_rmm() -> None:
     if not xgboost.build_info()["USE_RMM"]:
         return
 
-    # The combination of pool and async is by design. As XGBoost needs to allocate large
-    # pages repeatly, it's not easy to handle fragmentation. We can use more experiments
-    # here.
-    mr = rmm.mr.PoolMemoryResource(rmm.mr.CudaAsyncMemoryResource())
-    rmm.mr.set_current_device_resource(mr)
+    try:
+        from rmm.mr import ArenaMemoryResource
+
+        status, free, total = cudart.cudaMemGetInfo()
+        if status != cudart.cudaError_t.cudaSuccess:
+            raise RuntimeError(cudart.cudaGetErrorString(status))
+
+        mr = rmm.mr.CudaMemoryResource()
+        mr = ArenaMemoryResource(mr, arena_size=int(total * 0.9))
+    except ImportError:
+        # The combination of pool and async is by design. As XGBoost needs to allocate
+        # large pages repeatedly, it's not easy to handle fragmentation. More
+        # experimentation is possible here.
+        mr = rmm.mr.PoolMemoryResource(rmm.mr.CudaAsyncMemoryResource())
+    rmm.mr.set_current_device_resource(mr)
     # Set the allocator for cupy as well.
     cp.cuda.set_allocator(rmm_cupy_allocator)
 
diff --git a/doc/conf.py b/doc/conf.py
index a2546cbbc336..89dc0f4eaee2 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -294,6 +294,7 @@ def is_readthedocs_build():
     "dask": ("https://docs.dask.org/en/stable/", None),
     "distributed": ("https://distributed.dask.org/en/stable/", None),
     "pyspark": ("https://spark.apache.org/docs/latest/api/python/", None),
+    "rmm": ("https://docs.rapids.ai/api/rmm/nightly/", None),
 }
 
 
diff --git a/doc/tutorials/external_memory.rst b/doc/tutorials/external_memory.rst
index c0fa7fd98769..9e641f8a9ca3 100644
--- a/doc/tutorials/external_memory.rst
+++ b/doc/tutorials/external_memory.rst
@@ -138,6 +138,8 @@ the GPU. Following is a snippet from :ref:`sphx_glr_python_examples_external_mem
 
   # It's important to use RMM for GPU-based external memory to improve performance.
  # If XGBoost is not built with RMM support, a warning will be raised.
+  # We use the pool memory resource here; you can also try the `ArenaMemoryResource` for
+  # improved handling of memory fragmentation.
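+  #
+  # A rough sketch of the arena-based alternative (assuming the optional `python-cuda`
+  # package and `rmm.mr.ArenaMemoryResource` are available, as in the demo above):
+  #
+  #   from cuda import cudart
+  #
+  #   status, free, total = cudart.cudaMemGetInfo()
+  #   if status != cudart.cudaError_t.cudaSuccess:
+  #       raise RuntimeError(cudart.cudaGetErrorString(status))
+  #   mr = rmm.mr.ArenaMemoryResource(
+  #       rmm.mr.CudaMemoryResource(), arena_size=int(total * 0.9)
+  #   )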
   mr = rmm.mr.PoolMemoryResource(rmm.mr.CudaAsyncMemoryResource())
   rmm.mr.set_current_device_resource(mr)
   # Set the allocator for cupy as well.
@@ -278,13 +280,15 @@ determines the time it takes to run inference, even if a C2C link is available.
   Xy_valid = xgboost.ExtMemQuantileDMatrix(it_valid, max_bin=n_bins, ref=Xy_train)
 
 In addition, since the GPU implementation relies on asynchronous memory pool, which is
-subject to memory fragmentation even if the ``CudaAsyncMemoryResource`` is used. You might
-want to start the training with a fresh pool instead of starting training right after the
-ETL process. If you run into out-of-memory errors and you are convinced that the pool is
-not full yet (pool memory usage can be profiled with ``nsight-system``), consider tuning
-the RMM memory resource like using ``rmm.mr.CudaAsyncMemoryResource`` in conjunction with
-``rmm.mr.BinningMemoryResource(mr, 21, 25)`` instead of the
-``rmm.mr.PoolMemoryResource(mr)`` shown in the example.
+subject to memory fragmentation even if the :py:class:`CudaAsyncMemoryResource` is
+used, you might want to start the training with a fresh pool instead of starting training
+right after the ETL process. If you run into out-of-memory errors and you are convinced
+that the pool is not full yet (pool memory usage can be profiled with ``nsight-system``),
+consider tuning the RMM memory resource, for example by using
+:py:class:`rmm.mr.CudaAsyncMemoryResource` in conjunction with
+:py:class:`rmm.mr.BinningMemoryResource(mr, 21, 25)` instead of the
+:py:class:`rmm.mr.PoolMemoryResource(mr)`. Alternatively, the
+:py:class:`rmm.mr.ArenaMemoryResource` is also an excellent option.
 
 During CPU benchmarking, we used an NVMe connected to a PCIe-4 slot. Other types of
 storage can be too slow for practical usage. However, your system will likely perform some