diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index 06badec5f2e2..1a8098071ba3 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -12,7 +12,7 @@ updates:
   - package-ecosystem: "maven"
     directory: "/jvm-packages/xgboost4j"
     schedule:
-      interval: "daily"
+      interval: "monthly"
   - package-ecosystem: "maven"
     directory: "/jvm-packages/xgboost4j-gpu"
     schedule:
@@ -24,7 +24,7 @@ updates:
   - package-ecosystem: "maven"
     directory: "/jvm-packages/xgboost4j-spark"
     schedule:
-      interval: "daily"
+      interval: "monthly"
   - package-ecosystem: "maven"
     directory: "/jvm-packages/xgboost4j-spark-gpu"
     schedule:
diff --git a/.github/workflows/i386.yml b/.github/workflows/i386.yml
index a7c71a590cb8..78dc92bffff6 100644
--- a/.github/workflows/i386.yml
+++ b/.github/workflows/i386.yml
@@ -23,7 +23,7 @@ jobs:
       with:
         submodules: 'true'
     - name: Set up Docker Buildx
-      uses: docker/setup-buildx-action@v3
+      uses: docker/setup-buildx-action@v3.4.0
       with:
         driver-opts: network=host
     - name: Build and push container
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 001e17b510a3..0408f358fba5 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -180,7 +180,7 @@ jobs:
     - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
       with:
         submodules: 'true'
-    - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0
+    - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1
       with:
         python-version: "3.8"
         architecture: 'x64'
diff --git a/.github/workflows/python_tests.yml b/.github/workflows/python_tests.yml
index 83f0ad495fc3..e232cd754f7b 100644
--- a/.github/workflows/python_tests.yml
+++ b/.github/workflows/python_tests.yml
@@ -319,7 +319,7 @@ jobs:
           submodules: 'true'
 
       - name: Set up Python 3.8
-        uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0
+        uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1
         with:
           python-version: 3.8
 
diff --git a/.github/workflows/r_tests.yml b/.github/workflows/r_tests.yml
index 9fb9d4684ad1..4298fd9c6a5d 100644
--- a/.github/workflows/r_tests.yml
+++ b/.github/workflows/r_tests.yml
@@ -84,7 +84,7 @@ jobs:
         key: ${{ runner.os }}-r-${{ matrix.config.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }}
         restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }}
 
-    - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0
+    - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1
       with:
         python-version: "3.8"
         architecture: 'x64'
diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml
index 222700da4a58..a108f282214f 100644
--- a/.github/workflows/scorecards.yml
+++ b/.github/workflows/scorecards.yml
@@ -41,7 +41,7 @@ jobs:
       # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF
       # format to the repository Actions tab.
       - name: "Upload artifact"
-        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # v4.3.3
+        uses: actions/upload-artifact@0b2256b8c012f0828dc542b3febcab082c67f72b # v4.3.4
         with:
           name: SARIF file
           path: results.sarif
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f7cf8a6cfa87..034d52164ad2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -71,7 +71,6 @@ option(HIDE_CXX_SYMBOLS "Build shared library and hide all C++ symbols" OFF)
 option(KEEP_BUILD_ARTIFACTS_IN_BINARY_DIR "Output build artifacts in CMake binary dir" OFF)
 ## CUDA
 option(USE_CUDA  "Build with GPU acceleration" OFF)
-option(USE_PER_THREAD_DEFAULT_STREAM "Build with per-thread default stream" ON)
 option(USE_NCCL  "Build with NCCL to enable distributed GPU support." OFF)
 # This is specifically designed for PyPI binary release and should be disabled for most of the cases.
 option(USE_DLOPEN_NCCL "Whether to load nccl dynamically." OFF)
diff --git a/R-package/src/Makevars.in b/R-package/src/Makevars.in
index 5fbf479c5640..ed4b38f99ea7 100644
--- a/R-package/src/Makevars.in
+++ b/R-package/src/Makevars.in
@@ -113,6 +113,7 @@ OBJECTS= \
     $(PKGROOT)/src/common/charconv.o \
     $(PKGROOT)/src/common/column_matrix.o \
     $(PKGROOT)/src/common/common.o \
+    $(PKGROOT)/src/common/cuda_rt_utils.o \
     $(PKGROOT)/src/common/error_msg.o \
     $(PKGROOT)/src/common/hist_util.o \
     $(PKGROOT)/src/common/host_device_vector.o \
diff --git a/R-package/src/Makevars.win b/R-package/src/Makevars.win
index a5a5c131edf3..d4330120f890 100644
--- a/R-package/src/Makevars.win
+++ b/R-package/src/Makevars.win
@@ -113,6 +113,7 @@ OBJECTS= \
     $(PKGROOT)/src/common/charconv.o \
     $(PKGROOT)/src/common/column_matrix.o \
     $(PKGROOT)/src/common/common.o \
+    $(PKGROOT)/src/common/cuda_rt_utils.o \
     $(PKGROOT)/src/common/error_msg.o \
     $(PKGROOT)/src/common/hist_util.o \
     $(PKGROOT)/src/common/host_device_vector.o \
diff --git a/README.md b/README.md
index b27cce673585..1126a588d8ae 100644
--- a/README.md
+++ b/README.md
@@ -4,8 +4,8 @@
 [![Build Status](https://badge.buildkite.com/aca47f40a32735c00a8550540c5eeff6a4c1d246a580cae9b0.svg?branch=master)](https://buildkite.com/xgboost/xgboost-ci)
 [![XGBoost-CI](https://github.com/dmlc/xgboost/workflows/XGBoost-CI/badge.svg?branch=master)](https://github.com/dmlc/xgboost/actions)
 [![Documentation Status](https://readthedocs.org/projects/xgboost/badge/?version=latest)](https://xgboost.readthedocs.org)
-[![GitHub license](http://dmlc.github.io/img/apache2.svg)](./LICENSE)
-[![CRAN Status Badge](http://www.r-pkg.org/badges/version/xgboost)](http://cran.r-project.org/web/packages/xgboost)
+[![GitHub license](https://dmlc.github.io/img/apache2.svg)](./LICENSE)
+[![CRAN Status Badge](https://www.r-pkg.org/badges/version/xgboost)](https://cran.r-project.org/web/packages/xgboost)
 [![PyPI version](https://badge.fury.io/py/xgboost.svg)](https://pypi.python.org/pypi/xgboost/)
 [![Conda version](https://img.shields.io/conda/vn/conda-forge/py-xgboost.svg)](https://anaconda.org/conda-forge/py-xgboost)
 [![Optuna](https://img.shields.io/badge/Optuna-integrated-blue)](https://optuna.org)
@@ -35,7 +35,7 @@ Checkout the [Community Page](https://xgboost.ai/community).
 
 Reference
 ---------
-- Tianqi Chen and Carlos Guestrin. [XGBoost: A Scalable Tree Boosting System](http://arxiv.org/abs/1603.02754). In 22nd SIGKDD Conference on Knowledge Discovery and Data Mining, 2016
+- Tianqi Chen and Carlos Guestrin. [XGBoost: A Scalable Tree Boosting System](https://arxiv.org/abs/1603.02754). In 22nd SIGKDD Conference on Knowledge Discovery and Data Mining, 2016
 - XGBoost originates from research project at University of Washington.
 
 Sponsors
diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake
index 9563ec3076b2..266cf29b38b2 100644
--- a/cmake/Utils.cmake
+++ b/cmake/Utils.cmake
@@ -80,12 +80,8 @@ function(xgboost_set_cuda_flags target)
     $<$<COMPILE_LANGUAGE:CUDA>:--expt-extended-lambda>
     $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>
     $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=${OpenMP_CXX_FLAGS}>
-    $<$<COMPILE_LANGUAGE:CUDA>:-Xfatbin=-compress-all>)
-
-  if(USE_PER_THREAD_DEFAULT_STREAM)
-    target_compile_options(${target} PRIVATE
-            $<$<COMPILE_LANGUAGE:CUDA>:--default-stream per-thread>)
-  endif()
+    $<$<COMPILE_LANGUAGE:CUDA>:-Xfatbin=-compress-all>
+    $<$<COMPILE_LANGUAGE:CUDA>:--default-stream per-thread>)
 
   if(FORCE_COLORED_OUTPUT)
     if(FORCE_COLORED_OUTPUT AND (CMAKE_GENERATOR STREQUAL "Ninja") AND
diff --git a/dev/release-artifacts.py b/dev/release-artifacts.py
index 1e0b5723e89b..f53ef134630b 100644
--- a/dev/release-artifacts.py
+++ b/dev/release-artifacts.py
@@ -2,6 +2,7 @@
 
 tqdm, sh are required to run this script.
 """
+
 import argparse
 import os
 import shutil
@@ -106,6 +107,15 @@ def make_pysrc_wheel(
     if not os.path.exists(dist):
         os.mkdir(dist)
 
+    # Apply patch to remove NCCL dependency
+    # Save the original content of pyproject.toml so that we can restore it later
+    with DirectoryExcursion(ROOT):
+        with open("python-package/pyproject.toml", "r") as f:
+            orig_pyproj_lines = f.read()
+        with open("tests/buildkite/remove_nccl_dep.patch", "r") as f:
+            patch_lines = f.read()
+        subprocess.run(["patch", "-p0"], input=patch_lines, text=True)
+
     with DirectoryExcursion(os.path.join(ROOT, "python-package")):
         subprocess.check_call(["python", "-m", "build", "--sdist"])
         if rc is not None:
@@ -117,6 +127,10 @@ def make_pysrc_wheel(
         target = os.path.join(dist, name)
         shutil.move(src, target)
 
+    with DirectoryExcursion(ROOT):
+        with open("python-package/pyproject.toml", "w") as f:
+            print(orig_pyproj_lines, file=f, end="")
+
 
 def download_py_packages(
     branch: str, major: int, minor: int, commit_hash: str, outdir: str
diff --git a/doc/install.rst b/doc/install.rst
index e5229702e16c..79082a7ed581 100644
--- a/doc/install.rst
+++ b/doc/install.rst
@@ -76,6 +76,19 @@ Capabilities of binary wheels for each platform:
 | Windows             | |tick|  |  |cross|             |
 +---------------------+---------+----------------------+
 
+Minimal installation (CPU-only)
+*******************************
+The default installation with ``pip`` will install the full XGBoost package, including the support for the GPU algorithms and federated learning.
+
+You may choose to reduce the size of the installed package and save the disk space, by opting to install ``xgboost-cpu`` instead:
+
+.. code-block:: bash
+
+  pip install xgboost-cpu
+
+The ``xgboost-cpu`` variant will have drastically smaller disk footprint, but does not provide some features, such as the GPU algorithms and
+federated learning.
+
 Conda
 *****
 
diff --git a/include/xgboost/base.h b/include/xgboost/base.h
index 9abe72b87859..64aab5c41b0c 100644
--- a/include/xgboost/base.h
+++ b/include/xgboost/base.h
@@ -7,6 +7,8 @@
 #define XGBOOST_BASE_H_
 
 #include <dmlc/omp.h>  // for omp_uint, omp_ulong
+// Put the windefs here to guard as many files as possible.
+#include <xgboost/windefs.h>
 
 #include <cstdint>  // for int32_t, uint64_t, int16_t
 #include <ostream>  // for ostream
diff --git a/include/xgboost/collective/poll_utils.h b/include/xgboost/collective/poll_utils.h
index a4d2fbacda27..41b674964efc 100644
--- a/include/xgboost/collective/poll_utils.h
+++ b/include/xgboost/collective/poll_utils.h
@@ -4,13 +4,14 @@
  * \author Tianqi Chen
  */
 #pragma once
-#include "xgboost/collective/result.h"
-#include "xgboost/collective/socket.h"
+#include <xgboost/collective/result.h>
+#include <xgboost/collective/socket.h>
 
 #if defined(_WIN32)
+#include <xgboost/windefs.h>
+// Socket API
 #include <winsock2.h>
 #include <ws2tcpip.h>
-
 #else
 
 #include <arpa/inet.h>
diff --git a/include/xgboost/collective/socket.h b/include/xgboost/collective/socket.h
index c5dd977f6255..bf5fffdaf155 100644
--- a/include/xgboost/collective/socket.h
+++ b/include/xgboost/collective/socket.h
@@ -1,12 +1,8 @@
 /**
- * Copyright (c) 2022-2024, XGBoost Contributors
+ * Copyright 2022-2024, XGBoost Contributors
  */
 #pragma once
 
-#if !defined(NOMINMAX) && defined(_WIN32)
-#define NOMINMAX
-#endif                   // !defined(NOMINMAX)
-
 #include <cerrno>        // errno, EINTR, EBADF
 #include <climits>       // HOST_NAME_MAX
 #include <cstddef>       // std::size_t
@@ -18,18 +14,12 @@
 
 #if defined(__linux__)
 #include <sys/ioctl.h>  // for TIOCOUTQ, FIONREAD
-#endif  // defined(__linux__)
-
-#if !defined(xgboost_IS_MINGW)
-
-#if defined(__MINGW32__)
-#define xgboost_IS_MINGW 1
-#endif  // defined(__MINGW32__)
-
-#endif  // xgboost_IS_MINGW
+#endif                  // defined(__linux__)
 
 #if defined(_WIN32)
-
+// Guard the include.
+#include <xgboost/windefs.h>
+// Socket API
 #include <winsock2.h>
 #include <ws2tcpip.h>
 
@@ -41,9 +31,9 @@ using in_port_t = std::uint16_t;
 
 #if !defined(xgboost_IS_MINGW)
 using ssize_t = int;
-#endif                    // !xgboost_IS_MINGW()
+#endif  // !xgboost_IS_MINGW()
 
-#else                     // UNIX
+#else  // UNIX
 
 #include <arpa/inet.h>    // inet_ntop
 #include <fcntl.h>        // fcntl, F_GETFL, O_NONBLOCK
@@ -839,7 +829,3 @@ Result INetNToP(H const &host, std::string *p_out) {
 }  // namespace xgboost
 
 #undef xgboost_CHECK_SYS_CALL
-
-#if defined(xgboost_IS_MINGW)
-#undef xgboost_IS_MINGW
-#endif
diff --git a/include/xgboost/windefs.h b/include/xgboost/windefs.h
new file mode 100644
index 000000000000..e7e743184a17
--- /dev/null
+++ b/include/xgboost/windefs.h
@@ -0,0 +1,33 @@
+/**
+ * Copyright 2024, XGBoost Contributors
+ *
+ * @brief Macro for Windows.
+ */
+#pragma once
+
+#if !defined(xgboost_IS_WIN)
+
+#if defined(_MSC_VER) || defined(__MINGW32__)
+#define xgboost_IS_WIN 1
+#endif  // defined(_MSC_VER) || defined(__MINGW32__)
+
+#endif  // !defined(xgboost_IS_WIN)
+
+#if defined(xgboost_IS_WIN)
+
+#if !defined(NOMINMAX)
+#define NOMINMAX
+#endif  // !defined(NOMINMAX)
+
+// A macro used inside `windows.h` to avoid conflicts with `winsock2.h`
+#define WIN32_LEAN_AND_MEAN
+
+#if !defined(xgboost_IS_MINGW)
+
+#if defined(__MINGW32__)
+#define xgboost_IS_MINGW 1
+#endif  // defined(__MINGW32__)
+
+#endif  // xgboost_IS_MINGW
+
+#endif  // defined(xgboost_IS_WIN)
diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml
index fb6aaf020c3e..a154f2d489ae 100644
--- a/jvm-packages/pom.xml
+++ b/jvm-packages/pom.xml
@@ -37,17 +37,17 @@
         <junit.version>4.13.2</junit.version>
         <spark.version>3.5.1</spark.version>
         <spark.version.gpu>3.5.1</spark.version.gpu>
-        <fasterxml.jackson.version>2.15.2</fasterxml.jackson.version>
+        <fasterxml.jackson.version>2.17.2</fasterxml.jackson.version>
         <scala.version>2.12.18</scala.version>
         <scala.binary.version>2.12</scala.binary.version>
         <hadoop.version>3.4.0</hadoop.version>
         <maven.wagon.http.retryHandler.count>5</maven.wagon.http.retryHandler.count>
         <log.capi.invocation>OFF</log.capi.invocation>
         <use.cuda>OFF</use.cuda>
-        <cudf.version>24.04.0</cudf.version>
-        <spark.rapids.version>24.04.1</spark.rapids.version>
+        <cudf.version>24.06.0</cudf.version>
+        <spark.rapids.version>24.06.0</spark.rapids.version>
         <cudf.classifier>cuda12</cudf.classifier>
-        <scalatest.version>3.2.18</scalatest.version>
+        <scalatest.version>3.2.19</scalatest.version>
         <scala-collection-compat.version>2.12.0</scala-collection-compat.version>
         <skip.native.build>false</skip.native.build>
 
diff --git a/python-package/pyproject.toml b/python-package/pyproject.toml
index a273d8c135d8..8835def25858 100644
--- a/python-package/pyproject.toml
+++ b/python-package/pyproject.toml
@@ -7,13 +7,13 @@ build-backend = "packager.pep517"
 
 [project]
 name = "xgboost"
-version = "2.2.0-dev"
+description = "XGBoost Python Package"
+readme = { file = "README.rst", content-type = "text/x-rst" }
 authors = [
     { name = "Hyunsu Cho", email = "chohyu01@cs.washington.edu" },
     { name = "Jiaming Yuan", email = "jm.yuan@outlook.com" }
 ]
-description = "XGBoost Python Package"
-readme = { file = "README.rst", content-type = "text/x-rst" }
+version = "2.2.0-dev"
 requires-python = ">=3.8"
 license = { text = "Apache-2.0" }
 classifiers = [
diff --git a/python-package/xgboost/testing/__init__.py b/python-package/xgboost/testing/__init__.py
index e0096c89c9a8..b934e99e7981 100644
--- a/python-package/xgboost/testing/__init__.py
+++ b/python-package/xgboost/testing/__init__.py
@@ -45,6 +45,7 @@
     get_cancer,
     get_digits,
     get_sparse,
+    make_batches,
     memory,
 )
 
@@ -161,7 +162,16 @@ def no_cudf() -> PytestSkip:
 
 
 def no_cupy() -> PytestSkip:
-    return no_mod("cupy")
+    skip_cupy = no_mod("cupy")
+    if not skip_cupy["condition"] and system() == "Windows":
+        import cupy as cp
+
+        # Cupy might run into issue on Windows due to missing compiler
+        try:
+            cp.array([1, 2, 3]).sum()
+        except Exception:  # pylint: disable=broad-except
+            skip_cupy["condition"] = True
+    return skip_cupy
 
 
 def no_dask_cudf() -> PytestSkip:
@@ -248,35 +258,6 @@ def as_arrays(
         return X, y, w
 
 
-def make_batches(  # pylint: disable=too-many-arguments,too-many-locals
-    n_samples_per_batch: int,
-    n_features: int,
-    n_batches: int,
-    use_cupy: bool = False,
-    *,
-    vary_size: bool = False,
-    random_state: int = 1994,
-) -> Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray]]:
-    X = []
-    y = []
-    w = []
-    if use_cupy:
-        import cupy
-
-        rng = cupy.random.RandomState(random_state)
-    else:
-        rng = np.random.RandomState(random_state)
-    for i in range(n_batches):
-        n_samples = n_samples_per_batch + i * 10 if vary_size else n_samples_per_batch
-        _X = rng.randn(n_samples, n_features)
-        _y = rng.randn(n_samples)
-        _w = rng.uniform(low=0, high=1, size=n_samples)
-        X.append(_X)
-        y.append(_y)
-        w.append(_w)
-    return X, y, w
-
-
 def make_regression(
     n_samples: int, n_features: int, use_cupy: bool
 ) -> Tuple[ArrayLike, ArrayLike, ArrayLike]:
diff --git a/python-package/xgboost/testing/data.py b/python-package/xgboost/testing/data.py
index f4e97e59d363..4071219c44ef 100644
--- a/python-package/xgboost/testing/data.py
+++ b/python-package/xgboost/testing/data.py
@@ -9,6 +9,7 @@
     Callable,
     Dict,
     Generator,
+    List,
     NamedTuple,
     Optional,
     Tuple,
@@ -506,6 +507,36 @@ def get_mq2008(
     )
 
 
+def make_batches(  # pylint: disable=too-many-arguments,too-many-locals
+    n_samples_per_batch: int,
+    n_features: int,
+    n_batches: int,
+    use_cupy: bool = False,
+    *,
+    vary_size: bool = False,
+    random_state: int = 1994,
+) -> Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray]]:
+    """Make batches of dense data."""
+    X = []
+    y = []
+    w = []
+    if use_cupy:
+        import cupy  # pylint: disable=import-error
+
+        rng = cupy.random.RandomState(random_state)
+    else:
+        rng = np.random.RandomState(random_state)
+    for i in range(n_batches):
+        n_samples = n_samples_per_batch + i * 10 if vary_size else n_samples_per_batch
+        _X = rng.randn(n_samples, n_features)
+        _y = rng.randn(n_samples)
+        _w = rng.uniform(low=0, high=1, size=n_samples)
+        X.append(_X)
+        y.append(_y)
+        w.append(_w)
+    return X, y, w
+
+
 RelData = Tuple[sparse.csr_matrix, npt.NDArray[np.int32], npt.NDArray[np.int32]]
 
 
diff --git a/src/c_api/c_api_error.h b/src/c_api/c_api_error.h
index 0ad4ac073dbd..a1928e6b14d7 100644
--- a/src/c_api/c_api_error.h
+++ b/src/c_api/c_api_error.h
@@ -7,10 +7,9 @@
 #define XGBOOST_C_API_C_API_ERROR_H_
 
 #include <dmlc/base.h>
-#include <dmlc/logging.h>
 
-#include "c_api_utils.h"
-#include "xgboost/collective/result.h"
+#include "c_api_utils.h"  // for XGBoostAPIGuard
+#include "xgboost/logging.h"
 
 /*! \brief  macro to guard beginning and end section of all functions */
 #ifdef LOG_CAPI_INVOCATION
diff --git a/src/cli_main.cc b/src/cli_main.cc
index 54a3450276f4..1c388cf845c2 100644
--- a/src/cli_main.cc
+++ b/src/cli_main.cc
@@ -4,29 +4,26 @@
  * \brief The command line interface program of xgboost.
  *  This file is not included in dynamic library.
  */
-#if !defined(NOMINMAX) && defined(_WIN32)
-#define NOMINMAX
-#endif  // !defined(NOMINMAX)
-
 #include <dmlc/timer.h>
-
-#include <xgboost/learner.h>
+#include <xgboost/base.h>
 #include <xgboost/data.h>
 #include <xgboost/json.h>
+#include <xgboost/learner.h>
 #include <xgboost/logging.h>
 #include <xgboost/parameter.h>
 
-#include <iomanip>
-#include <ctime>
-#include <string>
 #include <cstdio>
 #include <cstring>
+#include <ctime>
+#include <iomanip>
+#include <string>
 #include <vector>
+
+#include "c_api/c_api_utils.h"
 #include "common/common.h"
 #include "common/config.h"
 #include "common/io.h"
 #include "common/version.h"
-#include "c_api/c_api_utils.h"
 
 namespace xgboost {
 enum CLITask {
diff --git a/src/collective/tracker.cc b/src/collective/tracker.cc
index 6cb3601db7f4..bbc7a7c5a730 100644
--- a/src/collective/tracker.cc
+++ b/src/collective/tracker.cc
@@ -7,11 +7,10 @@
 #include <sys/socket.h>  // socket, AF_INET6, AF_INET, connect, getsockname
 #endif                   // defined(__unix__) || defined(__APPLE__)
 
-#if !defined(NOMINMAX) && defined(_WIN32)
-#define NOMINMAX
-#endif  // !defined(NOMINMAX)
-
 #if defined(_WIN32)
+// Guard the include
+#include <xgboost/windefs.h>
+// Socket API
 #include <winsock2.h>
 #include <ws2tcpip.h>
 #endif  // defined(_WIN32)
diff --git a/src/common/common.cc b/src/common/common.cc
index 086f4c00d167..10a667070da9 100644
--- a/src/common/common.cc
+++ b/src/common/common.cc
@@ -1,5 +1,5 @@
 /**
- * Copyright 2015-2023 by Contributors
+ * Copyright 2015-2024, XGBoost Contributors
  */
 #include "common.h"
 
@@ -54,9 +54,4 @@ void EscapeU8(std::string const &string, std::string *p_buffer) {
     }
   }
 }
-
-#if !defined(XGBOOST_USE_CUDA)
-int AllVisibleGPUs() { return 0; }
-#endif  // !defined(XGBOOST_USE_CUDA)
-
 }  // namespace xgboost::common
diff --git a/src/common/common.cu b/src/common/common.cu
index b6965904a2b0..958f93779308 100644
--- a/src/common/common.cu
+++ b/src/common/common.cu
@@ -1,29 +1,21 @@
-/*!
- * Copyright 2018-2022 XGBoost contributors
+/**
+ * Copyright 2018-2024, XGBoost contributors
  */
-#include "common.h"
-
-namespace xgboost {
-namespace common {
+#include <thrust/system/cuda/error.h>
+#include <thrust/system_error.h>
 
-void SetDevice(std::int32_t device) {
-  if (device >= 0) {
-    dh::safe_cuda(cudaSetDevice(device));
-  }
-}
+#include "common.h"
 
-int AllVisibleGPUs() {
-  int n_visgpus = 0;
-  try {
-    // When compiled with CUDA but running on CPU only device,
-    // cudaGetDeviceCount will fail.
-    dh::safe_cuda(cudaGetDeviceCount(&n_visgpus));
-  } catch (const dmlc::Error &) {
-    cudaGetLastError();  // reset error.
-    return 0;
+namespace dh {
+void ThrowOnCudaError(cudaError_t code, const char *file, int line) {
+  if (code != cudaSuccess) {
+    std::string f;
+    if (file != nullptr) {
+      f = file;
+    }
+    LOG(FATAL) << thrust::system_error(code, thrust::cuda_category(),
+                                       f + ": " + std::to_string(line))
+                      .what();
   }
-  return n_visgpus;
 }
-
-}  // namespace common
-}  // namespace xgboost
+}  // namespace dh
diff --git a/src/common/common.h b/src/common/common.h
index 950dee5210b1..93151670b7be 100644
--- a/src/common/common.h
+++ b/src/common/common.h
@@ -1,5 +1,5 @@
 /**
- * Copyright 2015-2023 by XGBoost Contributors
+ * Copyright 2015-2024, XGBoost Contributors
  * \file common.h
  * \brief Common utilities
  */
@@ -19,9 +19,8 @@
 #include "xgboost/base.h"     // for XGBOOST_DEVICE
 #include "xgboost/logging.h"  // for LOG, LOG_FATAL, LogMessageFatal
 
+// magic to define functions based on the compiler.
 #if defined(__CUDACC__)
-#include <thrust/system/cuda/error.h>
-#include <thrust/system_error.h>
 
 #define WITH_CUDA() true
 
@@ -31,23 +30,20 @@
 
 #endif  // defined(__CUDACC__)
 
+#if defined(XGBOOST_USE_CUDA)
+#include <cuda_runtime_api.h>
+#endif
+
 namespace dh {
-#if defined(__CUDACC__)
+#if defined(XGBOOST_USE_CUDA)
 /*
- * Error handling  functions
+ * Error handling functions
  */
+void ThrowOnCudaError(cudaError_t code, const char *file, int line);
+
 #define safe_cuda(ans) ThrowOnCudaError((ans), __FILE__, __LINE__)
 
-inline cudaError_t ThrowOnCudaError(cudaError_t code, const char *file,
-                                    int line) {
-  if (code != cudaSuccess) {
-    LOG(FATAL) << thrust::system_error(code, thrust::cuda_category(),
-                                       std::string{file} + ": " +  // NOLINT
-                                       std::to_string(line)).what();
-  }
-  return code;
-}
-#endif  // defined(__CUDACC__)
+#endif  // defined(XGBOOST_USE_CUDA)
 }  // namespace dh
 
 namespace xgboost::common {
@@ -167,8 +163,6 @@ class Range {
   Iterator end_;
 };
 
-int AllVisibleGPUs();
-
 inline void AssertGPUSupport() {
 #ifndef XGBOOST_USE_CUDA
     LOG(FATAL) << "XGBoost version not compiled with GPU support.";
@@ -187,16 +181,6 @@ inline void AssertSYCLSupport() {
 #endif  // XGBOOST_USE_SYCL
 }
 
-void SetDevice(std::int32_t device);
-
-#if !defined(XGBOOST_USE_CUDA)
-inline void SetDevice(std::int32_t device) {
-  if (device >= 0) {
-    AssertGPUSupport();
-  }
-}
-#endif
-
 /**
  * @brief Last index of a group in a CSR style of index pointer.
  */
diff --git a/src/common/cuda_rt_utils.cc b/src/common/cuda_rt_utils.cc
new file mode 100644
index 000000000000..d41981d8fb18
--- /dev/null
+++ b/src/common/cuda_rt_utils.cc
@@ -0,0 +1,86 @@
+/**
+ * Copyright 2015-2024, XGBoost Contributors
+ */
+#include "cuda_rt_utils.h"
+
+#if defined(XGBOOST_USE_CUDA)
+#include <cuda_runtime_api.h>
+#endif  // defined(XGBOOST_USE_CUDA)
+
+#include <cstdint>  // for int32_t
+
+#include "common.h"  // for safe_cuda
+
+namespace xgboost::common {
+#if defined(XGBOOST_USE_CUDA)
+std::int32_t AllVisibleGPUs() {
+  int n_visgpus = 0;
+  try {
+    // When compiled with CUDA but running on CPU only device,
+    // cudaGetDeviceCount will fail.
+    dh::safe_cuda(cudaGetDeviceCount(&n_visgpus));
+  } catch (const dmlc::Error &) {
+    cudaGetLastError();  // reset error.
+    return 0;
+  }
+  return n_visgpus;
+}
+
+std::int32_t CurrentDevice() {
+  std::int32_t device = 0;
+  dh::safe_cuda(cudaGetDevice(&device));
+  return device;
+}
+
+// alternatively: `nvidia-smi -q | grep Addressing`
+bool SupportsPageableMem() {
+  std::int32_t res{0};
+  dh::safe_cuda(cudaDeviceGetAttribute(&res, cudaDevAttrPageableMemoryAccess, CurrentDevice()));
+  return res == 1;
+}
+
+bool SupportsAts() {
+  std::int32_t res{0};
+  dh::safe_cuda(cudaDeviceGetAttribute(&res, cudaDevAttrPageableMemoryAccessUsesHostPageTables,
+                                       CurrentDevice()));
+  return res == 1;
+}
+
+void CheckComputeCapability() {
+  for (std::int32_t d_idx = 0; d_idx < AllVisibleGPUs(); ++d_idx) {
+    cudaDeviceProp prop;
+    dh::safe_cuda(cudaGetDeviceProperties(&prop, d_idx));
+    std::ostringstream oss;
+    oss << "CUDA Capability Major/Minor version number: " << prop.major << "." << prop.minor
+        << " is insufficient.  Need >=3.5";
+    int failed = prop.major < 3 || (prop.major == 3 && prop.minor < 5);
+    if (failed) LOG(WARNING) << oss.str() << " for device: " << d_idx;
+  }
+}
+
+void SetDevice(std::int32_t device) {
+  if (device >= 0) {
+    dh::safe_cuda(cudaSetDevice(device));
+  }
+}
+#else
+std::int32_t AllVisibleGPUs() { return 0; }
+
+std::int32_t CurrentDevice() {
+  AssertGPUSupport();
+  return -1;
+}
+
+bool SupportsPageableMem() { return false; }
+
+bool SupportsAts() { return false; }
+
+void CheckComputeCapability() {}
+
+void SetDevice(std::int32_t device) {
+  if (device >= 0) {
+    AssertGPUSupport();
+  }
+}
+#endif  // !defined(XGBOOST_USE_CUDA)
+}  // namespace xgboost::common
diff --git a/src/common/cuda_rt_utils.h b/src/common/cuda_rt_utils.h
new file mode 100644
index 000000000000..210f1e07d7f8
--- /dev/null
+++ b/src/common/cuda_rt_utils.h
@@ -0,0 +1,58 @@
+/**
+ * Copyright 2024, XGBoost contributors
+ */
+#pragma once
+#include <cstdint>  // for int32_t
+
+#if defined(XGBOOST_USE_NVTX)
+#include <nvtx3/nvtx3.hpp>
+#endif  // defined(XGBOOST_USE_NVTX)
+
+namespace xgboost::common {
+std::int32_t AllVisibleGPUs();
+
+std::int32_t CurrentDevice();
+
+// Whether the device supports coherently accessing pageable memory without calling
+// `cudaHostRegister` on it
+bool SupportsPageableMem();
+
+// Address Translation Service (ATS)
+bool SupportsAts();
+
+void CheckComputeCapability();
+
+void SetDevice(std::int32_t device);
+
+struct NvtxDomain {
+  static constexpr char const *name{"libxgboost"};  // NOLINT
+};
+
+#if defined(XGBOOST_USE_NVTX)
+using NvtxScopedRange = ::nvtx3::scoped_range_in<NvtxDomain>;
+using NvtxEventAttr = ::nvtx3::event_attributes;
+using NvtxRgb = ::nvtx3::rgb;
+#else
+class NvtxScopedRange {
+ public:
+  template <typename... Args>
+  explicit NvtxScopedRange(Args &&...) {}
+};
+class NvtxEventAttr {
+ public:
+  template <typename... Args>
+  explicit NvtxEventAttr(Args &&...) {}
+};
+class NvtxRgb {
+ public:
+  template <typename... Args>
+  explicit NvtxRgb(Args &&...) {}
+};
+#endif  // defined(XGBOOST_USE_NVTX)
+}  // namespace xgboost::common
+
+#if defined(XGBOOST_USE_NVTX)
+#define xgboost_NVTX_FN_RANGE() NVTX3_FUNC_RANGE_IN(::xgboost::common::NvtxDomain)
+#else
+#define xgboost_NVTX_FN_RANGE()
+#endif  // defined(XGBOOST_USE_NVTX)
diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh
index 1754c9507036..34faa4eb013f 100644
--- a/src/common/device_helpers.cuh
+++ b/src/common/device_helpers.cuh
@@ -157,18 +157,6 @@ inline size_t MaxSharedMemoryOptin(int device_idx) {
   return static_cast<std::size_t>(max_shared_memory);
 }
 
-inline void CheckComputeCapability() {
-  for (int d_idx = 0; d_idx < xgboost::common::AllVisibleGPUs(); ++d_idx) {
-    cudaDeviceProp prop;
-    safe_cuda(cudaGetDeviceProperties(&prop, d_idx));
-    std::ostringstream oss;
-    oss << "CUDA Capability Major/Minor version number: " << prop.major << "."
-        << prop.minor << " is insufficient.  Need >=3.5";
-    int failed = prop.major < 3 || (prop.major == 3 && prop.minor < 5);
-    if (failed) LOG(WARNING) << oss.str() << " for device: " << d_idx;
-  }
-}
-
 XGBOOST_DEV_INLINE void AtomicOrByte(unsigned int *__restrict__ buffer,
                                      size_t ibyte, unsigned char b) {
   atomicOr(&buffer[ibyte / sizeof(unsigned int)],
@@ -273,13 +261,15 @@ void Iota(Container array, cudaStream_t stream) {
 }
 
 // dh::DebugSyncDevice(__FILE__, __LINE__);
-inline void DebugSyncDevice(std::string file="", int32_t line = -1) {
-  if (file != "" && line != -1) {
-    auto rank = xgboost::collective::GetRank();
-    LOG(DEBUG) << "R:" << rank << ": " << file << ":" << line;
+inline void DebugSyncDevice(char const *file = __builtin_FILE(), int32_t line = __builtin_LINE()) {
+  {
+    auto err = cudaDeviceSynchronize();
+    ThrowOnCudaError(err, file, line);
+  }
+  {
+    auto err = cudaGetLastError();
+    ThrowOnCudaError(err, file, line);
   }
-  safe_cuda(cudaDeviceSynchronize());
-  safe_cuda(cudaGetLastError());
 }
 
 // Faster to instantiate than caching_device_vector and invokes no synchronisation
@@ -510,7 +500,7 @@ xgboost::common::Span<T> ToSpan(thrust::device_vector<T>& vec,
 
 template <typename T>
 xgboost::common::Span<T> ToSpan(DeviceUVector<T> &vec) {
-  return {thrust::raw_pointer_cast(vec.data()), vec.size()};
+  return {vec.data(), vec.size()};
 }
 
 // thrust begin, similiar to std::begin
diff --git a/src/common/device_vector.cuh b/src/common/device_vector.cuh
index 35386856cc9c..f99b7c3fe1f0 100644
--- a/src/common/device_vector.cuh
+++ b/src/common/device_vector.cuh
@@ -284,47 +284,64 @@ class LoggingResource : public rmm::mr::device_memory_resource {
 
 LoggingResource *GlobalLoggingResource();
 
+#endif  // defined(XGBOOST_USE_RMM)
+
 /**
- * @brief Container class that doesn't initialize the data.
+ * @brief Container class that doesn't initialize the data when RMM is used.
  */
 template <typename T>
-class DeviceUVector : public rmm::device_uvector<T> {
-  using Super = rmm::device_uvector<T>;
+class DeviceUVector {
+ private:
+#if defined(XGBOOST_USE_RMM)
+  rmm::device_uvector<T> data_{0, rmm::cuda_stream_per_thread, GlobalLoggingResource()};
+#else
+  ::dh::device_vector<T> data_;
+#endif  // defined(XGBOOST_USE_RMM)
 
  public:
-  DeviceUVector() : Super{0, rmm::cuda_stream_per_thread, GlobalLoggingResource()} {}
+  using value_type = T;                        // NOLINT
+  using pointer = value_type *;                // NOLINT
+  using const_pointer = value_type const *;    // NOLINT
+  using reference = value_type &;              // NOLINT
+  using const_reference = value_type const &;  // NOLINT
 
-  void Resize(std::size_t n) { Super::resize(n, rmm::cuda_stream_per_thread); }
-  void Resize(std::size_t n, T const &v) {
+ public:
+  DeviceUVector() = default;
+  DeviceUVector(DeviceUVector const &that) = delete;
+  DeviceUVector &operator=(DeviceUVector const &that) = delete;
+  DeviceUVector(DeviceUVector &&that) = default;
+  DeviceUVector &operator=(DeviceUVector &&that) = default;
+
+  void resize(std::size_t n) {  // NOLINT
+#if defined(XGBOOST_USE_RMM)
+    data_.resize(n, rmm::cuda_stream_per_thread);
+#else
+    data_.resize(n);
+#endif
+  }
+  void resize(std::size_t n, T const &v) {  // NOLINT
+#if defined(XGBOOST_USE_RMM)
     auto orig = this->size();
-    Super::resize(n, rmm::cuda_stream_per_thread);
+    data_.resize(n, rmm::cuda_stream_per_thread);
     if (orig < n) {
       thrust::fill(rmm::exec_policy_nosync{}, this->begin() + orig, this->end(), v);
     }
+#else
+    data_.resize(n, v);
+#endif
   }
+  [[nodiscard]] std::size_t size() const { return data_.size(); }  // NOLINT
 
- private:
-  // undefined private, cannot be accessed.
-  void resize(std::size_t n, rmm::cuda_stream_view stream);  // NOLINT
-};
-
-#else
+  [[nodiscard]] auto begin() { return data_.begin(); }  // NOLINT
+  [[nodiscard]] auto end() { return data_.end(); }      // NOLINT
 
-/**
- * @brief Without RMM, the initialization will happen.
- */
-template <typename T>
-class DeviceUVector : public thrust::device_vector<T, XGBDeviceAllocator<T>> {
-  using Super = thrust::device_vector<T, XGBDeviceAllocator<T>>;
+  [[nodiscard]] auto begin() const { return this->cbegin(); }  // NOLINT
+  [[nodiscard]] auto end() const { return this->cend(); }      // NOLINT
 
- public:
-  void Resize(std::size_t n) { Super::resize(n); }
-  void Resize(std::size_t n, T const &v) { Super::resize(n, v); }
+  [[nodiscard]] auto cbegin() const { return data_.cbegin(); }  // NOLINT
+  [[nodiscard]] auto cend() const { return data_.cend(); }      // NOLINT
 
- private:
-  // undefined private, cannot be accessed.
-  void resize(std::size_t n, T const &v = T{});  // NOLINT
+  [[nodiscard]] auto data() { return thrust::raw_pointer_cast(data_.data()); }        // NOLINT
+  [[nodiscard]] auto data() const { return thrust::raw_pointer_cast(data_.data()); }  // NOLINT
 };
-
-#endif  // defined(XGBOOST_USE_RMM)
 }  // namespace dh
diff --git a/src/common/host_device_vector.cu b/src/common/host_device_vector.cu
index 16a1aa027f09..c17a2b4d3eba 100644
--- a/src/common/host_device_vector.cu
+++ b/src/common/host_device_vector.cu
@@ -29,7 +29,7 @@ class HostDeviceVectorImpl {
     if (device.IsCUDA()) {
       gpu_access_ = GPUAccess::kWrite;
       SetDevice();
-      data_d_->Resize(size, v);
+      data_d_->resize(size, v);
     } else {
       data_h_.resize(size, v);
     }
@@ -67,12 +67,12 @@ class HostDeviceVectorImpl {
 
   T* DevicePointer() {
     LazySyncDevice(GPUAccess::kWrite);
-    return thrust::raw_pointer_cast(data_d_->data());
+    return data_d_->data();
   }
 
   const T* ConstDevicePointer() {
     LazySyncDevice(GPUAccess::kRead);
-    return thrust::raw_pointer_cast(data_d_->data());
+    return data_d_->data();
   }
 
   common::Span<T> DeviceSpan() {
@@ -181,7 +181,7 @@ class HostDeviceVectorImpl {
       gpu_access_ = GPUAccess::kWrite;
       SetDevice();
       auto old_size = data_d_->size();
-      data_d_->Resize(new_size, std::forward<U>(args)...);
+      data_d_->resize(new_size, std::forward<U>(args)...);
     } else {
       // resize on host
       LazySyncHost(GPUAccess::kNone);
@@ -200,8 +200,8 @@ class HostDeviceVectorImpl {
     gpu_access_ = access;
     if (data_h_.size() != data_d_->size()) { data_h_.resize(data_d_->size()); }
     SetDevice();
-    dh::safe_cuda(cudaMemcpy(data_h_.data(), thrust::raw_pointer_cast(data_d_->data()),
-                             data_d_->size() * sizeof(T), cudaMemcpyDeviceToHost));
+    dh::safe_cuda(cudaMemcpy(data_h_.data(), data_d_->data(), data_d_->size() * sizeof(T),
+                             cudaMemcpyDeviceToHost));
   }
 
   void LazySyncDevice(GPUAccess access) {
@@ -214,9 +214,8 @@ class HostDeviceVectorImpl {
     // data is on the host
     LazyResizeDevice(data_h_.size());
     SetDevice();
-    dh::safe_cuda(cudaMemcpyAsync(thrust::raw_pointer_cast(data_d_->data()), data_h_.data(),
-                                  data_d_->size() * sizeof(T), cudaMemcpyHostToDevice,
-                                  dh::DefaultStream()));
+    dh::safe_cuda(cudaMemcpyAsync(data_d_->data(), data_h_.data(), data_d_->size() * sizeof(T),
+                                  cudaMemcpyHostToDevice, dh::DefaultStream()));
     gpu_access_ = access;
   }
 
@@ -260,7 +259,7 @@ class HostDeviceVectorImpl {
   void LazyResizeDevice(size_t new_size) {
     if (data_d_ && new_size == data_d_->size()) { return; }
     SetDevice();
-    data_d_->Resize(new_size);
+    data_d_->resize(new_size);
   }
 
   void SetDevice() {
diff --git a/src/common/io.cc b/src/common/io.cc
index 1715669b091a..4bc8d9de4f53 100644
--- a/src/common/io.cc
+++ b/src/common/io.cc
@@ -1,26 +1,21 @@
 /**
- * Copyright 2019-2023, by XGBoost Contributors
+ * Copyright 2019-2024, by XGBoost Contributors
  */
-#if !defined(NOMINMAX) && defined(_WIN32)
-#define NOMINMAX
-#endif  // !defined(NOMINMAX)
+#if defined(__unix__) || defined(__APPLE__)
 
-#if !defined(xgboost_IS_WIN)
+#include <fcntl.h>     // for open, O_RDONLY
+#include <sys/mman.h>  // for mmap, mmap64, munmap, madvise
+#include <unistd.h>    // for close, getpagesize
 
-#if defined(_MSC_VER) || defined(__MINGW32__)
-#define xgboost_IS_WIN 1
-#endif  // defined(_MSC_VER) || defined(__MINGW32__)
+#else
 
-#endif  // !defined(xgboost_IS_WIN)
+#include <xgboost/windefs.h>
 
-#if defined(__unix__) || defined(__APPLE__)
-#include <fcntl.h>     // for open, O_RDONLY
-#include <sys/mman.h>  // for mmap, mmap64, munmap
-#include <unistd.h>    // for close, getpagesize
-#elif defined(xgboost_IS_WIN)
-#define WIN32_LEAN_AND_MEAN
+#if defined(xgboost_IS_WIN)
 #include <windows.h>
-#endif  // defined(__unix__)
+#endif  // defined(xgboost_IS_WIN)
+
+#endif  // defined(__unix__) || defined(__APPLE__)
 
 #include <algorithm>     // for copy, transform
 #include <cctype>        // for tolower
@@ -31,8 +26,7 @@
 #include <filesystem>    // for filesystem, weakly_canonical
 #include <fstream>       // for ifstream
 #include <iterator>      // for distance
-#include <limits>        // for numeric_limits
-#include <memory>        // for unique_ptr
+#include <memory>        // for unique_ptr, make_unique
 #include <string>        // for string
 #include <system_error>  // for error_code, system_category
 #include <utility>       // for move
@@ -40,7 +34,12 @@
 
 #include "io.h"
 #include "xgboost/collective/socket.h"  // for LastError
-#include "xgboost/logging.h"
+#include "xgboost/logging.h"            // for CHECK_LE
+#include "xgboost/string_view.h"        // for StringView
+
+#if !defined(__linux__) && !defined(__GLIBC__) && !defined(xgboost_IS_WIN)
+#include <limits>  // for numeric_limits
+#endif
 
 namespace xgboost::common {
 size_t PeekableInStream::Read(void* dptr, size_t size) {
@@ -182,39 +181,9 @@ std::string FileExtension(std::string fname, bool lower) {
 // NVCC 11.8 doesn't allow `noexcept(false) = default` altogether.
 ResourceHandler::~ResourceHandler() noexcept(false) {}  // NOLINT
 
-struct MMAPFile {
-#if defined(xgboost_IS_WIN)
-  HANDLE fd{INVALID_HANDLE_VALUE};
-  HANDLE file_map{INVALID_HANDLE_VALUE};
-#else
-  std::int32_t fd{0};
-#endif
-  std::byte* base_ptr{nullptr};
-  std::size_t base_size{0};
-  std::size_t delta{0};
-  std::string path;
-
-  MMAPFile() = default;
-
-#if defined(xgboost_IS_WIN)
-  MMAPFile(HANDLE fd, HANDLE fm, std::byte* base_ptr, std::size_t base_size, std::size_t delta,
-           std::string path)
-      : fd{fd},
-        file_map{fm},
-        base_ptr{base_ptr},
-        base_size{base_size},
-        delta{delta},
-        path{std::move(path)} {}
-#else
-  MMAPFile(std::int32_t fd, std::byte* base_ptr, std::size_t base_size, std::size_t delta,
-           std::string path)
-      : fd{fd}, base_ptr{base_ptr}, base_size{base_size}, delta{delta}, path{std::move(path)} {}
-#endif
-};
-
-std::unique_ptr<MMAPFile> Open(std::string path, std::size_t offset, std::size_t length) {
+MMAPFile* detail::OpenMmap(std::string path, std::size_t offset, std::size_t length) {
   if (length == 0) {
-    return std::make_unique<MMAPFile>();
+    return new MMAPFile{};
   }
 
 #if defined(xgboost_IS_WIN)
@@ -234,10 +203,8 @@ std::unique_ptr<MMAPFile> Open(std::string path, std::size_t offset, std::size_t
 #if defined(__linux__) || defined(__GLIBC__)
   int prot{PROT_READ};
   ptr = reinterpret_cast<std::byte*>(mmap64(nullptr, view_size, prot, MAP_PRIVATE, fd, view_start));
-  madvise(ptr, view_size, MADV_WILLNEED);
   CHECK_NE(ptr, MAP_FAILED) << "Failed to map: " << path << ". " << SystemErrorMsg();
-  auto handle =
-      std::make_unique<MMAPFile>(fd, ptr, view_size, offset - view_start, std::move(path));
+  auto handle = new MMAPFile{fd, ptr, view_size, offset - view_start, std::move(path)};
 #elif defined(xgboost_IS_WIN)
   auto file_size = GetFileSize(fd, nullptr);
   DWORD access = PAGE_READONLY;
@@ -248,55 +215,62 @@ std::unique_ptr<MMAPFile> Open(std::string path, std::size_t offset, std::size_t
   CHECK(map_file) << "Failed to map: " << path << ". " << SystemErrorMsg();
   ptr = reinterpret_cast<std::byte*>(MapViewOfFile(map_file, access, hoff, loff, view_size));
   CHECK_NE(ptr, nullptr) << "Failed to map: " << path << ". " << SystemErrorMsg();
-  auto handle = std::make_unique<MMAPFile>(fd, map_file, ptr, view_size, offset - view_start,
-                                           std::move(path));
+  auto handle = new MMAPFile{fd, map_file, ptr, view_size, offset - view_start, std::move(path)};
 #else
   CHECK_LE(offset, std::numeric_limits<off_t>::max())
       << "File size has exceeded the limit on the current system.";
   int prot{PROT_READ};
   ptr = reinterpret_cast<std::byte*>(mmap(nullptr, view_size, prot, MAP_PRIVATE, fd, view_start));
   CHECK_NE(ptr, MAP_FAILED) << "Failed to map: " << path << ". " << SystemErrorMsg();
-  auto handle =
-      std::make_unique<MMAPFile>(fd, ptr, view_size, offset - view_start, std::move(path));
-#endif  // defined(__linux__)
+  auto handle = new MMAPFile{fd, ptr, view_size, offset - view_start, std::move(path)};
+#endif  // defined(__linux__) || defined(__GLIBC__)
 
   return handle;
 }
 
-MmapResource::MmapResource(std::string path, std::size_t offset, std::size_t length)
-    : ResourceHandler{kMmap}, handle_{Open(std::move(path), offset, length)}, n_{length} {}
-
-MmapResource::~MmapResource() noexcept(false) {
-  if (!handle_) {
+void detail::CloseMmap(MMAPFile* handle) {
+  if (!handle) {
     return;
   }
 #if defined(xgboost_IS_WIN)
-  if (handle_->base_ptr) {
-    CHECK(UnmapViewOfFile(handle_->base_ptr)) "Faled to call munmap: " << SystemErrorMsg();
+  if (handle->base_ptr) {
+    CHECK(UnmapViewOfFile(handle->base_ptr)) "Faled to call munmap: " << SystemErrorMsg();
   }
-  if (handle_->fd != INVALID_HANDLE_VALUE) {
-    CHECK(CloseHandle(handle_->fd)) << "Failed to close handle: " << SystemErrorMsg();
+  if (handle->fd != INVALID_HANDLE_VALUE) {
+    CHECK(CloseHandle(handle->fd)) << "Failed to close handle: " << SystemErrorMsg();
   }
-  if (handle_->file_map != INVALID_HANDLE_VALUE) {
-    CHECK(CloseHandle(handle_->file_map)) << "Failed to close mapping object: " << SystemErrorMsg();
+  if (handle->file_map != INVALID_HANDLE_VALUE) {
+    CHECK(CloseHandle(handle->file_map)) << "Failed to close mapping object: " << SystemErrorMsg();
   }
 #else
-  if (handle_->base_ptr) {
-    CHECK_NE(munmap(handle_->base_ptr, handle_->base_size), -1)
-        << "Faled to call munmap: " << handle_->path << ". " << SystemErrorMsg();
+  if (handle->base_ptr) {
+    CHECK_NE(munmap(handle->base_ptr, handle->base_size), -1)
+        << "Faled to call munmap: `" << handle->path << "`. " << SystemErrorMsg();
   }
-  if (handle_->fd != 0) {
-    CHECK_NE(close(handle_->fd), -1)
-        << "Faled to close: " << handle_->path << ". " << SystemErrorMsg();
+  if (handle->fd != 0) {
+    CHECK_NE(close(handle->fd), -1)
+        << "Faled to close: `" << handle->path << "`. " << SystemErrorMsg();
   }
 #endif
+  delete handle;
 }
 
+MmapResource::MmapResource(StringView path, std::size_t offset, std::size_t length)
+    : ResourceHandler{kMmap},
+      handle_{detail::OpenMmap(std::string{path}, offset, length), detail::CloseMmap},
+      n_{length} {
+#if defined(__unix__) || defined(__APPLE__)
+  madvise(handle_->base_ptr, handle_->base_size, MADV_WILLNEED);
+#endif  // defined(__unix__) || defined(__APPLE__)
+}
+
+MmapResource::~MmapResource() noexcept(false) = default;
+
 [[nodiscard]] void* MmapResource::Data() {
   if (!handle_) {
     return nullptr;
   }
-  return handle_->base_ptr + handle_->delta;
+  return this->handle_->Data();
 }
 
 [[nodiscard]] std::size_t MmapResource::Size() const { return n_; }
@@ -329,7 +303,3 @@ AlignedMemWriteStream::~AlignedMemWriteStream() = default;
   return this->pimpl_->Tell();
 }
 }  // namespace xgboost::common
-
-#if defined(xgboost_IS_WIN)
-#undef xgboost_IS_WIN
-#endif  // defined(xgboost_IS_WIN)
diff --git a/src/common/io.h b/src/common/io.h
index 198ce70143be..5f2e2833625a 100644
--- a/src/common/io.h
+++ b/src/common/io.h
@@ -7,7 +7,11 @@
 #ifndef XGBOOST_COMMON_IO_H_
 #define XGBOOST_COMMON_IO_H_
 
-#include <dmlc/io.h>
+#include <xgboost/windefs.h>
+
+#if defined(xgboost_IS_WIN)
+#include <windows.h>
+#endif  // defined(xgboost_IS_WIN)
 
 #include <algorithm>    // for min, fill_n, copy_n
 #include <array>        // for array
@@ -15,6 +19,7 @@
 #include <cstdlib>      // for malloc, realloc, free
 #include <cstring>      // for memcpy
 #include <fstream>      // for ifstream
+#include <functional>   // for function
 #include <limits>       // for numeric_limits
 #include <memory>       // for unique_ptr
 #include <string>       // for string
@@ -23,6 +28,7 @@
 #include <vector>       // for vector
 
 #include "common.h"               // for DivRoundUp
+#include "dmlc/io.h"              // for SeekStream
 #include "xgboost/string_view.h"  // for StringView
 
 namespace xgboost::common {
@@ -224,7 +230,48 @@ inline std::string ReadAll(std::string const &path) {
   return content;
 }
 
-struct MMAPFile;
+/**
+ * @brief A handle to mmap file.
+ */
+struct MMAPFile {
+#if defined(xgboost_IS_WIN)
+  HANDLE fd{INVALID_HANDLE_VALUE};
+  HANDLE file_map{INVALID_HANDLE_VALUE};
+#else
+  std::int32_t fd{0};
+#endif  // defined(xgboost_IS_WIN)
+  std::byte* base_ptr{nullptr};
+  std::size_t base_size{0};
+  std::size_t delta{0};
+  std::string path;
+
+  MMAPFile() = default;
+
+#if defined(xgboost_IS_WIN)
+  MMAPFile(HANDLE fd, HANDLE fm, std::byte* base_ptr, std::size_t base_size, std::size_t delta,
+           std::string path)
+      : fd{fd},
+        file_map{fm},
+        base_ptr{base_ptr},
+        base_size{base_size},
+        delta{delta},
+        path{std::move(path)} {}
+#else
+  MMAPFile(std::int32_t fd, std::byte* base_ptr, std::size_t base_size, std::size_t delta,
+           std::string path)
+      : fd{fd}, base_ptr{base_ptr}, base_size{base_size}, delta{delta}, path{std::move(path)} {}
+#endif  // defined(xgboost_IS_WIN)
+
+  void const* Data() const { return this->base_ptr + this->delta; }
+  void* Data() { return this->base_ptr + this->delta; }
+};
+
+namespace detail {
+// call mmap
+[[nodiscard]] MMAPFile* OpenMmap(std::string path, std::size_t offset, std::size_t length);
+// close the mapped file handle.
+void CloseMmap(MMAPFile* handle);
+}  // namespace detail
 
 /**
  * @brief Handler for one-shot resource. Unlike `std::pmr::*`, the resource handler is
@@ -237,6 +284,8 @@ class ResourceHandler {
   enum Kind : std::uint8_t {
     kMalloc = 0,
     kMmap = 1,
+    kCudaMalloc = 2,
+    kCudaMmap = 3,
   };
 
  private:
@@ -251,6 +300,20 @@ class ResourceHandler {
 
   [[nodiscard]] virtual std::size_t Size() const = 0;
   [[nodiscard]] auto Type() const { return kind_; }
+  [[nodiscard]] StringView TypeName() const {
+    switch (this->Type()) {
+      case kMalloc:
+        return "Malloc";
+      case kMmap:
+        return "Mmap";
+      case kCudaMalloc:
+        return "CudaMalloc";
+      case kCudaMmap:
+        return "CudaMmap";
+    }
+    LOG(FATAL) << "Unreachable.";
+    return {};
+  }
 
   // Allow exceptions for cleaning up resource.
   virtual ~ResourceHandler() noexcept(false);
@@ -339,11 +402,11 @@ class MallocResource : public ResourceHandler {
  * @brief A class for wrapping mmap as a resource for RAII.
  */
 class MmapResource : public ResourceHandler {
-  std::unique_ptr<MMAPFile> handle_;
+  std::unique_ptr<MMAPFile, std::function<void(MMAPFile*)>> handle_;
   std::size_t n_;
 
  public:
-  MmapResource(std::string path, std::size_t offset, std::size_t length);
+  MmapResource(StringView path, std::size_t offset, std::size_t length);
   ~MmapResource() noexcept(false) override;
 
   [[nodiscard]] void* Data() override;
@@ -471,9 +534,9 @@ class PrivateMmapConstStream : public AlignedResourceReadStream {
    * @param offset    See the `offset` parameter of `mmap` for details.
    * @param length    See the `length` parameter of `mmap` for details.
    */
-  explicit PrivateMmapConstStream(std::string path, std::size_t offset, std::size_t length)
+  explicit PrivateMmapConstStream(StringView path, std::size_t offset, std::size_t length)
       : AlignedResourceReadStream{std::shared_ptr<MmapResource>{  // NOLINT
-            new MmapResource{std::move(path), offset, length}}} {}
+            new MmapResource{path, offset, length}}} {}
   ~PrivateMmapConstStream() noexcept(false) override;
 };
 
diff --git a/src/common/ref_resource_view.cuh b/src/common/ref_resource_view.cuh
new file mode 100644
index 000000000000..d48b221a305d
--- /dev/null
+++ b/src/common/ref_resource_view.cuh
@@ -0,0 +1,33 @@
+/**
+ * Copyright 2024, XGBoost Contributors
+ */
+#pragma once
+
+#include <cstddef>  // for size_t
+#include <memory>   // for make_shared
+
+#include "cuda_context.cuh"     // for CUDAContext
+#include "ref_resource_view.h"  // for RefResourceView
+#include "resource.cuh"         // for CudaAllocResource
+#include "xgboost/context.h"    // for Context
+
+namespace xgboost::common {
+/**
+ * @brief Make a fixed size `RefResourceView` with cudaMalloc resource.
+ */
+template <typename T>
+[[nodiscard]] RefResourceView<T> MakeFixedVecWithCudaMalloc(Context const*,
+                                                            std::size_t n_elements) {
+  auto resource = std::make_shared<common::CudaMallocResource>(n_elements * sizeof(T));
+  auto ref = RefResourceView{resource->DataAs<T>(), n_elements, resource};
+  return ref;
+}
+
+template <typename T>
+[[nodiscard]] RefResourceView<T> MakeFixedVecWithCudaMalloc(Context const* ctx,
+                                                            std::size_t n_elements, T const& init) {
+  auto ref = MakeFixedVecWithCudaMalloc<T>(ctx, n_elements);
+  thrust::fill_n(ctx->CUDACtx()->CTP(), ref.data(), ref.size(), init);
+  return ref;
+}
+}  // namespace xgboost::common
diff --git a/src/common/ref_resource_view.h b/src/common/ref_resource_view.h
index 61adfdb7bea8..81058d923d3b 100644
--- a/src/common/ref_resource_view.h
+++ b/src/common/ref_resource_view.h
@@ -43,24 +43,16 @@ class RefResourceView {
   }
 
  public:
-  RefResourceView(value_type* ptr, size_type n, std::shared_ptr<common::ResourceHandler> mem)
-      : ptr_{ptr}, size_{n}, mem_{std::move(mem)} {
-    CHECK_GE(mem_->Size(), n);
-  }
   /**
    * @brief Construct a view on ptr with length n. The ptr is held by the mem resource.
    *
    * @param ptr  The pointer to view.
    * @param n    The length of the view.
    * @param mem  The owner of the pointer.
-   * @param init Initialize the view with this value.
    */
-  RefResourceView(value_type* ptr, size_type n, std::shared_ptr<common::ResourceHandler> mem,
-                  T const& init)
-      : RefResourceView{ptr, n, mem} {
-    if (n != 0) {
-      std::fill_n(ptr_, n, init);
-    }
+  RefResourceView(value_type* ptr, size_type n, std::shared_ptr<common::ResourceHandler> mem)
+      : ptr_{ptr}, size_{n}, mem_{std::move(mem)} {
+    CHECK_GE(mem_->Size(), n);
   }
 
   ~RefResourceView() = default;
@@ -159,7 +151,9 @@ template <typename Vec>
 template <typename T>
 [[nodiscard]] RefResourceView<T> MakeFixedVecWithMalloc(std::size_t n_elements, T const& init) {
   auto resource = std::make_shared<common::MallocResource>(n_elements * sizeof(T));
-  return RefResourceView{resource->DataAs<T>(), n_elements, resource, init};
+  auto ref = RefResourceView{resource->DataAs<T>(), n_elements, resource};
+  std::fill_n(ref.data(), ref.size(), init);
+  return ref;
 }
 
 template <typename T>
diff --git a/src/common/resource.cu b/src/common/resource.cu
new file mode 100644
index 000000000000..ef662e3bd6e0
--- /dev/null
+++ b/src/common/resource.cu
@@ -0,0 +1,43 @@
+/**
+ * Copyright 2024, XGBoost Contributors
+ */
+#include "device_helpers.cuh"  // for CurrentDevice
+#include "resource.cuh"
+#include "xgboost/string_view.h"  // for StringView
+
+namespace xgboost::common {
+CudaMmapResource::CudaMmapResource(StringView path, std::size_t offset, std::size_t length)
+    : ResourceHandler{kCudaMmap},
+      handle_{detail::OpenMmap(std::string{path}, offset, length),
+              [](MMAPFile* handle) {
+                // Don't close the mmap while CUDA kernel is running.
+                if (handle) {
+                  dh::DefaultStream().Sync();
+                }
+                detail::CloseMmap(handle);
+              }},
+      n_{length} {
+  auto device = dh::CurrentDevice();
+  dh::safe_cuda(
+      cudaMemAdvise(handle_->base_ptr, handle_->base_size, cudaMemAdviseSetReadMostly, device));
+  dh::safe_cuda(cudaMemAdvise(handle_->base_ptr, handle_->base_size,
+                              cudaMemAdviseSetPreferredLocation, device));
+  dh::safe_cuda(
+      cudaMemAdvise(handle_->base_ptr, handle_->base_size, cudaMemAdviseSetAccessedBy, device));
+  dh::safe_cuda(
+      cudaMemPrefetchAsync(handle_->base_ptr, handle_->base_size, device, dh::DefaultStream()));
+}
+
+[[nodiscard]] void* CudaMmapResource::Data() {
+  if (!handle_) {
+    return nullptr;
+  }
+  return this->handle_->Data();
+}
+
+[[nodiscard]] std::size_t CudaMmapResource::Size() const { return n_; }
+
+CudaMmapResource::~CudaMmapResource() noexcept(false) = default;
+
+PrivateCudaMmapConstStream::~PrivateCudaMmapConstStream() noexcept(false) = default;
+}  // namespace xgboost::common
diff --git a/src/common/resource.cuh b/src/common/resource.cuh
new file mode 100644
index 000000000000..e950a8d90695
--- /dev/null
+++ b/src/common/resource.cuh
@@ -0,0 +1,52 @@
+/**
+ * Copyright 2024, XGBoost Contributors
+ */
+#pragma once
+#include <cstddef>     // for size_t
+#include <functional>  // for function
+
+#include "device_vector.cuh"      // for DeviceUVector
+#include "io.h"                   // for ResourceHandler, MMAPFile
+#include "xgboost/string_view.h"  // for StringView
+
+namespace xgboost::common {
+/**
+ * @brief Resource backed by `cudaMalloc`.
+ */
+class CudaMallocResource : public ResourceHandler {
+  dh::DeviceUVector<std::byte> storage_;
+
+  void Clear() noexcept(true) { this->Resize(0); }
+
+ public:
+  explicit CudaMallocResource(std::size_t n_bytes) : ResourceHandler{kCudaMalloc} {
+    this->Resize(n_bytes);
+  }
+  ~CudaMallocResource() noexcept(true) override { this->Clear(); }
+
+  [[nodiscard]] void* Data() override { return storage_.data(); }
+  [[nodiscard]] std::size_t Size() const override { return storage_.size(); }
+  void Resize(std::size_t n_bytes) { this->storage_.resize(n_bytes); }
+};
+
+class CudaMmapResource : public ResourceHandler {
+  std::unique_ptr<MMAPFile, std::function<void(MMAPFile*)>> handle_;
+  std::size_t n_;
+
+ public:
+  CudaMmapResource() : ResourceHandler{kCudaMmap} {}
+  CudaMmapResource(StringView path, std::size_t offset, std::size_t length);
+  ~CudaMmapResource() noexcept(false) override;
+
+  [[nodiscard]] void* Data() override;
+  [[nodiscard]] std::size_t Size() const override;
+};
+
+class PrivateCudaMmapConstStream : public AlignedResourceReadStream {
+ public:
+  explicit PrivateCudaMmapConstStream(StringView path, std::size_t offset, std::size_t length)
+      : AlignedResourceReadStream{
+            std::shared_ptr<CudaMmapResource>{new CudaMmapResource{path, offset, length}}} {}
+  ~PrivateCudaMmapConstStream() noexcept(false) override;
+};
+}  // namespace xgboost::common
diff --git a/src/common/timer.cc b/src/common/timer.cc
index 9b1f49fbd5c8..0b55d1623dbc 100644
--- a/src/common/timer.cc
+++ b/src/common/timer.cc
@@ -6,9 +6,10 @@
 #include <utility>
 
 #include "../collective/communicator-inl.h"
+#include "cuda_rt_utils.h"
 
 #if defined(XGBOOST_USE_NVTX)
-#include <nvtx3/nvToolsExt.h>
+#include <nvtx3/nvtx3.hpp>
 #endif  // defined(XGBOOST_USE_NVTX)
 
 namespace xgboost::common {
@@ -17,8 +18,8 @@ void Monitor::Start(std::string const &name) {
     auto &stats = statistics_map_[name];
     stats.timer.Start();
 #if defined(XGBOOST_USE_NVTX)
-    std::string nvtx_name = "xgboost::" + label_ + "::" + name;
-    stats.nvtx_id = nvtxRangeStartA(nvtx_name.c_str());
+    auto range_handle = nvtx3::start_range_in<common::NvtxDomain>(label_ + "::" + name);
+    stats.nvtx_id = range_handle.get_value();
 #endif  // defined(XGBOOST_USE_NVTX)
   }
 }
@@ -29,34 +30,32 @@ void Monitor::Stop(const std::string &name) {
     stats.timer.Stop();
     stats.count++;
 #if defined(XGBOOST_USE_NVTX)
-    nvtxRangeEnd(stats.nvtx_id);
+    nvtx3::end_range_in<common::NvtxDomain>(nvtx3::range_handle{stats.nvtx_id});
 #endif  // defined(XGBOOST_USE_NVTX)
   }
 }
 
-void Monitor::PrintStatistics(StatMap const& statistics) const {
+void Monitor::PrintStatistics(StatMap const &statistics) const {
   for (auto &kv : statistics) {
     if (kv.second.first == 0) {
-      LOG(WARNING) <<
-          "Timer for " << kv.first << " did not get stopped properly.";
+      LOG(WARNING) << "Timer for " << kv.first << " did not get stopped properly.";
       continue;
     }
-    LOG(CONSOLE) << kv.first << ": " << static_cast<double>(kv.second.second) / 1e+6
-                 << "s, " << kv.second.first << " calls @ "
-                 << kv.second.second
-                 << "us" << std::endl;
+    LOG(CONSOLE) << kv.first << ": " << static_cast<double>(kv.second.second) / 1e+6 << "s, "
+                 << kv.second.first << " calls @ " << kv.second.second << "us" << std::endl;
   }
 }
 
 void Monitor::Print() const {
-  if (!ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) { return; }
+  if (!ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) {
+    return;
+  }
   auto rank = collective::GetRank();
   StatMap stat_map;
   for (auto const &kv : statistics_map_) {
     stat_map[kv.first] = std::make_pair(
-        kv.second.count, std::chrono::duration_cast<std::chrono::microseconds>(
-                             kv.second.timer.elapsed)
-                             .count());
+        kv.second.count,
+        std::chrono::duration_cast<std::chrono::microseconds>(kv.second.timer.elapsed).count());
   }
   if (stat_map.empty()) {
     return;
diff --git a/src/context.cc b/src/context.cc
index ef7110e7ce19..19060d5fc830 100644
--- a/src/context.cc
+++ b/src/context.cc
@@ -1,5 +1,5 @@
 /**
- * Copyright 2014-2023 by XGBoost Contributors
+ * Copyright 2014-2024, XGBoost Contributors
  *
  * \brief Context object used for controlling runtime parameters.
  */
@@ -11,8 +11,9 @@
 #include <optional>   // for optional
 #include <regex>      // for regex_replace, regex_match
 
-#include "common/common.h"     // AssertGPUSupport
-#include "common/error_msg.h"  // WarnDeprecatedGPUId
+#include "common/common.h"         // AssertGPUSupport
+#include "common/cuda_rt_utils.h"  // for AllVisibleGPUs
+#include "common/error_msg.h"      // WarnDeprecatedGPUId
 #include "common/threading_utils.h"
 #include "xgboost/string_view.h"
 
diff --git a/src/data/ellpack_page.cu b/src/data/ellpack_page.cu
index 81656284eb49..fc28b7c56f12 100644
--- a/src/data/ellpack_page.cu
+++ b/src/data/ellpack_page.cu
@@ -11,8 +11,9 @@
 #include "../common/categorical.h"
 #include "../common/cuda_context.cuh"
 #include "../common/hist_util.cuh"
-#include "../common/transform_iterator.h"  // MakeIndexTransformIter
-#include "device_adapter.cuh"              // for NoInfInData
+#include "../common/ref_resource_view.cuh"  // for MakeFixedVecWithCudaMalloc
+#include "../common/transform_iterator.h"   // MakeIndexTransformIter
+#include "device_adapter.cuh"               // for NoInfInData
 #include "ellpack_page.cuh"
 #include "ellpack_page.h"
 #include "gradient_index.h"
@@ -43,21 +44,19 @@ __global__ void CompressBinEllpackKernel(
     common::CompressedBufferWriter wr,
     common::CompressedByteT* __restrict__ buffer,  // gidx_buffer
     const size_t* __restrict__ row_ptrs,           // row offset of input data
-    const Entry* __restrict__ entries,      // One batch of input data
-    const float* __restrict__ cuts,         // HistogramCuts::cut_values_
-    const uint32_t* __restrict__ cut_ptrs,  // HistogramCuts::cut_ptrs_
+    const Entry* __restrict__ entries,             // One batch of input data
+    const float* __restrict__ cuts,                // HistogramCuts::cut_values_
+    const uint32_t* __restrict__ cut_ptrs,         // HistogramCuts::cut_ptrs_
     common::Span<FeatureType const> feature_types,
-    size_t base_row,                        // batch_row_begin
-    size_t n_rows,
-    size_t row_stride,
-    unsigned int null_gidx_value) {
+    size_t base_row,  // batch_row_begin
+    size_t n_rows, size_t row_stride, std::uint32_t null_gidx_value) {
   size_t irow = threadIdx.x + blockIdx.x * blockDim.x;
   int ifeature = threadIdx.y + blockIdx.y * blockDim.y;
   if (irow >= n_rows || ifeature >= row_stride) {
     return;
   }
   int row_length = static_cast<int>(row_ptrs[irow + 1] - row_ptrs[irow]);
-  unsigned int bin = null_gidx_value;
+  std::uint32_t bin = null_gidx_value;
   if (ifeature < row_length) {
     Entry entry = entries[row_ptrs[irow] - row_ptrs[0] + ifeature];
     int feature = entry.index;
@@ -89,25 +88,23 @@ __global__ void CompressBinEllpackKernel(
 }
 
 // Construct an ELLPACK matrix with the given number of empty rows.
-EllpackPageImpl::EllpackPageImpl(DeviceOrd device,
+EllpackPageImpl::EllpackPageImpl(Context const* ctx,
                                  std::shared_ptr<common::HistogramCuts const> cuts, bool is_dense,
                                  bst_idx_t row_stride, bst_idx_t n_rows)
     : is_dense(is_dense), cuts_(std::move(cuts)), row_stride{row_stride}, n_rows{n_rows} {
   monitor_.Init("ellpack_page");
-  dh::safe_cuda(cudaSetDevice(device.ordinal));
+  dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));
 
-  monitor_.Start("InitCompressedData");
-  this->InitCompressedData(device);
-  monitor_.Stop("InitCompressedData");
+  this->InitCompressedData(ctx);
 }
 
-EllpackPageImpl::EllpackPageImpl(DeviceOrd device,
+EllpackPageImpl::EllpackPageImpl(Context const* ctx,
                                  std::shared_ptr<common::HistogramCuts const> cuts,
                                  const SparsePage& page, bool is_dense, size_t row_stride,
                                  common::Span<FeatureType const> feature_types)
     : cuts_(std::move(cuts)), is_dense(is_dense), n_rows(page.Size()), row_stride(row_stride) {
-  this->InitCompressedData(device);
-  this->CreateHistIndices(device, page, feature_types);
+  this->InitCompressedData(ctx);
+  this->CreateHistIndices(ctx->Device(), page, feature_types);
 }
 
 // Construct an ELLPACK matrix in memory.
@@ -129,9 +126,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchP
   }
   monitor_.Stop("Quantiles");
 
-  monitor_.Start("InitCompressedData");
-  this->InitCompressedData(ctx->Device());
-  monitor_.Stop("InitCompressedData");
+  this->InitCompressedData(ctx);
 
   dmat->Info().feature_types.SetDevice(ctx->Device());
   auto ft = dmat->Info().feature_types.ConstDeviceSpan();
@@ -234,7 +229,7 @@ void CopyDataToEllpack(const AdapterBatchT& batch, common::Span<FeatureType cons
 
   auto device_accessor = dst->GetDeviceAccessor(device);
   common::CompressedBufferWriter writer(device_accessor.NumSymbols());
-  auto d_compressed_buffer = dst->gidx_buffer.DevicePointer();
+  auto d_compressed_buffer = dst->gidx_buffer.data();
 
   // We redirect the scan output into this functor to do the actual writing
   WriteCompressedEllpackFunctor<AdapterBatchT> functor(
@@ -275,7 +270,7 @@ void WriteNullValues(EllpackPageImpl* dst, DeviceOrd device, common::Span<size_t
   // Write the null values
   auto device_accessor = dst->GetDeviceAccessor(device);
   common::CompressedBufferWriter writer(device_accessor.NumSymbols());
-  auto d_compressed_buffer = dst->gidx_buffer.DevicePointer();
+  auto d_compressed_buffer = dst->gidx_buffer.data();
   auto row_stride = dst->row_stride;
   dh::LaunchN(row_stride * dst->n_rows, [=] __device__(size_t idx) {
     // For some reason this variable got captured as const
@@ -290,20 +285,20 @@ void WriteNullValues(EllpackPageImpl* dst, DeviceOrd device, common::Span<size_t
 }
 
 template <typename AdapterBatch>
-EllpackPageImpl::EllpackPageImpl(AdapterBatch batch, float missing, DeviceOrd device, bool is_dense,
-                                 common::Span<size_t> row_counts_span,
+EllpackPageImpl::EllpackPageImpl(Context const* ctx, AdapterBatch batch, float missing,
+                                 bool is_dense, common::Span<size_t> row_counts_span,
                                  common::Span<FeatureType const> feature_types, size_t row_stride,
                                  size_t n_rows, std::shared_ptr<common::HistogramCuts const> cuts) {
-  dh::safe_cuda(cudaSetDevice(device.ordinal));
+  dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));
 
-  *this = EllpackPageImpl(device, cuts, is_dense, row_stride, n_rows);
-  CopyDataToEllpack(batch, feature_types, this, device, missing);
-  WriteNullValues(this, device, row_counts_span);
+  *this = EllpackPageImpl(ctx, cuts, is_dense, row_stride, n_rows);
+  CopyDataToEllpack(batch, feature_types, this, ctx->Device(), missing);
+  WriteNullValues(this, ctx->Device(), row_counts_span);
 }
 
 #define ELLPACK_BATCH_SPECIALIZE(__BATCH_T)                                                \
   template EllpackPageImpl::EllpackPageImpl(                                               \
-      __BATCH_T batch, float missing, DeviceOrd device, bool is_dense,                     \
+      Context const* ctx, __BATCH_T batch, float missing, bool is_dense,                   \
       common::Span<size_t> row_counts_span, common::Span<FeatureType const> feature_types, \
       size_t row_stride, size_t n_rows, std::shared_ptr<common::HistogramCuts const> cuts);
 
@@ -365,12 +360,10 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag
   row_stride = *std::max_element(it, it + page.Size());
 
   CHECK(ctx->IsCUDA());
-  monitor_.Start("InitCompressedData");
-  InitCompressedData(ctx->Device());
-  monitor_.Stop("InitCompressedData");
+  InitCompressedData(ctx);
 
   // copy gidx
-  common::CompressedByteT* d_compressed_buffer = gidx_buffer.DevicePointer();
+  common::CompressedByteT* d_compressed_buffer = gidx_buffer.data();
   dh::device_vector<size_t> row_ptr(page.row_ptr.size());
   auto d_row_ptr = dh::ToSpan(row_ptr);
   dh::safe_cuda(cudaMemcpyAsync(d_row_ptr.data(), page.row_ptr.data(), d_row_ptr.size_bytes(),
@@ -389,20 +382,20 @@ struct CopyPage {
   // The number of elements to skip.
   size_t offset;
 
-  CopyPage(EllpackPageImpl *dst, EllpackPageImpl const *src, size_t offset)
-      : cbw{dst->NumSymbols()}, dst_data_d{dst->gidx_buffer.DevicePointer()},
-        src_iterator_d{src->gidx_buffer.DevicePointer(), src->NumSymbols()},
+  CopyPage(EllpackPageImpl* dst, EllpackPageImpl const* src, size_t offset)
+      : cbw{dst->NumSymbols()},
+        dst_data_d{dst->gidx_buffer.data()},
+        src_iterator_d{src->gidx_buffer.data(), src->NumSymbols()},
         offset(offset) {}
 
   __device__ void operator()(size_t element_id) {
-    cbw.AtomicWriteSymbol(dst_data_d, src_iterator_d[element_id],
-                          element_id + offset);
+    cbw.AtomicWriteSymbol(dst_data_d, src_iterator_d[element_id], element_id + offset);
   }
 };
 
 // Copy the data from the given EllpackPage to the current page.
-size_t EllpackPageImpl::Copy(DeviceOrd device, EllpackPageImpl const* page, size_t offset) {
-  monitor_.Start("Copy");
+size_t EllpackPageImpl::Copy(Context const* ctx, EllpackPageImpl const* page, bst_idx_t offset) {
+  monitor_.Start(__func__);
   bst_idx_t num_elements = page->n_rows * page->row_stride;
   CHECK_EQ(row_stride, page->row_stride);
   CHECK_EQ(NumSymbols(), page->NumSymbols());
@@ -411,10 +404,8 @@ size_t EllpackPageImpl::Copy(DeviceOrd device, EllpackPageImpl const* page, size
     LOG(FATAL) << "Concatenating the same Ellpack.";
     return this->n_rows * this->row_stride;
   }
-  gidx_buffer.SetDevice(device);
-  page->gidx_buffer.SetDevice(device);
-  dh::LaunchN(num_elements, CopyPage(this, page, offset));
-  monitor_.Stop("Copy");
+  dh::LaunchN(num_elements, ctx->CUDACtx()->Stream(), CopyPage{this, page, offset});
+  monitor_.Stop(__func__);
   return num_elements;
 }
 
@@ -423,8 +414,8 @@ struct CompactPage {
   common::CompressedBufferWriter cbw;
   common::CompressedByteT* dst_data_d;
   common::CompressedIterator<uint32_t> src_iterator_d;
-  /*! \brief An array that maps the rows from the full DMatrix to the compacted
-   * page.
+  /**
+   * @brief An array that maps the rows from the full DMatrix to the compacted page.
    *
    * The total size is the number of rows in the original, uncompacted DMatrix.
    * Elements are the row ids in the compacted page. Rows not needed are set to
@@ -438,24 +429,24 @@ struct CompactPage {
   size_t base_rowid;
   size_t row_stride;
 
-  CompactPage(EllpackPageImpl* dst, EllpackPageImpl const* src,
-              common::Span<size_t> row_indexes)
+  CompactPage(EllpackPageImpl* dst, EllpackPageImpl const* src, common::Span<size_t> row_indexes)
       : cbw{dst->NumSymbols()},
-        dst_data_d{dst->gidx_buffer.DevicePointer()},
-        src_iterator_d{src->gidx_buffer.DevicePointer(), src->NumSymbols()},
+        dst_data_d{dst->gidx_buffer.data()},
+        src_iterator_d{src->gidx_buffer.data(), src->NumSymbols()},
         row_indexes(row_indexes),
         base_rowid{src->base_rowid},
         row_stride{src->row_stride} {}
 
-  __device__ void operator()(size_t row_id) {
+  __device__ void operator()(bst_idx_t row_id) {
     size_t src_row = base_rowid + row_id;
     size_t dst_row = row_indexes[src_row];
-    if (dst_row == SIZE_MAX) return;
+    if (dst_row == SIZE_MAX) {
+      return;
+    }
     size_t dst_offset = dst_row * row_stride;
     size_t src_offset = row_id * row_stride;
     for (size_t j = 0; j < row_stride; j++) {
-      cbw.AtomicWriteSymbol(dst_data_d, src_iterator_d[src_offset + j],
-                            dst_offset + j);
+      cbw.AtomicWriteSymbol(dst_data_d, src_iterator_d[src_offset + j], dst_offset + j);
     }
   }
 };
@@ -467,28 +458,22 @@ void EllpackPageImpl::Compact(Context const* ctx, EllpackPageImpl const* page,
   CHECK_EQ(row_stride, page->row_stride);
   CHECK_EQ(NumSymbols(), page->NumSymbols());
   CHECK_LE(page->base_rowid + page->n_rows, row_indexes.size());
-  gidx_buffer.SetDevice(ctx->Device());
-  page->gidx_buffer.SetDevice(ctx->Device());
   auto cuctx = ctx->CUDACtx();
-  dh::LaunchN(page->n_rows, cuctx->Stream(), CompactPage(this, page, row_indexes));
+  dh::LaunchN(page->n_rows, cuctx->Stream(), CompactPage{this, page, row_indexes});
   monitor_.Stop(__func__);
 }
 
 // Initialize the buffer to stored compressed features.
-void EllpackPageImpl::InitCompressedData(DeviceOrd device) {
-  size_t num_symbols = NumSymbols();
+void EllpackPageImpl::InitCompressedData(Context const* ctx) {
+  monitor_.Start(__func__);
+  auto num_symbols = NumSymbols();
 
   // Required buffer size for storing data matrix in ELLPack format.
-  size_t compressed_size_bytes =
+  std::size_t compressed_size_bytes =
       common::CompressedBufferWriter::CalculateBufferSize(row_stride * n_rows, num_symbols);
-  gidx_buffer.SetDevice(device);
-  // Don't call fill unnecessarily
-  if (gidx_buffer.Size() == 0) {
-    gidx_buffer.Resize(compressed_size_bytes, 0);
-  } else {
-    gidx_buffer.Resize(compressed_size_bytes, 0);
-    thrust::fill(dh::tbegin(gidx_buffer), dh::tend(gidx_buffer), 0);
-  }
+  auto init = static_cast<common::CompressedByteT>(0);
+  gidx_buffer = common::MakeFixedVecWithCudaMalloc(ctx, compressed_size_bytes, init);
+  monitor_.Stop(__func__);
 }
 
 // Compress a CSR page into ELLPACK.
@@ -496,7 +481,7 @@ void EllpackPageImpl::CreateHistIndices(DeviceOrd device,
                                         const SparsePage& row_batch,
                                         common::Span<FeatureType const> feature_types) {
   if (row_batch.Size() == 0) return;
-  unsigned int null_gidx_value = NumSymbols() - 1;
+  std::uint32_t null_gidx_value = NumSymbols() - 1;
 
   const auto& offset_vec = row_batch.offset.ConstHostVector();
 
@@ -541,13 +526,11 @@ void EllpackPageImpl::CreateHistIndices(DeviceOrd device,
     const dim3 grid3(common::DivRoundUp(batch_nrows, block3.x),
                      common::DivRoundUp(row_stride, block3.y), 1);
     auto device_accessor = GetDeviceAccessor(device);
-    dh::LaunchKernel {grid3, block3}(
-        CompressBinEllpackKernel, common::CompressedBufferWriter(NumSymbols()),
-        gidx_buffer.DevicePointer(), row_ptrs.data().get(),
-        entries_d.data().get(), device_accessor.gidx_fvalue_map.data(),
-        device_accessor.feature_segments.data(), feature_types,
-        batch_row_begin, batch_nrows, row_stride,
-        null_gidx_value);
+    dh::LaunchKernel{grid3, block3}(  // NOLINT
+        CompressBinEllpackKernel, common::CompressedBufferWriter(NumSymbols()), gidx_buffer.data(),
+        row_ptrs.data().get(), entries_d.data().get(), device_accessor.gidx_fvalue_map.data(),
+        device_accessor.feature_segments.data(), feature_types, batch_row_begin, batch_nrows,
+        row_stride, null_gidx_value);
   }
 }
 
@@ -566,26 +549,31 @@ size_t EllpackPageImpl::MemCostBytes(size_t num_rows, size_t row_stride,
 
 EllpackDeviceAccessor EllpackPageImpl::GetDeviceAccessor(
     DeviceOrd device, common::Span<FeatureType const> feature_types) const {
-  gidx_buffer.SetDevice(device);
   return {device,
           cuts_,
           is_dense,
           row_stride,
           base_rowid,
           n_rows,
-          common::CompressedIterator<uint32_t>(gidx_buffer.ConstDevicePointer(),
-                                               NumSymbols()),
+          common::CompressedIterator<uint32_t>(gidx_buffer.data(), NumSymbols()),
           feature_types};
 }
+
 EllpackDeviceAccessor EllpackPageImpl::GetHostAccessor(
+    Context const* ctx, std::vector<common::CompressedByteT>* h_gidx_buffer,
     common::Span<FeatureType const> feature_types) const {
+  h_gidx_buffer->resize(gidx_buffer.size());
+  CHECK_EQ(h_gidx_buffer->size(), gidx_buffer.size());
+  CHECK_NE(gidx_buffer.size(), 0);
+  dh::safe_cuda(cudaMemcpyAsync(h_gidx_buffer->data(), gidx_buffer.data(), gidx_buffer.size_bytes(),
+                                cudaMemcpyDefault, dh::DefaultStream()));
   return {DeviceOrd::CPU(),
           cuts_,
           is_dense,
           row_stride,
           base_rowid,
           n_rows,
-          common::CompressedIterator<uint32_t>(gidx_buffer.ConstHostPointer(), NumSymbols()),
+          common::CompressedIterator<uint32_t>(h_gidx_buffer->data(), NumSymbols()),
           feature_types};
 }
 }  // namespace xgboost
diff --git a/src/data/ellpack_page.cuh b/src/data/ellpack_page.cuh
index d1f9472df4c4..18b9384afbd7 100644
--- a/src/data/ellpack_page.cuh
+++ b/src/data/ellpack_page.cuh
@@ -1,23 +1,25 @@
 /**
- * Copyright 2019-2023, XGBoost Contributors
+ * Copyright 2019-2024, XGBoost Contributors
  */
-
 #ifndef XGBOOST_DATA_ELLPACK_PAGE_CUH_
 #define XGBOOST_DATA_ELLPACK_PAGE_CUH_
 
 #include <thrust/binary_search.h>
-#include <xgboost/data.h>
 
 #include "../common/categorical.h"
 #include "../common/compressed_iterator.h"
 #include "../common/device_helpers.cuh"
 #include "../common/hist_util.h"
+#include "../common/ref_resource_view.h"  // for RefResourceView
 #include "ellpack_page.h"
+#include "xgboost/data.h"
 
 namespace xgboost {
-/** \brief Struct for accessing and manipulating an ELLPACK matrix on the
- * device. Does not own underlying memory and may be trivially copied into
- * kernels.*/
+/**
+ * @brief Struct for accessing and manipulating an ELLPACK matrix on the device.
+ *
+ * Does not own underlying memory and may be trivially copied into kernels.
+ */
 struct EllpackDeviceAccessor {
   /*! \brief Whether or not if the matrix is dense. */
   bool is_dense;
@@ -128,31 +130,31 @@ class GHistIndexMatrix;
 
 class EllpackPageImpl {
  public:
-  /*!
-   * \brief Default constructor.
+  /**
+   * @brief Default constructor.
    *
    * This is used in the external memory case. An empty ELLPACK page is constructed with its content
    * set later by the reader.
    */
   EllpackPageImpl() = default;
 
-  /*!
-   * \brief Constructor from an existing EllpackInfo.
+  /**
+   * @brief Constructor from an existing EllpackInfo.
    *
-   * This is used in the sampling case. The ELLPACK page is constructed from an existing EllpackInfo
-   * and the given number of rows.
+   * This is used in the sampling case. The ELLPACK page is constructed from an existing
+   * Ellpack page and the given number of rows.
    */
-  EllpackPageImpl(DeviceOrd device, std::shared_ptr<common::HistogramCuts const> cuts,
+  EllpackPageImpl(Context const* ctx, std::shared_ptr<common::HistogramCuts const> cuts,
                   bool is_dense, bst_idx_t row_stride, bst_idx_t n_rows);
-  /*!
-   * \brief Constructor used for external memory.
+  /**
+   * @brief Constructor used for external memory.
    */
-  EllpackPageImpl(DeviceOrd device, std::shared_ptr<common::HistogramCuts const> cuts,
+  EllpackPageImpl(Context const* ctx, std::shared_ptr<common::HistogramCuts const> cuts,
                   const SparsePage& page, bool is_dense, size_t row_stride,
                   common::Span<FeatureType const> feature_types);
 
-  /*!
-   * \brief Constructor from an existing DMatrix.
+  /**
+   * @brief Constructor from an existing DMatrix.
    *
    * This is used in the in-memory case. The ELLPACK page is constructed from an existing DMatrix
    * in CSR format.
@@ -160,37 +162,39 @@ class EllpackPageImpl {
   explicit EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchParam& parm);
 
   template <typename AdapterBatch>
-  explicit EllpackPageImpl(AdapterBatch batch, float missing, DeviceOrd device, bool is_dense,
+  explicit EllpackPageImpl(Context const* ctx, AdapterBatch batch, float missing, bool is_dense,
                            common::Span<size_t> row_counts_span,
                            common::Span<FeatureType const> feature_types, size_t row_stride,
                            size_t n_rows, std::shared_ptr<common::HistogramCuts const> cuts);
   /**
-   * \brief Constructor from an existing CPU gradient index.
+   * @brief Constructor from an existing CPU gradient index.
    */
   explicit EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& page,
                            common::Span<FeatureType const> ft);
 
-  /*! \brief Copy the elements of the given ELLPACK page into this page.
+  /**
+   * @brief Copy the elements of the given ELLPACK page into this page.
    *
-   * @param device The GPU device to use.
+   * @param ctx The GPU context.
    * @param page The ELLPACK page to copy from.
    * @param offset The number of elements to skip before copying.
    * @returns The number of elements copied.
    */
-  size_t Copy(DeviceOrd device, EllpackPageImpl const *page, size_t offset);
+  bst_idx_t Copy(Context const* ctx, EllpackPageImpl const* page, bst_idx_t offset);
 
-  /*! \brief Compact the given ELLPACK page into the current page.
+  /**
+   * @brief Compact the given ELLPACK page into the current page.
    *
-   * @param context The GPU context.
+   * @param ctx The GPU context.
    * @param page The ELLPACK page to compact from.
    * @param row_indexes Row indexes for the compacted page.
    */
   void Compact(Context const* ctx, EllpackPageImpl const* page, common::Span<size_t> row_indexes);
 
-  /*! \return Number of instances in the page. */
+  /** @return Number of instances in the page. */
   [[nodiscard]] bst_idx_t Size() const;
 
-  /*! \brief Set the base row id for this page. */
+  /** @brief Set the base row id for this page. */
   void SetBaseRowId(std::size_t row_id) {
     base_rowid = row_id;
   }
@@ -199,43 +203,54 @@ class EllpackPageImpl {
   [[nodiscard]] std::shared_ptr<common::HistogramCuts const> CutsShared() const { return cuts_; }
   void SetCuts(std::shared_ptr<common::HistogramCuts const> cuts) { cuts_ = cuts; }
 
-  /*! \return Estimation of memory cost of this page. */
+  /** @return Estimation of memory cost of this page. */
   static size_t MemCostBytes(size_t num_rows, size_t row_stride, const common::HistogramCuts&cuts) ;
 
 
-  /*! \brief Return the total number of symbols (total number of bins plus 1 for
-   * not found). */
+  /**
+   * @brief Return the total number of symbols (total number of bins plus 1 for not
+   *        found).
+   */
   [[nodiscard]] std::size_t NumSymbols() const { return cuts_->TotalBins() + 1; }
-
+  /**
+   * @brief Get an accessor that can be passed into CUDA kernels.
+   */
   [[nodiscard]] EllpackDeviceAccessor GetDeviceAccessor(
       DeviceOrd device, common::Span<FeatureType const> feature_types = {}) const;
+  /**
+   * @brief Get an accessor for host code.
+   */
   [[nodiscard]] EllpackDeviceAccessor GetHostAccessor(
+      Context const* ctx, std::vector<common::CompressedByteT>* h_gidx_buffer,
       common::Span<FeatureType const> feature_types = {}) const;
 
  private:
-  /*!
-   * \brief Compress a single page of CSR data into ELLPACK.
+  /**
+   * @brief Compress a single page of CSR data into ELLPACK.
    *
    * @param device The GPU device to use.
    * @param row_batch The CSR page.
    */
-  void CreateHistIndices(DeviceOrd device,
-                         const SparsePage& row_batch,
+  void CreateHistIndices(DeviceOrd device, const SparsePage& row_batch,
                          common::Span<FeatureType const> feature_types);
-  /*!
-   * \brief Initialize the buffer to store compressed features.
+  /**
+   * @brief Initialize the buffer to store compressed features.
    */
-  void InitCompressedData(DeviceOrd device);
+  void InitCompressedData(Context const* ctx);
 
  public:
-  /*! \brief Whether or not if the matrix is dense. */
+  /** @brief Whether or not if the matrix is dense. */
   bool is_dense;
-  /*! \brief Row length for ELLPACK. */
+  /** @brief Row length for ELLPACK. */
   bst_idx_t row_stride;
   bst_idx_t base_rowid{0};
-  bst_idx_t n_rows{};
-  /*! \brief global index of histogram, which is stored in ELLPACK format. */
-  HostDeviceVector<common::CompressedByteT> gidx_buffer;
+  bst_idx_t n_rows{0};
+  /**
+   * @brief Index of the gradient histogram, which is stored in ELLPACK format.
+   *
+   * This can be backed by various storage types.
+   */
+  common::RefResourceView<common::CompressedByteT> gidx_buffer;
 
  private:
   std::shared_ptr<common::HistogramCuts const> cuts_;
diff --git a/src/data/ellpack_page_raw_format.cu b/src/data/ellpack_page_raw_format.cu
index 059dd9f213a5..86d1ac6da7eb 100644
--- a/src/data/ellpack_page_raw_format.cu
+++ b/src/data/ellpack_page_raw_format.cu
@@ -4,11 +4,13 @@
 #include <dmlc/registry.h>
 
 #include <cstddef>  // for size_t
-#include <cstdint>  // for uint64_t
+#include <vector>   // for vector
 
-#include "../common/io.h"                 // for AlignedResourceReadStream, AlignedFileWriteStream
-#include "../common/ref_resource_view.h"  // for ReadVec, WriteVec
-#include "ellpack_page.cuh"               // for EllpackPage
+#include "../common/cuda_rt_utils.h"
+#include "../common/io.h"                   // for AlignedResourceReadStream, AlignedFileWriteStream
+#include "../common/ref_resource_view.cuh"  // for MakeFixedVecWithCudaMalloc
+#include "../common/ref_resource_view.h"    // for ReadVec, WriteVec
+#include "ellpack_page.cuh"                 // for EllpackPage
 #include "ellpack_page_raw_format.h"
 #include "ellpack_page_source.h"
 
@@ -16,8 +18,12 @@ namespace xgboost::data {
 DMLC_REGISTRY_FILE_TAG(ellpack_page_raw_format);
 
 namespace {
+// Function to support system without HMM or ATS
 template <typename T>
-[[nodiscard]] bool ReadDeviceVec(common::AlignedResourceReadStream* fi, HostDeviceVector<T>* vec) {
+[[nodiscard]] bool ReadDeviceVec(common::AlignedResourceReadStream* fi,
+                                 common::RefResourceView<T>* vec) {
+  xgboost_NVTX_FN_RANGE();
+
   std::uint64_t n{0};
   if (!fi->Read(&n)) {
     return false;
@@ -33,82 +39,82 @@ template <typename T>
     return false;
   }
 
-  vec->Resize(n);
-  auto d_vec = vec->DeviceSpan();
-  dh::safe_cuda(
-      cudaMemcpyAsync(d_vec.data(), ptr, n_bytes, cudaMemcpyDefault, dh::DefaultStream()));
+  auto ctx = Context{}.MakeCUDA(common::CurrentDevice());
+  *vec = common::MakeFixedVecWithCudaMalloc<T>(&ctx, n);
+  dh::safe_cuda(cudaMemcpyAsync(vec->data(), ptr, n_bytes, cudaMemcpyDefault, dh::DefaultStream()));
   return true;
 }
 }  // namespace
 
+#define RET_IF_NOT(expr) \
+  if (!(expr)) {         \
+    return false;        \
+  }
+
 [[nodiscard]] bool EllpackPageRawFormat::Read(EllpackPage* page,
                                               common::AlignedResourceReadStream* fi) {
+  xgboost_NVTX_FN_RANGE();
   auto* impl = page->Impl();
+
   impl->SetCuts(this->cuts_);
-  if (!fi->Read(&impl->n_rows)) {
-    return false;
-  }
-  if (!fi->Read(&impl->is_dense)) {
-    return false;
-  }
-  if (!fi->Read(&impl->row_stride)) {
-    return false;
-  }
-  impl->gidx_buffer.SetDevice(device_);
-  if (!ReadDeviceVec(fi, &impl->gidx_buffer)) {
-    return false;
-  }
-  if (!fi->Read(&impl->base_rowid)) {
-    return false;
+  RET_IF_NOT(fi->Read(&impl->n_rows));
+  RET_IF_NOT(fi->Read(&impl->is_dense));
+  RET_IF_NOT(fi->Read(&impl->row_stride));
+
+  if (has_hmm_ats_) {
+    RET_IF_NOT(common::ReadVec(fi, &impl->gidx_buffer));
+  } else {
+    RET_IF_NOT(ReadDeviceVec(fi, &impl->gidx_buffer));
   }
+  RET_IF_NOT(fi->Read(&impl->base_rowid));
+  dh::DefaultStream().Sync();
   return true;
 }
 
 [[nodiscard]] std::size_t EllpackPageRawFormat::Write(const EllpackPage& page,
                                                       common::AlignedFileWriteStream* fo) {
+  xgboost_NVTX_FN_RANGE();
+
   std::size_t bytes{0};
   auto* impl = page.Impl();
   bytes += fo->Write(impl->n_rows);
   bytes += fo->Write(impl->is_dense);
   bytes += fo->Write(impl->row_stride);
-  CHECK(!impl->gidx_buffer.ConstHostVector().empty());
-  bytes += common::WriteVec(fo, impl->gidx_buffer.HostVector());
+  std::vector<common::CompressedByteT> h_gidx_buffer;
+  Context ctx = Context{}.MakeCUDA(common::CurrentDevice());
+  [[maybe_unused]] auto h_accessor = impl->GetHostAccessor(&ctx, &h_gidx_buffer);
+  bytes += common::WriteVec(fo, h_gidx_buffer);
   bytes += fo->Write(impl->base_rowid);
   dh::DefaultStream().Sync();
   return bytes;
 }
 
 [[nodiscard]] bool EllpackPageRawFormat::Read(EllpackPage* page, EllpackHostCacheStream* fi) const {
+  xgboost_NVTX_FN_RANGE();
+
   auto* impl = page->Impl();
   CHECK(this->cuts_->cut_values_.DeviceCanRead());
   impl->SetCuts(this->cuts_);
-  if (!fi->Read(&impl->n_rows)) {
-    return false;
-  }
-  if (!fi->Read(&impl->is_dense)) {
-    return false;
-  }
-  if (!fi->Read(&impl->row_stride)) {
-    return false;
-  }
 
-  // Read vec
-  bst_idx_t n{0};
-  if (!fi->Read(&n)) {
-    return false;
-  }
-  if (n != 0) {
-    impl->gidx_buffer.SetDevice(device_);
-    impl->gidx_buffer.Resize(n);
-    auto span = impl->gidx_buffer.DeviceSpan();
-    if (!fi->Read(span.data(), span.size_bytes())) {
-      return false;
+  // Read vector
+  Context ctx = Context{}.MakeCUDA(common::CurrentDevice());
+  auto read_vec = [&] {
+    common::NvtxScopedRange range{common::NvtxEventAttr{"read-vec", common::NvtxRgb{127, 255, 0}}};
+    bst_idx_t n{0};
+    RET_IF_NOT(fi->Read(&n));
+    if (n == 0) {
+      return true;
     }
-  }
+    impl->gidx_buffer = common::MakeFixedVecWithCudaMalloc<common::CompressedByteT>(&ctx, n);
+    RET_IF_NOT(fi->Read(impl->gidx_buffer.data(), impl->gidx_buffer.size_bytes()));
+    return true;
+  };
+  RET_IF_NOT(read_vec());
 
-  if (!fi->Read(&impl->base_rowid)) {
-    return false;
-  }
+  RET_IF_NOT(fi->Read(&impl->n_rows));
+  RET_IF_NOT(fi->Read(&impl->is_dense));
+  RET_IF_NOT(fi->Read(&impl->row_stride));
+  RET_IF_NOT(fi->Read(&impl->base_rowid));
 
   dh::DefaultStream().Sync();
   return true;
@@ -116,23 +122,32 @@ template <typename T>
 
 [[nodiscard]] std::size_t EllpackPageRawFormat::Write(const EllpackPage& page,
                                                       EllpackHostCacheStream* fo) const {
+  xgboost_NVTX_FN_RANGE();
+
   bst_idx_t bytes{0};
   auto* impl = page.Impl();
-  bytes += fo->Write(impl->n_rows);
-  bytes += fo->Write(impl->is_dense);
-  bytes += fo->Write(impl->row_stride);
 
   // Write vector
-  bst_idx_t n = impl->gidx_buffer.Size();
-  bytes += fo->Write(n);
+  auto write_vec = [&] {
+    common::NvtxScopedRange range{common::NvtxEventAttr{"write-vec", common::NvtxRgb{127, 255, 0}}};
+    bst_idx_t n = impl->gidx_buffer.size();
+    bytes += fo->Write(n);
 
-  if (!impl->gidx_buffer.Empty()) {
-    auto span = impl->gidx_buffer.ConstDeviceSpan();
-    bytes += fo->Write(span.data(), span.size_bytes());
-  }
+    if (!impl->gidx_buffer.empty()) {
+      bytes += fo->Write(impl->gidx_buffer.data(), impl->gidx_buffer.size_bytes());
+    }
+  };
+
+  write_vec();
+
+  bytes += fo->Write(impl->n_rows);
+  bytes += fo->Write(impl->is_dense);
+  bytes += fo->Write(impl->row_stride);
   bytes += fo->Write(impl->base_rowid);
 
   dh::DefaultStream().Sync();
   return bytes;
 }
+
+#undef RET_IF_NOT
 }  // namespace xgboost::data
diff --git a/src/data/ellpack_page_raw_format.h b/src/data/ellpack_page_raw_format.h
index 8c3f89f0c0b3..e2761c73f27c 100644
--- a/src/data/ellpack_page_raw_format.h
+++ b/src/data/ellpack_page_raw_format.h
@@ -26,10 +26,13 @@ class EllpackHostCacheStream;
 class EllpackPageRawFormat : public SparsePageFormat<EllpackPage> {
   std::shared_ptr<common::HistogramCuts const> cuts_;
   DeviceOrd device_;
+  // Supports CUDA HMM or ATS
+  bool has_hmm_ats_{false};
 
  public:
-  explicit EllpackPageRawFormat(std::shared_ptr<common::HistogramCuts const> cuts, DeviceOrd device)
-      : cuts_{std::move(cuts)}, device_{device} {}
+  explicit EllpackPageRawFormat(std::shared_ptr<common::HistogramCuts const> cuts, DeviceOrd device,
+                                bool has_hmm_ats)
+      : cuts_{std::move(cuts)}, device_{device}, has_hmm_ats_{has_hmm_ats} {}
   [[nodiscard]] bool Read(EllpackPage* page, common::AlignedResourceReadStream* fi) override;
   [[nodiscard]] std::size_t Write(const EllpackPage& page,
                                   common::AlignedFileWriteStream* fo) override;
diff --git a/src/data/ellpack_page_source.cu b/src/data/ellpack_page_source.cu
index f53ae3ef1e2f..a70d9150ca47 100644
--- a/src/data/ellpack_page_source.cu
+++ b/src/data/ellpack_page_source.cu
@@ -11,6 +11,7 @@
 #include "../common/common.h"                 // for safe_cuda
 #include "../common/cuda_pinned_allocator.h"  // for pinned_allocator
 #include "../common/device_helpers.cuh"       // for CUDAStreamView, DefaultStream
+#include "../common/resource.cuh"             // for PrivateCudaMmapConstStream
 #include "ellpack_page.cuh"                   // for EllpackPageImpl
 #include "ellpack_page.h"                     // for EllpackPage
 #include "ellpack_page_source.h"
@@ -86,16 +87,16 @@ void EllpackHostCacheStream::Seek(bst_idx_t offset_bytes) { this->p_impl_->Seek(
 void EllpackHostCacheStream::Bound(bst_idx_t offset_bytes) { this->p_impl_->Bound(offset_bytes); }
 
 /**
- * EllpackFormatType
+ * EllpackCacheStreamPolicy
  */
 
 template <typename S, template <typename> typename F>
-EllpackFormatStreamPolicy<S, F>::EllpackFormatStreamPolicy()
+EllpackCacheStreamPolicy<S, F>::EllpackCacheStreamPolicy()
     : p_cache_{std::make_shared<EllpackHostCache>()} {}
 
 template <typename S, template <typename> typename F>
-[[nodiscard]] std::unique_ptr<typename EllpackFormatStreamPolicy<S, F>::WriterT>
-EllpackFormatStreamPolicy<S, F>::CreateWriter(StringView, std::uint32_t iter) {
+[[nodiscard]] std::unique_ptr<typename EllpackCacheStreamPolicy<S, F>::WriterT>
+EllpackCacheStreamPolicy<S, F>::CreateWriter(StringView, std::uint32_t iter) {
   auto fo = std::make_unique<EllpackHostCacheStream>(this->p_cache_);
   if (iter == 0) {
     CHECK(this->p_cache_->cache.empty());
@@ -106,9 +107,8 @@ EllpackFormatStreamPolicy<S, F>::CreateWriter(StringView, std::uint32_t iter) {
 }
 
 template <typename S, template <typename> typename F>
-[[nodiscard]] std::unique_ptr<typename EllpackFormatStreamPolicy<S, F>::ReaderT>
-EllpackFormatStreamPolicy<S, F>::CreateReader(StringView, bst_idx_t offset,
-                                              bst_idx_t length) const {
+[[nodiscard]] std::unique_ptr<typename EllpackCacheStreamPolicy<S, F>::ReaderT>
+EllpackCacheStreamPolicy<S, F>::CreateReader(StringView, bst_idx_t offset, bst_idx_t length) const {
   auto fi = std::make_unique<ReaderT>(this->p_cache_);
   fi->Seek(offset);
   fi->Bound(offset + length);
@@ -117,18 +117,40 @@ EllpackFormatStreamPolicy<S, F>::CreateReader(StringView, bst_idx_t offset,
 }
 
 // Instantiation
-template EllpackFormatStreamPolicy<EllpackPage, EllpackFormatPolicy>::EllpackFormatStreamPolicy();
+template EllpackCacheStreamPolicy<EllpackPage, EllpackFormatPolicy>::EllpackCacheStreamPolicy();
 
 template std::unique_ptr<
-    typename EllpackFormatStreamPolicy<EllpackPage, EllpackFormatPolicy>::WriterT>
-EllpackFormatStreamPolicy<EllpackPage, EllpackFormatPolicy>::CreateWriter(StringView name,
-                                                                          std::uint32_t iter);
+    typename EllpackCacheStreamPolicy<EllpackPage, EllpackFormatPolicy>::WriterT>
+EllpackCacheStreamPolicy<EllpackPage, EllpackFormatPolicy>::CreateWriter(StringView name,
+                                                                         std::uint32_t iter);
 
 template std::unique_ptr<
-    typename EllpackFormatStreamPolicy<EllpackPage, EllpackFormatPolicy>::ReaderT>
-EllpackFormatStreamPolicy<EllpackPage, EllpackFormatPolicy>::CreateReader(
+    typename EllpackCacheStreamPolicy<EllpackPage, EllpackFormatPolicy>::ReaderT>
+EllpackCacheStreamPolicy<EllpackPage, EllpackFormatPolicy>::CreateReader(
     StringView name, std::uint64_t offset, std::uint64_t length) const;
 
+/**
+ * EllpackMmapStreamPolicy
+ */
+
+template <typename S, template <typename> typename F>
+[[nodiscard]] std::unique_ptr<typename EllpackMmapStreamPolicy<S, F>::ReaderT>
+EllpackMmapStreamPolicy<S, F>::CreateReader(StringView name, bst_idx_t offset,
+                                            bst_idx_t length) const {
+  if (has_hmm_) {
+    return std::make_unique<common::PrivateCudaMmapConstStream>(name, offset, length);
+  } else {
+    return std::make_unique<common::PrivateMmapConstStream>(name, offset, length);
+  }
+}
+
+// Instantiation
+template std::unique_ptr<
+    typename EllpackMmapStreamPolicy<EllpackPage, EllpackFormatPolicy>::ReaderT>
+EllpackMmapStreamPolicy<EllpackPage, EllpackFormatPolicy>::CreateReader(StringView name,
+                                                                        bst_idx_t offset,
+                                                                        bst_idx_t length) const;
+
 /**
  * EllpackPageSourceImpl
  */
@@ -146,8 +168,8 @@ void EllpackPageSourceImpl<F>::Fetch() {
     auto const& csr = this->source_->Page();
     this->page_.reset(new EllpackPage{});
     auto* impl = this->page_->Impl();
-    *impl = EllpackPageImpl{this->Device(), this->GetCuts(), *csr,
-                            is_dense_,      row_stride_,     feature_types_};
+    Context ctx = Context{}.MakeCUDA(this->Device().ordinal);
+    *impl = EllpackPageImpl{&ctx, this->GetCuts(), *csr, is_dense_, row_stride_, feature_types_};
     this->page_->SetBaseRowId(csr->base_rowid);
     this->WriteCache();
   }
@@ -157,5 +179,7 @@ void EllpackPageSourceImpl<F>::Fetch() {
 template void
 EllpackPageSourceImpl<DefaultFormatStreamPolicy<EllpackPage, EllpackFormatPolicy>>::Fetch();
 template void
-EllpackPageSourceImpl<EllpackFormatStreamPolicy<EllpackPage, EllpackFormatPolicy>>::Fetch();
+EllpackPageSourceImpl<EllpackCacheStreamPolicy<EllpackPage, EllpackFormatPolicy>>::Fetch();
+template void
+EllpackPageSourceImpl<EllpackMmapStreamPolicy<EllpackPage, EllpackFormatPolicy>>::Fetch();
 }  // namespace xgboost::data
diff --git a/src/data/ellpack_page_source.h b/src/data/ellpack_page_source.h
index 7f50899b974d..1436f9151cf1 100644
--- a/src/data/ellpack_page_source.h
+++ b/src/data/ellpack_page_source.h
@@ -9,6 +9,7 @@
 #include <memory>   // for shared_ptr
 #include <utility>  // for move
 
+#include "../common/cuda_rt_utils.h"  // for SupportsPageableMem
 #include "../common/hist_util.h"      // for HistogramCuts
 #include "ellpack_page.h"             // for EllpackPage
 #include "ellpack_page_raw_format.h"  // for EllpackPageRawFormat
@@ -59,14 +60,19 @@ template <typename S>
 class EllpackFormatPolicy {
   std::shared_ptr<common::HistogramCuts const> cuts_{nullptr};
   DeviceOrd device_;
+  bool has_hmm_{common::SupportsPageableMem()};
 
  public:
   using FormatT = EllpackPageRawFormat;
 
  public:
+  EllpackFormatPolicy() = default;
+  // For testing with the HMM flag.
+  explicit EllpackFormatPolicy(bool has_hmm) : has_hmm_{has_hmm} {}
+
   [[nodiscard]] auto CreatePageFormat() const {
     CHECK_EQ(cuts_->cut_values_.Device(), device_);
-    std::unique_ptr<FormatT> fmt{new EllpackPageRawFormat{cuts_, device_}};
+    std::unique_ptr<FormatT> fmt{new EllpackPageRawFormat{cuts_, device_, has_hmm_}};
     return fmt;
   }
 
@@ -83,7 +89,7 @@ class EllpackFormatPolicy {
 };
 
 template <typename S, template <typename> typename F>
-class EllpackFormatStreamPolicy : public F<S> {
+class EllpackCacheStreamPolicy : public F<S> {
   std::shared_ptr<EllpackHostCache> p_cache_;
 
  public:
@@ -91,13 +97,42 @@ class EllpackFormatStreamPolicy : public F<S> {
   using ReaderT = EllpackHostCacheStream;
 
  public:
-  EllpackFormatStreamPolicy();
+  EllpackCacheStreamPolicy();
   [[nodiscard]] std::unique_ptr<WriterT> CreateWriter(StringView name, std::uint32_t iter);
 
   [[nodiscard]] std::unique_ptr<ReaderT> CreateReader(StringView name, bst_idx_t offset,
                                                       bst_idx_t length) const;
 };
 
+template <typename S, template <typename> typename F>
+class EllpackMmapStreamPolicy : public F<S> {
+  bool has_hmm_{common::SupportsPageableMem()};
+
+ public:
+  using WriterT = common::AlignedFileWriteStream;
+  using ReaderT = common::AlignedResourceReadStream;
+
+ public:
+  EllpackMmapStreamPolicy() = default;
+  // For testing with the HMM flag.
+  template <
+      typename std::enable_if_t<std::is_same_v<F<S>, EllpackFormatPolicy<EllpackPage>>>* = nullptr>
+  explicit EllpackMmapStreamPolicy(bool has_hmm) : F<S>{has_hmm}, has_hmm_{has_hmm} {}
+
+  [[nodiscard]] std::unique_ptr<WriterT> CreateWriter(StringView name, std::uint32_t iter) {
+    std::unique_ptr<common::AlignedFileWriteStream> fo;
+    if (iter == 0) {
+      fo = std::make_unique<common::AlignedFileWriteStream>(name, "wb");
+    } else {
+      fo = std::make_unique<common::AlignedFileWriteStream>(name, "ab");
+    }
+    return fo;
+  }
+
+  [[nodiscard]] std::unique_ptr<ReaderT> CreateReader(StringView name, bst_idx_t offset,
+                                                      bst_idx_t length) const;
+};
+
 template <typename F>
 class EllpackPageSourceImpl : public PageSourceIncMixIn<EllpackPage, F> {
   using Super = PageSourceIncMixIn<EllpackPage, F>;
@@ -128,11 +163,11 @@ class EllpackPageSourceImpl : public PageSourceIncMixIn<EllpackPage, F> {
 
 // Cache to host
 using EllpackPageHostSource =
-    EllpackPageSourceImpl<EllpackFormatStreamPolicy<EllpackPage, EllpackFormatPolicy>>;
+    EllpackPageSourceImpl<EllpackCacheStreamPolicy<EllpackPage, EllpackFormatPolicy>>;
 
 // Cache to disk
 using EllpackPageSource =
-    EllpackPageSourceImpl<DefaultFormatStreamPolicy<EllpackPage, EllpackFormatPolicy>>;
+    EllpackPageSourceImpl<EllpackMmapStreamPolicy<EllpackPage, EllpackFormatPolicy>>;
 
 #if !defined(XGBOOST_USE_CUDA)
 template <typename F>
diff --git a/src/data/gradient_index.cu b/src/data/gradient_index.cu
index 42018eab47e3..f8c8f8d48970 100644
--- a/src/data/gradient_index.cu
+++ b/src/data/gradient_index.cu
@@ -16,7 +16,8 @@ template <typename BinT, typename CompressOffset>
 void SetIndexData(Context const* ctx, EllpackPageImpl const* page,
                   std::vector<size_t>* p_hit_count_tloc, CompressOffset&& get_offset,
                   GHistIndexMatrix* out) {
-  auto accessor = page->GetHostAccessor();
+  std::vector<common::CompressedByteT> h_gidx_buffer;
+  auto accessor = page->GetHostAccessor(ctx, &h_gidx_buffer);
   auto const kNull = static_cast<bst_bin_t>(accessor.NullValue());
 
   common::Span<BinT> index_data_span = {out->index.data<BinT>(), out->index.Size()};
@@ -47,7 +48,8 @@ void GetRowPtrFromEllpack(Context const* ctx, EllpackPageImpl const* page,
   if (page->is_dense) {
     std::fill(row_ptr.begin() + 1, row_ptr.end(), page->row_stride);
   } else {
-    auto accessor = page->GetHostAccessor();
+    std::vector<common::CompressedByteT> h_gidx_buffer;
+    auto accessor = page->GetHostAccessor(ctx, &h_gidx_buffer);
     auto const kNull = static_cast<bst_bin_t>(accessor.NullValue());
 
     common::ParallelFor(page->Size(), ctx->Threads(), [&](auto i) {
diff --git a/src/data/histogram_cut_format.h b/src/data/histogram_cut_format.h
deleted file mode 100644
index d4eb81ad2849..000000000000
--- a/src/data/histogram_cut_format.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/**
- * Copyright 2021-2024, XGBoost contributors
- */
-#ifndef XGBOOST_DATA_HISTOGRAM_CUT_FORMAT_H_
-#define XGBOOST_DATA_HISTOGRAM_CUT_FORMAT_H_
-
-#include <dmlc/io.h>  // for Stream
-
-#include <cstddef>  // for size_t
-
-#include "../common/hist_util.h"          // for HistogramCuts
-#include "../common/io.h"                 // for AlignedResourceReadStream, AlignedFileWriteStream
-#include "../common/ref_resource_view.h"  // for WriteVec, ReadVec
-
-namespace xgboost::data {
-inline bool ReadHistogramCuts(common::HistogramCuts *cuts, common::AlignedResourceReadStream *fi) {
-  if (!common::ReadVec(fi, &cuts->cut_values_.HostVector())) {
-    return false;
-  }
-  if (!common::ReadVec(fi, &cuts->cut_ptrs_.HostVector())) {
-    return false;
-  }
-  if (!common::ReadVec(fi, &cuts->min_vals_.HostVector())) {
-    return false;
-  }
-  bool has_cat{false};
-  if (!fi->Read(&has_cat)) {
-    return false;
-  }
-  decltype(cuts->MaxCategory()) max_cat{0};
-  if (!fi->Read(&max_cat)) {
-    return false;
-  }
-  cuts->SetCategorical(has_cat, max_cat);
-  return true;
-}
-
-inline std::size_t WriteHistogramCuts(common::HistogramCuts const &cuts,
-                                      common::AlignedFileWriteStream *fo) {
-  std::size_t bytes = 0;
-  bytes += common::WriteVec(fo, cuts.Values());
-  bytes += common::WriteVec(fo, cuts.Ptrs());
-  bytes += common::WriteVec(fo, cuts.MinValues());
-  bytes += fo->Write(cuts.HasCategorical());
-  bytes += fo->Write(cuts.MaxCategory());
-  return bytes;
-}
-}  // namespace xgboost::data
-#endif  // XGBOOST_DATA_HISTOGRAM_CUT_FORMAT_H_
diff --git a/src/data/iterative_dmatrix.cu b/src/data/iterative_dmatrix.cu
index 2e8da2c7e7ed..0cb32c5aa107 100644
--- a/src/data/iterative_dmatrix.cu
+++ b/src/data/iterative_dmatrix.cu
@@ -5,6 +5,7 @@
 #include <memory>
 
 #include "../collective/allreduce.h"
+#include "../common/cuda_rt_utils.h"  // for AllVisibleGPUs
 #include "../common/hist_util.cuh"
 #include "batch_utils.h"  // for RegenGHist
 #include "device_adapter.cuh"
@@ -45,11 +46,17 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
 
   int32_t current_device;
   dh::safe_cuda(cudaGetDevice(&current_device));
+  auto get_ctx = [&]() {
+    Context d_ctx = (ctx->IsCUDA()) ? *ctx : Context{}.MakeCUDA(current_device);
+    CHECK(!d_ctx.IsCPU());
+    return d_ctx;
+  };
   auto get_device = [&]() {
     auto d = (ctx->IsCUDA()) ? ctx->Device() : DeviceOrd::CUDA(current_device);
     CHECK(!d.IsCPU());
     return d;
   };
+  fmat_ctx_ = get_ctx();
 
   /**
    * Generate quantiles
@@ -118,7 +125,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
       // that case device id is invalid.
       ellpack_.reset(new EllpackPage);
       *(ellpack_->Impl()) =
-          EllpackPageImpl(get_device(), cuts, this->IsDense(), row_stride, accumulated_rows);
+          EllpackPageImpl(&fmat_ctx_, cuts, this->IsDense(), row_stride, accumulated_rows);
     }
   };
 
@@ -142,10 +149,10 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
     proxy->Info().feature_types.SetDevice(get_device());
     auto d_feature_types = proxy->Info().feature_types.ConstDeviceSpan();
     auto new_impl = cuda_impl::Dispatch(proxy, [&](auto const& value) {
-      return EllpackPageImpl(value, missing, get_device(), is_dense, row_counts_span,
-                             d_feature_types, row_stride, rows, cuts);
+      return EllpackPageImpl(&fmat_ctx_, value, missing, is_dense, row_counts_span, d_feature_types,
+                             row_stride, rows, cuts);
     });
-    size_t num_elements = ellpack_->Impl()->Copy(get_device(), &new_impl, offset);
+    std::size_t num_elements = ellpack_->Impl()->Copy(&fmat_ctx_, &new_impl, offset);
     offset += num_elements;
 
     proxy->Info().num_row_ = num_rows();
diff --git a/src/data/sparse_page_source.h b/src/data/sparse_page_source.h
index 550631b72dc5..62b39886ed62 100644
--- a/src/data/sparse_page_source.h
+++ b/src/data/sparse_page_source.h
@@ -226,7 +226,7 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S>, public FormatStreamPol
     }
     // An heuristic for number of pre-fetched batches.  We can make it part of BatchParam
     // to let user adjust number of pre-fetched batches when needed.
-    std::int32_t kPrefetches = 3;
+    std::int32_t constexpr kPrefetches = 3;
     std::int32_t n_prefetches = std::min(nthreads_, kPrefetches);
     n_prefetches = std::max(n_prefetches, 1);
     std::int32_t n_prefetch_batches = std::min(static_cast<bst_idx_t>(n_prefetches), n_batches_);
diff --git a/src/gbm/gblinear.cc b/src/gbm/gblinear.cc
index 71905debcb3a..2d288fa9d025 100644
--- a/src/gbm/gblinear.cc
+++ b/src/gbm/gblinear.cc
@@ -10,12 +10,12 @@
 
 #include <algorithm>
 #include <numeric>
-#include <sstream>
 #include <string>
 #include <vector>
 
 #include "../common/common.h"
-#include "../common/error_msg.h"  // NoCategorical, DeprecatedFunc
+#include "../common/cuda_rt_utils.h"  // for AllVisibleGPUs
+#include "../common/error_msg.h"      // NoCategorical, DeprecatedFunc
 #include "../common/threading_utils.h"
 #include "../common/timer.h"
 #include "gblinear_model.h"
diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc
index 9ff4abb4d00a..26c768fafea7 100644
--- a/src/gbm/gbtree.cc
+++ b/src/gbm/gbtree.cc
@@ -1,5 +1,5 @@
 /**
- * Copyright 2014-2023 by Contributors
+ * Copyright 2014-2024, XGBoost Contributors
  * \file gbtree.cc
  * \brief gradient boosted tree implementation.
  * \author Tianqi Chen
@@ -10,14 +10,14 @@
 #include <dmlc/parameter.h>
 
 #include <algorithm>  // for equal
-#include <cinttypes>  // for uint32_t
-#include <limits>
+#include <cstdint>    // for uint32_t
 #include <memory>
 #include <string>
 #include <utility>
 #include <vector>
 
 #include "../common/common.h"
+#include "../common/cuda_rt_utils.h"  // for AllVisibleGPUs
 #include "../common/error_msg.h"  // for UnknownDevice, WarnOldSerialization, InplacePredictProxy
 #include "../common/random.h"
 #include "../common/threading_utils.h"
@@ -347,7 +347,7 @@ void GBTree::LoadConfig(Json const& in) {
   // This would cause all trees to be pushed to trees_to_update
   // e.g. updating a model, then saving and loading it would result in an empty model
   tparam_.process_type = TreeProcessType::kDefault;
-  std::int32_t const n_gpus = xgboost::common::AllVisibleGPUs();
+  std::int32_t const n_gpus = common::AllVisibleGPUs();
 
   auto msg = StringView{
       R"(
diff --git a/src/learner.cc b/src/learner.cc
index 8c2e92cebae9..46abdf0bf365 100644
--- a/src/learner.cc
+++ b/src/learner.cc
@@ -1317,7 +1317,7 @@ class LearnerImpl : public LearnerIO {
     std::ostringstream os;
     os.precision(std::numeric_limits<double>::max_digits10);
     os << '[' << iter << ']' << std::setiosflags(std::ios::fixed);
-    if (metrics_.empty() && tparam_.disable_default_eval_metric <= 0) {
+    if (metrics_.empty() && !tparam_.disable_default_eval_metric) {
       metrics_.emplace_back(Metric::Create(obj_->DefaultEvalMetric(), &ctx_));
       auto config = obj_->DefaultMetricConfig();
       if (!IsA<Null>(config)) {
diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu
index 29fb6bb6a162..fe46e19ec63b 100644
--- a/src/predictor/gpu_predictor.cu
+++ b/src/predictor/gpu_predictor.cu
@@ -16,6 +16,7 @@
 #include "../common/categorical.h"
 #include "../common/common.h"
 #include "../common/cuda_context.cuh"  // for CUDAContext
+#include "../common/cuda_rt_utils.h"   // for AllVisibleGPUs
 #include "../common/device_helpers.cuh"
 #include "../common/error_msg.h"  // for InplacePredictProxy
 #include "../data/device_adapter.cuh"
diff --git a/src/tree/fit_stump.cu b/src/tree/fit_stump.cu
index dd71465df1cc..4f1f994a6f38 100644
--- a/src/tree/fit_stump.cu
+++ b/src/tree/fit_stump.cu
@@ -3,9 +3,6 @@
  *
  * @brief Utilities for estimating initial score.
  */
-#if !defined(NOMINMAX) && defined(_WIN32)
-#define NOMINMAX
-#endif                                          // !defined(NOMINMAX)
 #include <thrust/execution_policy.h>            // cuda::par
 #include <thrust/iterator/counting_iterator.h>  // thrust::make_counting_iterator
 
diff --git a/src/tree/fit_stump.h b/src/tree/fit_stump.h
index 2af779f77c46..ab947a659464 100644
--- a/src/tree/fit_stump.h
+++ b/src/tree/fit_stump.h
@@ -1,5 +1,5 @@
 /**
- * Copyright 2022 by XGBoost Contributors
+ * Copyright 2022-2024, XGBoost Contributors
  *
  * \brief Utilities for estimating initial score.
  */
@@ -7,18 +7,12 @@
 #ifndef XGBOOST_TREE_FIT_STUMP_H_
 #define XGBOOST_TREE_FIT_STUMP_H_
 
-#if !defined(NOMINMAX) && defined(_WIN32)
-#define NOMINMAX
-#endif  // !defined(NOMINMAX)
-
 #include <algorithm>  // std::max
 
-#include "../common/common.h"            // AssertGPUSupport
-#include "xgboost/base.h"                // GradientPair
-#include "xgboost/context.h"             // Context
-#include "xgboost/data.h"                // MetaInfo
-#include "xgboost/host_device_vector.h"  // HostDeviceVector
-#include "xgboost/linalg.h"              // TensorView
+#include "xgboost/base.h"     // GradientPair
+#include "xgboost/context.h"  // Context
+#include "xgboost/data.h"     // MetaInfo
+#include "xgboost/linalg.h"   // TensorView
 
 namespace xgboost {
 namespace tree {
diff --git a/src/tree/gpu_hist/gradient_based_sampler.cu b/src/tree/gpu_hist/gradient_based_sampler.cu
index d2031ca21898..3235e9ec3ec1 100644
--- a/src/tree/gpu_hist/gradient_based_sampler.cu
+++ b/src/tree/gpu_hist/gradient_based_sampler.cu
@@ -163,14 +163,14 @@ GradientBasedSample ExternalMemoryNoSampling::Sample(Context const* ctx,
   if (!page_concatenated_) {
     // Concatenate all the external memory ELLPACK pages into a single in-memory page.
     page_.reset(nullptr);
-    size_t offset = 0;
+    bst_idx_t offset = 0;
     for (auto& batch : dmat->GetBatches<EllpackPage>(ctx, batch_param_)) {
       auto page = batch.Impl();
       if (!page_) {
-        page_ = std::make_unique<EllpackPageImpl>(ctx->Device(), page->CutsShared(), page->is_dense,
+        page_ = std::make_unique<EllpackPageImpl>(ctx, page->CutsShared(), page->is_dense,
                                                   page->row_stride, dmat->Info().num_row_);
       }
-      size_t num_elements = page_->Copy(ctx->Device(), page, offset);
+      bst_idx_t num_elements = page_->Copy(ctx, page, offset);
       offset += num_elements;
     }
     page_concatenated_ = true;
@@ -228,11 +228,11 @@ GradientBasedSample ExternalMemoryUniformSampling::Sample(Context const* ctx,
   auto first_page = (*batch_iterator.begin()).Impl();
   // Create a new ELLPACK page with empty rows.
   page_.reset();  // Release the device memory first before reallocating
-  page_.reset(new EllpackPageImpl(ctx->Device(), first_page->CutsShared(), first_page->is_dense,
+  page_.reset(new EllpackPageImpl(ctx, first_page->CutsShared(), first_page->is_dense,
                                   first_page->row_stride, sample_rows));
 
   // Compact the ELLPACK pages into the single sample page.
-  thrust::fill(cuctx->CTP(), dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0);
+  thrust::fill(cuctx->CTP(), page_->gidx_buffer.begin(), page_->gidx_buffer.end(), 0);
   for (auto& batch : batch_iterator) {
     page_->Compact(ctx, batch.Impl(), dh::ToSpan(sample_row_index_));
   }
@@ -283,10 +283,10 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(Context const* c
   // Perform Poisson sampling in place.
   thrust::transform(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair),
                     thrust::counting_iterator<size_t>(0), dh::tbegin(gpair),
-                    PoissonSampling(dh::ToSpan(threshold_), threshold_index,
-                                    RandomWeight(common::GlobalRandom()())));
+                    PoissonSampling{dh::ToSpan(threshold_), threshold_index,
+                                    RandomWeight(common::GlobalRandom()())});
   // Count the sampled rows.
-  size_t sample_rows =
+  bst_idx_t sample_rows =
       thrust::count_if(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair), IsNonZero());
   // Compact gradient pairs.
   gpair_.resize(sample_rows);
@@ -302,10 +302,10 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(Context const* c
   auto first_page = (*batch_iterator.begin()).Impl();
   // Create a new ELLPACK page with empty rows.
   page_.reset();  // Release the device memory first before reallocating
-  page_.reset(new EllpackPageImpl(ctx->Device(), first_page->CutsShared(), dmat->IsDense(),
-                                  first_page->row_stride, sample_rows));
+  page_.reset(new EllpackPageImpl{ctx, first_page->CutsShared(), dmat->IsDense(),
+                                  first_page->row_stride, sample_rows});
   // Compact the ELLPACK pages into the single sample page.
-  thrust::fill(cuctx->CTP(), dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0);
+  thrust::fill(cuctx->CTP(), page_->gidx_buffer.begin(), page_->gidx_buffer.end(), 0);
   for (auto& batch : batch_iterator) {
     page_->Compact(ctx, batch.Impl(), dh::ToSpan(sample_row_index_));
   }
diff --git a/src/tree/gpu_hist/gradient_based_sampler.cuh b/src/tree/gpu_hist/gradient_based_sampler.cuh
index 5a57e2ae8ef8..79008b1ae572 100644
--- a/src/tree/gpu_hist/gradient_based_sampler.cuh
+++ b/src/tree/gpu_hist/gradient_based_sampler.cuh
@@ -1,20 +1,19 @@
 /**
- * Copyright 2019-2023, XGBoost Contributors
+ * Copyright 2019-2024, XGBoost Contributors
  */
 #pragma once
-#include <xgboost/base.h>
-#include <xgboost/data.h>
-#include <xgboost/span.h>
+#include <cstddef>  // for size_t
 
-#include "../../common/device_helpers.cuh"
-#include "../../data/ellpack_page.cuh"
-
-namespace xgboost {
-namespace tree {
+#include "../../common/device_vector.cuh"  // for device_vector, caching_device_vector
+#include "../../data/ellpack_page.cuh"     // for EllpackPageImpl
+#include "xgboost/base.h"                  // for GradientPair
+#include "xgboost/data.h"                  // for BatchParam
+#include "xgboost/span.h"                  // for Span
 
+namespace xgboost::tree {
 struct GradientBasedSample {
   /*!\brief Number of sampled rows. */
-  size_t sample_rows;
+  std::size_t sample_rows;
   /*!\brief Sampled rows in ELLPACK format. */
   EllpackPageImpl const* page;
   /*!\brief Gradient pairs for the sampled rows. */
@@ -137,5 +136,4 @@ class GradientBasedSampler {
   common::Monitor monitor_;
   std::unique_ptr<SamplingStrategy> strategy_;
 };
-};  // namespace tree
-};  // namespace xgboost
+};  // namespace xgboost::tree
diff --git a/src/tree/gpu_hist/histogram.cuh b/src/tree/gpu_hist/histogram.cuh
index 862821b00b63..87c60a8bfdbc 100644
--- a/src/tree/gpu_hist/histogram.cuh
+++ b/src/tree/gpu_hist/histogram.cuh
@@ -5,12 +5,14 @@
 #define HISTOGRAM_CUH_
 #include <memory>  // for unique_ptr
 
-#include "../../common/cuda_context.cuh"  // for CUDAContext
-#include "../../data/ellpack_page.cuh"    // for EllpackDeviceAccessor
-#include "feature_groups.cuh"             // for FeatureGroupsAccessor
-#include "xgboost/base.h"                 // for GradientPair, GradientPairInt64
-#include "xgboost/context.h"              // for Context
-#include "xgboost/span.h"                 // for Span
+#include "../../common/cuda_context.cuh"    // for CUDAContext
+#include "../../common/device_helpers.cuh"  // for LaunchN
+#include "../../common/device_vector.cuh"   // for device_vector
+#include "../../data/ellpack_page.cuh"      // for EllpackDeviceAccessor
+#include "feature_groups.cuh"               // for FeatureGroupsAccessor
+#include "xgboost/base.h"                   // for GradientPair, GradientPairInt64
+#include "xgboost/context.h"                // for Context
+#include "xgboost/span.h"                   // for Span
 
 namespace xgboost::tree {
 /**
@@ -60,6 +62,111 @@ class GradientQuantiser {
   }
 };
 
+/**
+ * @brief Data storage for node histograms on device. Automatically expands.
+ *
+ * @tparam kStopGrowingSize  Do not grow beyond this size
+ *
+ * @author  Rory
+ * @date    28/07/2018
+ */
+template <size_t kStopGrowingSize = 1 << 28>
+class DeviceHistogramStorage {
+ private:
+  using GradientSumT = GradientPairInt64;
+  /** @brief Map nidx to starting index of its histogram. */
+  std::map<int, size_t> nidx_map_;
+  // Large buffer of zeroed memory, caches histograms
+  dh::device_vector<typename GradientSumT::ValueT> data_;
+  // If we run out of storage allocate one histogram at a time
+  // in overflow. Not cached, overwritten when a new histogram
+  // is requested
+  dh::device_vector<typename GradientSumT::ValueT> overflow_;
+  std::map<int, size_t> overflow_nidx_map_;
+  int n_bins_;
+  DeviceOrd device_id_;
+  static constexpr size_t kNumItemsInGradientSum =
+      sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT);
+  static_assert(kNumItemsInGradientSum == 2, "Number of items in gradient type should be 2.");
+
+ public:
+  // Start with about 16mb
+  DeviceHistogramStorage() { data_.reserve(1 << 22); }
+  void Init(DeviceOrd device_id, int n_bins) {
+    this->n_bins_ = n_bins;
+    this->device_id_ = device_id;
+  }
+
+  void Reset(Context const* ctx) {
+    auto d_data = data_.data().get();
+    dh::LaunchN(data_.size(), ctx->CUDACtx()->Stream(),
+                [=] __device__(size_t idx) { d_data[idx] = 0.0f; });
+    nidx_map_.clear();
+    overflow_nidx_map_.clear();
+  }
+  [[nodiscard]] bool HistogramExists(int nidx) const {
+    return nidx_map_.find(nidx) != nidx_map_.cend() ||
+           overflow_nidx_map_.find(nidx) != overflow_nidx_map_.cend();
+  }
+  [[nodiscard]] int Bins() const { return n_bins_; }
+  [[nodiscard]] size_t HistogramSize() const { return n_bins_ * kNumItemsInGradientSum; }
+  dh::device_vector<typename GradientSumT::ValueT>& Data() { return data_; }
+
+  void AllocateHistograms(Context const* ctx, const std::vector<int>& new_nidxs) {
+    for (int nidx : new_nidxs) {
+      CHECK(!HistogramExists(nidx));
+    }
+    // Number of items currently used in data
+    const size_t used_size = nidx_map_.size() * HistogramSize();
+    const size_t new_used_size = used_size + HistogramSize() * new_nidxs.size();
+    if (used_size >= kStopGrowingSize) {
+      // Use overflow
+      // Delete previous entries
+      overflow_nidx_map_.clear();
+      overflow_.resize(HistogramSize() * new_nidxs.size());
+      // Zero memory
+      auto d_data = overflow_.data().get();
+      dh::LaunchN(overflow_.size(), ctx->CUDACtx()->Stream(),
+                  [=] __device__(size_t idx) { d_data[idx] = 0.0; });
+      // Append new histograms
+      for (int nidx : new_nidxs) {
+        overflow_nidx_map_[nidx] = overflow_nidx_map_.size() * HistogramSize();
+      }
+    } else {
+      CHECK_GE(data_.size(), used_size);
+      // Expand if necessary
+      if (data_.size() < new_used_size) {
+        data_.resize(std::max(data_.size() * 2, new_used_size));
+      }
+      // Append new histograms
+      for (int nidx : new_nidxs) {
+        nidx_map_[nidx] = nidx_map_.size() * HistogramSize();
+      }
+    }
+
+    CHECK_GE(data_.size(), nidx_map_.size() * HistogramSize());
+  }
+
+  /**
+   * \summary   Return pointer to histogram memory for a given node.
+   * \param nidx    Tree node index.
+   * \return    hist pointer.
+   */
+  common::Span<GradientSumT> GetNodeHistogram(int nidx) {
+    CHECK(this->HistogramExists(nidx));
+
+    if (nidx_map_.find(nidx) != nidx_map_.cend()) {
+      // Fetch from normal cache
+      auto ptr = data_.data().get() + nidx_map_.at(nidx);
+      return {reinterpret_cast<GradientSumT*>(ptr), static_cast<std::size_t>(n_bins_)};
+    } else {
+      // Fetch from overflow
+      auto ptr = overflow_.data().get() + overflow_nidx_map_.at(nidx);
+      return {reinterpret_cast<GradientSumT*>(ptr), static_cast<std::size_t>(n_bins_)};
+    }
+  }
+};
+
 class DeviceHistogramBuilderImpl;
 
 class DeviceHistogramBuilder {
diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu
index 19957857218d..83f84ec1f4a5 100644
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -16,7 +16,8 @@
 #include "../collective/broadcast.h"
 #include "../common/bitfield.h"
 #include "../common/categorical.h"
-#include "../common/cuda_context.cuh"  // CUDAContext
+#include "../common/cuda_context.cuh"  // for CUDAContext
+#include "../common/cuda_rt_utils.h"   // for CheckComputeCapability
 #include "../common/device_helpers.cuh"
 #include "../common/hist_util.h"
 #include "../common/random.h"  // for ColumnSampler, GlobalRandom
@@ -48,113 +49,6 @@ namespace xgboost::tree {
 DMLC_REGISTRY_FILE_TAG(updater_gpu_hist);
 #endif  // !defined(GTEST_TEST)
 
-/**
- * \struct  DeviceHistogramStorage
- *
- * \summary Data storage for node histograms on device. Automatically expands.
- *
- * \tparam GradientSumT      histogram entry type.
- * \tparam kStopGrowingSize  Do not grow beyond this size
- *
- * \author  Rory
- * \date    28/07/2018
- */
-template <size_t kStopGrowingSize = 1 << 28>
-class DeviceHistogramStorage {
- private:
-  using GradientSumT = GradientPairInt64;
-  /*! \brief Map nidx to starting index of its histogram. */
-  std::map<int, size_t> nidx_map_;
-  // Large buffer of zeroed memory, caches histograms
-  dh::device_vector<typename GradientSumT::ValueT> data_;
-  // If we run out of storage allocate one histogram at a time
-  // in overflow. Not cached, overwritten when a new histogram
-  // is requested
-  dh::device_vector<typename GradientSumT::ValueT> overflow_;
-  std::map<int, size_t> overflow_nidx_map_;
-  int n_bins_;
-  DeviceOrd device_id_;
-  static constexpr size_t kNumItemsInGradientSum =
-      sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT);
-  static_assert(kNumItemsInGradientSum == 2, "Number of items in gradient type should be 2.");
-
- public:
-  // Start with about 16mb
-  DeviceHistogramStorage() { data_.reserve(1 << 22); }
-  void Init(DeviceOrd device_id, int n_bins) {
-    this->n_bins_ = n_bins;
-    this->device_id_ = device_id;
-  }
-
-  void Reset() {
-    auto d_data = data_.data().get();
-    dh::LaunchN(data_.size(), [=] __device__(size_t idx) { d_data[idx] = 0.0f; });
-    nidx_map_.clear();
-    overflow_nidx_map_.clear();
-  }
-  [[nodiscard]] bool HistogramExists(int nidx) const {
-    return nidx_map_.find(nidx) != nidx_map_.cend() ||
-           overflow_nidx_map_.find(nidx) != overflow_nidx_map_.cend();
-  }
-  [[nodiscard]] int Bins() const { return n_bins_; }
-  [[nodiscard]] size_t HistogramSize() const { return n_bins_ * kNumItemsInGradientSum; }
-  dh::device_vector<typename GradientSumT::ValueT>& Data() { return data_; }
-
-  void AllocateHistograms(const std::vector<int>& new_nidxs) {
-    for (int nidx : new_nidxs) {
-      CHECK(!HistogramExists(nidx));
-    }
-    // Number of items currently used in data
-    const size_t used_size = nidx_map_.size() * HistogramSize();
-    const size_t new_used_size = used_size + HistogramSize() * new_nidxs.size();
-    if (used_size >= kStopGrowingSize) {
-      // Use overflow
-      // Delete previous entries
-      overflow_nidx_map_.clear();
-      overflow_.resize(HistogramSize() * new_nidxs.size());
-      // Zero memory
-      auto d_data = overflow_.data().get();
-      dh::LaunchN(overflow_.size(),
-                  [=] __device__(size_t idx) { d_data[idx] = 0.0; });
-      // Append new histograms
-      for (int nidx : new_nidxs) {
-        overflow_nidx_map_[nidx] = overflow_nidx_map_.size() * HistogramSize();
-      }
-    } else {
-      CHECK_GE(data_.size(), used_size);
-      // Expand if necessary
-      if (data_.size() < new_used_size) {
-        data_.resize(std::max(data_.size() * 2, new_used_size));
-      }
-      // Append new histograms
-      for (int nidx : new_nidxs) {
-        nidx_map_[nidx] = nidx_map_.size() * HistogramSize();
-      }
-    }
-
-    CHECK_GE(data_.size(), nidx_map_.size() * HistogramSize());
-  }
-
-  /**
-   * \summary   Return pointer to histogram memory for a given node.
-   * \param nidx    Tree node index.
-   * \return    hist pointer.
-   */
-  common::Span<GradientSumT> GetNodeHistogram(int nidx) {
-    CHECK(this->HistogramExists(nidx));
-
-    if (nidx_map_.find(nidx) != nidx_map_.cend()) {
-      // Fetch from normal cache
-      auto ptr = data_.data().get() + nidx_map_.at(nidx);
-      return {reinterpret_cast<GradientSumT*>(ptr), static_cast<std::size_t>(n_bins_)};
-    } else {
-      // Fetch from overflow
-      auto ptr = overflow_.data().get() + overflow_nidx_map_.at(nidx);
-      return {reinterpret_cast<GradientSumT*>(ptr), static_cast<std::size_t>(n_bins_)};
-    }
-  }
-};
-
 // Manage memory for a single GPU
 struct GPUHistMakerDevice {
  private:
@@ -257,7 +151,7 @@ struct GPUHistMakerDevice {
 
     // Init histogram
     hist.Init(ctx_->Device(), page->Cuts().TotalBins());
-    hist.Reset();
+    hist.Reset(ctx_);
 
     this->InitFeatureGroupsOnce();
 
@@ -656,7 +550,7 @@ struct GPUHistMakerDevice {
     all_new.insert(all_new.end(), subtraction_nidx.begin(), subtraction_nidx.end());
     // Allocate the histograms
     // Guaranteed contiguous memory
-    hist.AllocateHistograms(all_new);
+    hist.AllocateHistograms(ctx_, all_new);
 
     for (auto nidx : hist_nidx) {
       this->BuildHist(nidx);
@@ -747,7 +641,7 @@ struct GPUHistMakerDevice {
         ctx_, info_, linalg::MakeVec(reinterpret_cast<ReduceT*>(&root_sum_quantised), 2));
     collective::SafeColl(rc);
 
-    hist.AllocateHistograms({kRootNIdx});
+    hist.AllocateHistograms(ctx_, {kRootNIdx});
     this->BuildHist(kRootNIdx);
     this->AllReduceHist(kRootNIdx, 1);
 
@@ -826,7 +720,7 @@ class GPUHistMaker : public TreeUpdater {
     // Used in test to count how many configurations are performed
     LOG(DEBUG) << "[GPU Hist]: Configure";
     hist_maker_param_.UpdateAllowUnknown(args);
-    dh::CheckComputeCapability();
+    common::CheckComputeCapability();
     initialised_ = false;
 
     monitor_.Init("updater_gpu_hist");
@@ -852,17 +746,13 @@ class GPUHistMaker : public TreeUpdater {
     CHECK_EQ(gpair->Shape(1), 1) << MTNotImplemented();
     auto gpair_hdv = gpair->Data();
     // build tree
-    try {
-      std::size_t t_idx{0};
-      for (xgboost::RegTree* tree : trees) {
-        this->UpdateTree(param, gpair_hdv, dmat, tree, &out_position[t_idx]);
-        this->hist_maker_param_.CheckTreesSynchronized(ctx_, tree);
-        ++t_idx;
-      }
-      dh::safe_cuda(cudaGetLastError());
-    } catch (const std::exception& e) {
-      LOG(FATAL) << "Exception in gpu_hist: " << e.what() << std::endl;
+    std::size_t t_idx{0};
+    for (xgboost::RegTree* tree : trees) {
+      this->UpdateTree(param, gpair_hdv, dmat, tree, &out_position[t_idx]);
+      this->hist_maker_param_.CheckTreesSynchronized(ctx_, tree);
+      ++t_idx;
     }
+    dh::safe_cuda(cudaGetLastError());
     monitor_.Stop("Update");
   }
 
@@ -958,7 +848,7 @@ class GPUGlobalApproxMaker : public TreeUpdater {
     if (hist_maker_param_.max_cached_hist_node != HistMakerTrainParam::DefaultNodes()) {
       LOG(WARNING) << "The `max_cached_hist_node` is ignored in GPU.";
     }
-    dh::CheckComputeCapability();
+    common::CheckComputeCapability();
     initialised_ = false;
 
     monitor_.Init(this->Name());
diff --git a/tests/buildkite/build-manylinux2014-aarch64.sh b/tests/buildkite/build-manylinux2014-aarch64.sh
deleted file mode 100644
index 802db3f66aaf..000000000000
--- a/tests/buildkite/build-manylinux2014-aarch64.sh
+++ /dev/null
@@ -1,33 +0,0 @@
-#!/bin/bash
-
-set -euo pipefail
-
-source tests/buildkite/conftest.sh
-
-WHEEL_TAG=manylinux2014_aarch64
-command_wrapper="tests/ci_build/ci_build.sh manylinux2014_aarch64"
-python_bin="/opt/python/cp310-cp310/bin/python"
-
-echo "--- Build binary wheel for ${WHEEL_TAG}"
-# Patch to add warning about manylinux2014 variant
-patch -p0 < tests/buildkite/manylinux2014_warning.patch
-$command_wrapper bash -c \
-  "cd python-package && ${python_bin} -m pip wheel --no-deps -vvv . --wheel-dir dist/"
-git checkout python-package/xgboost/core.py  # discard the patch
-
-$command_wrapper auditwheel repair --plat ${WHEEL_TAG} python-package/dist/*.whl
-$command_wrapper ${python_bin} tests/ci_build/rename_whl.py  \
-  --wheel-path wheelhouse/*.whl  \
-  --commit-hash ${BUILDKITE_COMMIT}  \
-  --platform-tag ${WHEEL_TAG}
-rm -rf python-package/dist/
-mkdir python-package/dist/
-mv -v wheelhouse/*.whl python-package/dist/
-
-echo "--- Upload Python wheel"
-buildkite-agent artifact upload python-package/dist/*.whl
-if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]]
-then
-  aws s3 cp python-package/dist/*.whl s3://xgboost-nightly-builds/${BRANCH_NAME}/ \
-    --acl public-read --no-progress
-fi
diff --git a/tests/buildkite/build-manylinux2014-x86_64.sh b/tests/buildkite/build-manylinux2014-x86_64.sh
deleted file mode 100644
index b00616315b8d..000000000000
--- a/tests/buildkite/build-manylinux2014-x86_64.sh
+++ /dev/null
@@ -1,33 +0,0 @@
-#!/bin/bash
-
-set -euo pipefail
-
-source tests/buildkite/conftest.sh
-
-WHEEL_TAG=manylinux2014_x86_64
-command_wrapper="tests/ci_build/ci_build.sh manylinux2014_x86_64"
-python_bin="/opt/python/cp310-cp310/bin/python"
-
-echo "--- Build binary wheel for ${WHEEL_TAG}"
-# Patch to add warning about manylinux2014 variant
-patch -p0 < tests/buildkite/manylinux2014_warning.patch
-$command_wrapper bash -c \
-  "cd python-package && ${python_bin} -m pip wheel --no-deps -vvv . --wheel-dir dist/"
-git checkout python-package/xgboost/core.py  # discard the patch
-
-$command_wrapper auditwheel repair --plat ${WHEEL_TAG} python-package/dist/*.whl
-$command_wrapper ${python_bin} tests/ci_build/rename_whl.py  \
-  --wheel-path wheelhouse/*.whl  \
-  --commit-hash ${BUILDKITE_COMMIT}  \
-  --platform-tag ${WHEEL_TAG}
-rm -rf python-package/dist/
-mkdir python-package/dist/
-mv -v wheelhouse/*.whl python-package/dist/
-
-echo "--- Upload Python wheel"
-buildkite-agent artifact upload python-package/dist/*.whl
-if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]]
-then
-  aws s3 cp python-package/dist/*.whl s3://xgboost-nightly-builds/${BRANCH_NAME}/ \
-    --acl public-read --no-progress
-fi
diff --git a/tests/buildkite/build-manylinux2014.sh b/tests/buildkite/build-manylinux2014.sh
new file mode 100755
index 000000000000..426d32b5c361
--- /dev/null
+++ b/tests/buildkite/build-manylinux2014.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+
+set -euo pipefail
+
+if [ $# -ne 1 ]; then
+  echo "Usage: $0 {x86_64,aarch64}"
+  exit 1
+fi
+
+arch=$1
+
+source tests/buildkite/conftest.sh
+
+WHEEL_TAG="manylinux2014_${arch}"
+command_wrapper="tests/ci_build/ci_build.sh ${WHEEL_TAG}"
+python_bin="/opt/python/cp310-cp310/bin/python"
+
+echo "--- Build binary wheel for ${WHEEL_TAG}"
+# Patch to add warning about manylinux2014 variant
+patch -p0 < tests/buildkite/remove_nccl_dep.patch
+patch -p0 < tests/buildkite/manylinux2014_warning.patch
+$command_wrapper bash -c \
+  "cd python-package && ${python_bin} -m pip wheel --no-deps -v . --wheel-dir dist/"
+git checkout python-package/pyproject.toml python-package/xgboost/core.py  # discard the patch
+
+$command_wrapper auditwheel repair --plat ${WHEEL_TAG} python-package/dist/*.whl
+$command_wrapper ${python_bin} tests/ci_build/rename_whl.py  \
+  --wheel-path wheelhouse/*.whl  \
+  --commit-hash ${BUILDKITE_COMMIT}  \
+  --platform-tag ${WHEEL_TAG}
+rm -rf python-package/dist/
+mkdir python-package/dist/
+mv -v wheelhouse/*.whl python-package/dist/
+
+echo "--- Build binary wheel for ${WHEEL_TAG} (CPU only)"
+# Patch to rename pkg to xgboost-cpu
+patch -p0 < tests/buildkite/remove_nccl_dep.patch
+patch -p0 < tests/buildkite/cpu_only_pypkg.patch
+$command_wrapper bash -c \
+  "cd python-package && ${python_bin} -m pip wheel --no-deps -v . --wheel-dir dist/"
+git checkout python-package/pyproject.toml  # discard the patch
+
+$command_wrapper auditwheel repair --plat ${WHEEL_TAG} python-package/dist/xgboost_cpu-*.whl
+$command_wrapper ${python_bin} tests/ci_build/rename_whl.py  \
+  --wheel-path wheelhouse/xgboost_cpu-*.whl  \
+  --commit-hash ${BUILDKITE_COMMIT}  \
+  --platform-tag ${WHEEL_TAG}
+rm -v python-package/dist/xgboost_cpu-*.whl
+mv -v wheelhouse/xgboost_cpu-*.whl python-package/dist/
+
+echo "--- Upload Python wheel"
+for wheel in python-package/dist/*.whl
+do
+  buildkite-agent artifact upload "${wheel}"
+done
+if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]]
+then
+  for wheel in python-package/dist/*.whl
+  do
+    aws s3 cp "${wheel}" s3://xgboost-nightly-builds/${BRANCH_NAME}/ \
+      --acl public-read --no-progress
+  done
+fi
diff --git a/tests/buildkite/cpu_only_pypkg.patch b/tests/buildkite/cpu_only_pypkg.patch
new file mode 100644
index 000000000000..765ac5c098d0
--- /dev/null
+++ b/tests/buildkite/cpu_only_pypkg.patch
@@ -0,0 +1,55 @@
+diff --git python-package/README.rst python-package/README.rst
+index 1fc0bb5a0..f1c68470b 100644
+--- python-package/README.rst
++++ python-package/README.rst
+@@ -1,20 +1,15 @@
+-======================
+-XGBoost Python Package
+-======================
++=================================
++XGBoost Python Package (CPU only)
++=================================
+ 
+ |PyPI version|
+ 
+-Installation
+-============
++The ``xgboost-cpu`` package provides for a minimal installation, with no support for the GPU algorithms
++or federated learning. It is provided to allow XGBoost to be installed in a space-constrained
++environments.
+ 
+-From `PyPI <https://pypi.python.org/pypi/xgboost>`_
+----------------------------------------------------
++Note. ``xgboost-cpu`` package is only provided for x86_64 (amd64) Linux and Windows platforms.
++For other platforms, please install ``xgboost`` from https://pypi.org/project/xgboost/.
+ 
+-For a stable version, install using ``pip``::
+-
+-    pip install xgboost
+-
+-.. |PyPI version| image:: https://badge.fury.io/py/xgboost.svg
+-   :target: http://badge.fury.io/py/xgboost
+-
+-For building from source, see `build <https://xgboost.readthedocs.io/en/latest/build.html>`_.
++Note. ``xgboost-cpu`` does not provide an sdist (source distribution). You may install sdist
++from https://pypi.org/project/xgboost/.
+diff --git python-package/pyproject.toml python-package/pyproject.toml
+index 46c1451c2..c5dc908d9 100644
+--- python-package/pyproject.toml
++++ python-package/pyproject.toml
+@@ -6,7 +6,7 @@ backend-path = ["."]
+ build-backend = "packager.pep517"
+ 
+ [project]
+-name = "xgboost"
++name = "xgboost-cpu"
+ description = "XGBoost Python Package"
+ readme = { file = "README.rst", content-type = "text/x-rst" }
+ authors = [
+@@ -82,3 +82,6 @@ class-attribute-naming-style = "snake_case"
+ 
+ # Allow single-letter variables
+ variable-rgx = "[a-zA-Z_][a-z0-9_]{0,30}$"
++
++[tool.hatch.build.targets.wheel]
++packages = ["xgboost/"]
diff --git a/tests/buildkite/manylinux2014_warning.patch b/tests/buildkite/manylinux2014_warning.patch
index 692a92672d2f..679205988b7a 100644
--- a/tests/buildkite/manylinux2014_warning.patch
+++ b/tests/buildkite/manylinux2014_warning.patch
@@ -1,17 +1,3 @@
-diff --git python-package/pyproject.toml python-package/pyproject.toml
-index a273d8c13..dee49686a 100644
---- python-package/pyproject.toml
-+++ python-package/pyproject.toml
-@@ -30,8 +30,7 @@ classifiers = [
- ]
- dependencies = [
-     "numpy",
--    "scipy",
--    "nvidia-nccl-cu12 ; platform_system == 'Linux' and platform_machine != 'aarch64'"
-+    "scipy"
- ]
- 
- [project.urls]
 diff --git python-package/xgboost/core.py python-package/xgboost/core.py
 index e8bc735e6..030972ef2 100644
 --- python-package/xgboost/core.py
diff --git a/tests/buildkite/pipeline.yml b/tests/buildkite/pipeline.yml
index acdb71dba529..ee9637b8bd25 100644
--- a/tests/buildkite/pipeline.yml
+++ b/tests/buildkite/pipeline.yml
@@ -72,12 +72,12 @@ steps:
     agents:
       queue: linux-amd64-cpu
   - label: ":console: Build manylinux2014_x86_64 wheel"
-    command: "tests/buildkite/build-manylinux2014-x86_64.sh"
+    command: "tests/buildkite/build-manylinux2014.sh x86_64"
     key: build-manylinux2014-x86_64
     agents:
       queue: linux-amd64-cpu
   - label: ":console: Build manylinux2014_aarch64 wheel"
-    command: "tests/buildkite/build-manylinux2014-aarch64.sh"
+    command: "tests/buildkite/build-manylinux2014.sh aarch64"
     key: build-manylinux2014-aarch64
     agents:
       queue: linux-arm64-cpu
diff --git a/tests/buildkite/remove_nccl_dep.patch b/tests/buildkite/remove_nccl_dep.patch
new file mode 100644
index 000000000000..a2a4a5c88289
--- /dev/null
+++ b/tests/buildkite/remove_nccl_dep.patch
@@ -0,0 +1,14 @@
+diff --git python-package/pyproject.toml python-package/pyproject.toml
+index 8835def25..46c1451c2 100644
+--- python-package/pyproject.toml
++++ python-package/pyproject.toml
+@@ -30,8 +30,7 @@ classifiers = [
+ ]
+ dependencies = [
+     "numpy",
+-    "scipy",
+-    "nvidia-nccl-cu12 ; platform_system == 'Linux' and platform_machine != 'aarch64'"
++    "scipy"
+ ]
+ 
+ [project.urls]
diff --git a/tests/ci_build/Dockerfile.i386 b/tests/ci_build/Dockerfile.i386
index d7c133e2aee4..a582a54020bb 100644
--- a/tests/ci_build/Dockerfile.i386
+++ b/tests/ci_build/Dockerfile.i386
@@ -1,7 +1,7 @@
 FROM i386/debian:sid
 
 ENV DEBIAN_FRONTEND noninteractive
-SHELL ["/bin/bash", "-c"]   # Use Bash as shell
+SHELL ["/bin/bash", "-c"]
 
 RUN \
     apt-get update && \
diff --git a/tests/ci_build/conda_env/macos_cpu_test.yml b/tests/ci_build/conda_env/macos_cpu_test.yml
index ce9ca4b1ba7c..5bca323af5f4 100644
--- a/tests/ci_build/conda_env/macos_cpu_test.yml
+++ b/tests/ci_build/conda_env/macos_cpu_test.yml
@@ -37,5 +37,5 @@ dependencies:
 - pyspark>=3.4.0
 - cloudpickle
 - pip:
+  - setuptools
   - sphinx_rtd_theme
-  - py-ubjson
diff --git a/tests/cpp/collective/test_worker.h b/tests/cpp/collective/test_worker.h
index 66c6ce9bf24e..2430911904e8 100644
--- a/tests/cpp/collective/test_worker.h
+++ b/tests/cpp/collective/test_worker.h
@@ -15,7 +15,7 @@
 #include "../../../src/collective/comm.h"
 #include "../../../src/collective/communicator-inl.h"  // for Init, Finalize
 #include "../../../src/collective/tracker.h"           // for GetHostAddress
-#include "../../../src/common/common.h"                // for AllVisibleGPUs
+#include "../../../src/common/cuda_rt_utils.h"         // for AllVisibleGPUs
 #include "../helpers.h"                                // for FileExists
 
 #if defined(XGBOOST_USE_FEDERATED)
diff --git a/tests/cpp/common/test_device_vector.cu b/tests/cpp/common/test_device_vector.cu
index 95da4ef3f167..c6a8c0ab95ce 100644
--- a/tests/cpp/common/test_device_vector.cu
+++ b/tests/cpp/common/test_device_vector.cu
@@ -12,7 +12,7 @@ TEST(DeviceUVector, Basic) {
   std::int32_t verbosity{3};
   std::swap(verbosity, xgboost::GlobalConfigThreadLocalStore::Get()->verbosity);
   DeviceUVector<float> uvec;
-  uvec.Resize(12);
+  uvec.resize(12);
   auto peak = GlobalMemoryLogger().PeakMemory();
   auto n_bytes = sizeof(decltype(uvec)::value_type) * uvec.size();
   ASSERT_EQ(peak, n_bytes);
diff --git a/tests/cpp/common/test_host_device_vector.cu b/tests/cpp/common/test_host_device_vector.cu
index a0aa5fa11fce..c730390c37d8 100644
--- a/tests/cpp/common/test_host_device_vector.cu
+++ b/tests/cpp/common/test_host_device_vector.cu
@@ -4,9 +4,10 @@
 #include <gtest/gtest.h>
 #include <thrust/equal.h>
 #include <thrust/iterator/counting_iterator.h>
+#include <xgboost/host_device_vector.h>
 
+#include "../../../src/common/cuda_rt_utils.h"  // for SetDevice
 #include "../../../src/common/device_helpers.cuh"
-#include <xgboost/host_device_vector.h>
 
 namespace xgboost::common {
 namespace {
diff --git a/tests/cpp/common/test_ref_resource_view.cc b/tests/cpp/common/test_ref_resource_view.cc
index 9ae55fdec7f4..b201f69139d3 100644
--- a/tests/cpp/common/test_ref_resource_view.cc
+++ b/tests/cpp/common/test_ref_resource_view.cc
@@ -1,5 +1,5 @@
 /**
- * Copyright 2023, XGBoost Contributors
+ * Copyright 2023-2024, XGBoost Contributors
  */
 #include <gtest/gtest.h>
 
@@ -16,17 +16,16 @@ TEST(RefResourceView, Basic) {
   std::size_t n_bytes = 1024;
   auto mem = std::make_shared<MallocResource>(n_bytes);
   {
-    RefResourceView view{reinterpret_cast<float*>(mem->Data()), mem->Size() / sizeof(float), mem};
+    RefResourceView view{static_cast<float*>(mem->Data()), mem->Size() / sizeof(float), mem};
 
-    RefResourceView kview{reinterpret_cast<float const*>(mem->Data()), mem->Size() / sizeof(float),
-                          mem};
+    RefResourceView kview{static_cast<float const*>(mem->Data()), mem->Size() / sizeof(float), mem};
     ASSERT_EQ(mem.use_count(), 3);
     ASSERT_EQ(view.size(), n_bytes / sizeof(1024));
     ASSERT_EQ(kview.size(), n_bytes / sizeof(1024));
   }
   {
-    RefResourceView view{reinterpret_cast<float*>(mem->Data()), mem->Size() / sizeof(float), mem,
-                         1.5f};
+    RefResourceView view{static_cast<float*>(mem->Data()), mem->Size() / sizeof(float), mem};
+    std::fill_n(static_cast<float*>(mem->Data()), mem->Size() / sizeof(float), 1.5f);
     for (auto v : view) {
       ASSERT_EQ(v, 1.5f);
     }
diff --git a/tests/cpp/data/test_ellpack_page.cu b/tests/cpp/data/test_ellpack_page.cu
index 9d9687dda81b..8aab51b7202e 100644
--- a/tests/cpp/data/test_ellpack_page.cu
+++ b/tests/cpp/data/test_ellpack_page.cu
@@ -27,15 +27,15 @@ TEST(EllpackPage, EmptyDMatrix) {
   auto impl = page.Impl();
   ASSERT_EQ(impl->row_stride, 0);
   ASSERT_EQ(impl->Cuts().TotalBins(), 0);
-  ASSERT_EQ(impl->gidx_buffer.Size(), 4);
+  ASSERT_EQ(impl->gidx_buffer.size(), 4);
 }
 
 TEST(EllpackPage, BuildGidxDense) {
   int constexpr kNRows = 16, kNCols = 8;
-  auto page = BuildEllpackPage(kNRows, kNCols);
-
-  std::vector<common::CompressedByteT> h_gidx_buffer(page->gidx_buffer.HostVector());
-  common::CompressedIterator<uint32_t> gidx(h_gidx_buffer.data(), page->NumSymbols());
+  auto ctx = MakeCUDACtx(0);
+  auto page = BuildEllpackPage(&ctx, kNRows, kNCols);
+  std::vector<common::CompressedByteT> h_gidx_buffer;
+  auto h_accessor = page->GetHostAccessor(&ctx, &h_gidx_buffer);
 
   ASSERT_EQ(page->row_stride, kNCols);
 
@@ -58,16 +58,17 @@ TEST(EllpackPage, BuildGidxDense) {
     1, 4, 7, 10, 14, 16, 19, 21,
   };
   for (size_t i = 0; i < kNRows * kNCols; ++i) {
-    ASSERT_EQ(solution[i], gidx[i]);
+    ASSERT_EQ(solution[i], h_accessor.gidx_iter[i]);
   }
 }
 
 TEST(EllpackPage, BuildGidxSparse) {
   int constexpr kNRows = 16, kNCols = 8;
-  auto page = BuildEllpackPage(kNRows, kNCols, 0.9f);
+  auto ctx = MakeCUDACtx(0);
+  auto page = BuildEllpackPage(&ctx, kNRows, kNCols, 0.9f);
 
-  std::vector<common::CompressedByteT> h_gidx_buffer(page->gidx_buffer.HostVector());
-  common::CompressedIterator<uint32_t> gidx(h_gidx_buffer.data(), 25);
+  std::vector<common::CompressedByteT> h_gidx_buffer;
+  auto h_accessor = page->GetHostAccessor(&ctx, &h_gidx_buffer);
 
   ASSERT_LE(page->row_stride, 3);
 
@@ -78,7 +79,7 @@ TEST(EllpackPage, BuildGidxSparse) {
     24,  7, 14, 16,  4, 24, 24, 24, 24, 24,  9, 24, 24,  1, 24, 24
   };
   for (size_t i = 0; i < kNRows * page->row_stride; ++i) {
-    ASSERT_EQ(solution[i], gidx[i]);
+    ASSERT_EQ(solution[i], h_accessor.gidx_iter[i]);
   }
 }
 
@@ -94,7 +95,7 @@ TEST(EllpackPage, FromCategoricalBasic) {
   Context ctx{MakeCUDACtx(0)};
   auto p = BatchParam{max_bins, tree::TrainParam::DftSparseThreshold()};
   auto ellpack = EllpackPage(&ctx, m.get(), p);
-  auto accessor = ellpack.Impl()->GetDeviceAccessor(FstCU());
+  auto accessor = ellpack.Impl()->GetDeviceAccessor(ctx.Device());
   ASSERT_EQ(kCats, accessor.NumBins());
 
   auto x_copy = x;
@@ -110,13 +111,11 @@ TEST(EllpackPage, FromCategoricalBasic) {
   ASSERT_EQ(h_cuts_ptr.size(), 2);
   ASSERT_EQ(h_cuts_values.size(), kCats);
 
-  std::vector<common::CompressedByteT> const &h_gidx_buffer =
-      ellpack.Impl()->gidx_buffer.HostVector();
-  auto h_gidx_iter = common::CompressedIterator<uint32_t>(
-      h_gidx_buffer.data(), accessor.NumSymbols());
+  std::vector<common::CompressedByteT> h_gidx_buffer;
+  auto h_accessor = ellpack.Impl()->GetHostAccessor(&ctx, &h_gidx_buffer);
 
   for (size_t i = 0; i < x.size(); ++i) {
-    auto bin = h_gidx_iter[i];
+    auto bin = h_accessor.gidx_iter[i];
     auto bin_value = h_cuts_values.at(bin);
     ASSERT_EQ(AsCat(x[i]), AsCat(bin_value));
   }
@@ -152,12 +151,12 @@ TEST(EllpackPage, Copy) {
   auto page = (*dmat->GetBatches<EllpackPage>(&ctx, param).begin()).Impl();
 
   // Create an empty result page.
-  EllpackPageImpl result(FstCU(), page->CutsShared(), page->is_dense, page->row_stride, kRows);
+  EllpackPageImpl result(&ctx, page->CutsShared(), page->is_dense, page->row_stride, kRows);
 
   // Copy batch pages into the result page.
   size_t offset = 0;
   for (auto& batch : dmat->GetBatches<EllpackPage>(&ctx, param)) {
-    size_t num_elements = result.Copy(FstCU(), batch.Impl(), offset);
+    size_t num_elements = result.Copy(&ctx, batch.Impl(), offset);
     offset += num_elements;
   }
 
@@ -171,11 +170,11 @@ TEST(EllpackPage, Copy) {
     EXPECT_EQ(impl->base_rowid, current_row);
 
     for (size_t i = 0; i < impl->Size(); i++) {
-      dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(FstCU()), current_row,
+      dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(ctx.Device()), current_row,
                                          row_d.data().get()));
       thrust::copy(row_d.begin(), row_d.end(), row.begin());
 
-      dh::LaunchN(kCols, ReadRowFunction(result.GetDeviceAccessor(FstCU()), current_row,
+      dh::LaunchN(kCols, ReadRowFunction(result.GetDeviceAccessor(ctx.Device()), current_row,
                                          row_result_d.data().get()));
       thrust::copy(row_result_d.begin(), row_result_d.end(), row_result.begin());
 
@@ -200,7 +199,7 @@ TEST(EllpackPage, Compact) {
   auto page = (*dmat->GetBatches<EllpackPage>(&ctx, param).begin()).Impl();
 
   // Create an empty result page.
-  EllpackPageImpl result(ctx.Device(), page->CutsShared(), page->is_dense, page->row_stride,
+  EllpackPageImpl result(&ctx, page->CutsShared(), page->is_dense, page->row_stride,
                          kCompactedRows);
 
   // Compact batch pages into the result page.
@@ -229,14 +228,13 @@ TEST(EllpackPage, Compact) {
         continue;
       }
 
-      dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(FstCU()),
-                                         current_row, row_d.data().get()));
+      dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(ctx.Device()), current_row,
+                                         row_d.data().get()));
       dh::safe_cuda(cudaDeviceSynchronize());
       thrust::copy(row_d.begin(), row_d.end(), row.begin());
 
-      dh::LaunchN(kCols,
-                  ReadRowFunction(result.GetDeviceAccessor(FstCU()), compacted_row,
-                                  row_result_d.data().get()));
+      dh::LaunchN(kCols, ReadRowFunction(result.GetDeviceAccessor(ctx.Device()), compacted_row,
+                                         row_result_d.data().get()));
       thrust::copy(row_result_d.begin(), row_result_d.end(), row_result.begin());
 
       EXPECT_EQ(row, row_result);
@@ -269,16 +267,13 @@ class EllpackPageTest : public testing::TestWithParam<float> {
       ASSERT_EQ(from_sparse_page->base_rowid, 0);
       ASSERT_EQ(from_sparse_page->base_rowid, from_ghist->base_rowid);
       ASSERT_EQ(from_sparse_page->n_rows, from_ghist->n_rows);
-      ASSERT_EQ(from_sparse_page->gidx_buffer.Size(), from_ghist->gidx_buffer.Size());
-      auto const& h_gidx_from_sparse = from_sparse_page->gidx_buffer.HostVector();
-      auto const& h_gidx_from_ghist = from_ghist->gidx_buffer.HostVector();
+      ASSERT_EQ(from_sparse_page->gidx_buffer.size(), from_ghist->gidx_buffer.size());
+      std::vector<common::CompressedByteT> h_gidx_from_sparse, h_gidx_from_ghist;
+      auto from_ghist_acc = from_ghist->GetHostAccessor(&gpu_ctx, &h_gidx_from_ghist);
+      auto from_sparse_acc = from_sparse_page->GetHostAccessor(&gpu_ctx, &h_gidx_from_sparse);
       ASSERT_EQ(from_sparse_page->NumSymbols(), from_ghist->NumSymbols());
-      common::CompressedIterator<uint32_t> from_ghist_it(h_gidx_from_ghist.data(),
-                                                         from_ghist->NumSymbols());
-      common::CompressedIterator<uint32_t> from_sparse_it(h_gidx_from_sparse.data(),
-                                                          from_sparse_page->NumSymbols());
       for (size_t i = 0; i < from_ghist->n_rows * from_ghist->row_stride; ++i) {
-        EXPECT_EQ(from_ghist_it[i], from_sparse_it[i]);
+        EXPECT_EQ(from_ghist_acc.gidx_iter[i], from_sparse_acc.gidx_iter[i]);
       }
     }
   }
diff --git a/tests/cpp/data/test_ellpack_page_raw_format.cu b/tests/cpp/data/test_ellpack_page_raw_format.cu
index d5ff721f869a..b7bb5f902c6c 100644
--- a/tests/cpp/data/test_ellpack_page_raw_format.cu
+++ b/tests/cpp/data/test_ellpack_page_raw_format.cu
@@ -14,9 +14,8 @@
 namespace xgboost::data {
 namespace {
 template <typename FormatStreamPolicy>
-void TestEllpackPageRawFormat() {
-  FormatStreamPolicy policy;
-
+void TestEllpackPageRawFormat(FormatStreamPolicy *p_policy) {
+  auto &policy = *p_policy;
   Context ctx{MakeCUDACtx(0)};
   auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()};
 
@@ -55,16 +54,30 @@ void TestEllpackPageRawFormat() {
     ASSERT_EQ(loaded->Cuts().Values(), orig->Cuts().Values());
     ASSERT_EQ(loaded->base_rowid, orig->base_rowid);
     ASSERT_EQ(loaded->row_stride, orig->row_stride);
-    ASSERT_EQ(loaded->gidx_buffer.HostVector(), orig->gidx_buffer.HostVector());
+    std::vector<common::CompressedByteT> h_loaded, h_orig;
+    [[maybe_unused]] auto h_loaded_acc = loaded->GetHostAccessor(&ctx, &h_loaded);
+    [[maybe_unused]] auto h_orig_acc = orig->GetHostAccessor(&ctx, &h_orig);
+    ASSERT_EQ(h_loaded, h_orig);
   }
 }
 }  // anonymous namespace
 
 TEST(EllpackPageRawFormat, DiskIO) {
-  TestEllpackPageRawFormat<DefaultFormatStreamPolicy<EllpackPage, EllpackFormatPolicy>>();
+  EllpackMmapStreamPolicy<EllpackPage, EllpackFormatPolicy> policy{false};
+  TestEllpackPageRawFormat(&policy);
+}
+
+TEST(EllpackPageRawFormat, DiskIOHmm) {
+  if (common::SupportsPageableMem()) {
+    EllpackMmapStreamPolicy<EllpackPage, EllpackFormatPolicy> policy{true};
+    TestEllpackPageRawFormat(&policy);
+  } else {
+    GTEST_SKIP_("HMM is not supported.");
+  }
 }
 
 TEST(EllpackPageRawFormat, HostIO) {
-  TestEllpackPageRawFormat<EllpackFormatStreamPolicy<EllpackPage, EllpackFormatPolicy>>();
+  EllpackCacheStreamPolicy<EllpackPage, EllpackFormatPolicy> policy;
+  TestEllpackPageRawFormat(&policy);
 }
 }  // namespace xgboost::data
diff --git a/tests/cpp/data/test_iterative_dmatrix.cu b/tests/cpp/data/test_iterative_dmatrix.cu
index 503cb76965e6..5fb90a5c1526 100644
--- a/tests/cpp/data/test_iterative_dmatrix.cu
+++ b/tests/cpp/data/test_iterative_dmatrix.cu
@@ -1,5 +1,5 @@
 /**
- * Copyright 2020-2023, XGBoost contributors
+ * Copyright 2020-2024, XGBoost contributors
  */
 #include <gtest/gtest.h>
 
@@ -21,10 +21,10 @@ void TestEquivalent(float sparsity) {
   std::size_t offset = 0;
   auto first = (*m.GetEllpackBatches(&ctx, {}).begin()).Impl();
   std::unique_ptr<EllpackPageImpl> page_concatenated{new EllpackPageImpl(
-      ctx.Device(), first->CutsShared(), first->is_dense, first->row_stride, 1000 * 100)};
+      &ctx, first->CutsShared(), first->is_dense, first->row_stride, 1000 * 100)};
   for (auto& batch : m.GetBatches<EllpackPage>(&ctx, {})) {
     auto page = batch.Impl();
-    size_t num_elements = page_concatenated->Copy(ctx.Device(), page, offset);
+    size_t num_elements = page_concatenated->Copy(&ctx, page, offset);
     offset += num_elements;
   }
   auto from_iter = page_concatenated->GetDeviceAccessor(ctx.Device());
@@ -66,18 +66,15 @@ void TestEquivalent(float sparsity) {
       ASSERT_EQ(cut_ptrs_iter[i], cut_ptrs_data[i]);
     }
 
-    auto const& buffer_from_iter = page_concatenated->gidx_buffer;
-    auto const& buffer_from_data = ellpack.Impl()->gidx_buffer;
-    ASSERT_NE(buffer_from_data.Size(), 0);
-
-    common::CompressedIterator<uint32_t> data_buf{
-        buffer_from_data.ConstHostPointer(), from_data.NumSymbols()};
-    common::CompressedIterator<uint32_t> data_iter{
-        buffer_from_iter.ConstHostPointer(), from_iter.NumSymbols()};
+    std::vector<common::CompressedByteT> buffer_from_iter, buffer_from_data;
+    auto data_iter = page_concatenated->GetHostAccessor(&ctx, &buffer_from_iter);
+    auto data_buf = ellpack.Impl()->GetHostAccessor(&ctx, &buffer_from_data);
+    ASSERT_NE(buffer_from_data.size(), 0);
+    ASSERT_NE(buffer_from_iter.size(), 0);
     CHECK_EQ(from_data.NumSymbols(), from_iter.NumSymbols());
     CHECK_EQ(from_data.n_rows * from_data.row_stride, from_data.n_rows * from_iter.row_stride);
     for (size_t i = 0; i < from_data.n_rows * from_data.row_stride; ++i) {
-      CHECK_EQ(data_buf[i], data_iter[i]);
+      CHECK_EQ(data_buf.gidx_iter[i], data_iter.gidx_iter[i]);
     }
   }
 }
@@ -97,8 +94,8 @@ TEST(IterativeDeviceDMatrix, RowMajor) {
   for (auto& ellpack : m.GetBatches<EllpackPage>(&ctx, {})) {
     n_batches ++;
     auto impl = ellpack.Impl();
-    common::CompressedIterator<uint32_t> iterator(
-        impl->gidx_buffer.HostVector().data(), impl->NumSymbols());
+    std::vector<common::CompressedByteT> h_gidx;
+    auto h_accessor = impl->GetHostAccessor(&ctx, &h_gidx);
     auto cols = CudaArrayIterForTest::Cols();
     auto rows = CudaArrayIterForTest::Rows();
 
@@ -111,7 +108,7 @@ TEST(IterativeDeviceDMatrix, RowMajor) {
 
     for(auto i = 0ull; i < rows * cols; i++) {
       int column_idx = i % cols;
-      EXPECT_EQ(impl->Cuts().SearchBin(h_data[i], column_idx), iterator[i]);
+      EXPECT_EQ(impl->Cuts().SearchBin(h_data[i], column_idx), h_accessor.gidx_iter[i]);
     }
     EXPECT_EQ(m.Info().num_col_, cols);
     EXPECT_EQ(m.Info().num_row_, rows);
@@ -147,12 +144,12 @@ TEST(IterativeDeviceDMatrix, RowMajorMissing) {
       *m.GetBatches<EllpackPage>(&ctx, BatchParam{256, tree::TrainParam::DftSparseThreshold()})
            .begin();
   auto impl = ellpack.Impl();
-  common::CompressedIterator<uint32_t> iterator(
-      impl->gidx_buffer.HostVector().data(), impl->NumSymbols());
-  EXPECT_EQ(iterator[1], impl->GetDeviceAccessor(ctx.Device()).NullValue());
-  EXPECT_EQ(iterator[5], impl->GetDeviceAccessor(ctx.Device()).NullValue());
+  std::vector<common::CompressedByteT> h_gidx;
+  auto h_accessor = impl->GetHostAccessor(&ctx, &h_gidx);
+  EXPECT_EQ(h_accessor.gidx_iter[1], impl->GetDeviceAccessor(ctx.Device()).NullValue());
+  EXPECT_EQ(h_accessor.gidx_iter[5], impl->GetDeviceAccessor(ctx.Device()).NullValue());
   // null values get placed after valid values in a row
-  EXPECT_EQ(iterator[7], impl->GetDeviceAccessor(ctx.Device()).NullValue());
+  EXPECT_EQ(h_accessor.gidx_iter[7], impl->GetDeviceAccessor(ctx.Device()).NullValue());
   EXPECT_EQ(m.Info().num_col_, cols);
   EXPECT_EQ(m.Info().num_row_, rows);
   EXPECT_EQ(m.Info().num_nonzero_, rows* cols - 3);
diff --git a/tests/cpp/data/test_sparse_page_dmatrix.cu b/tests/cpp/data/test_sparse_page_dmatrix.cu
index 327f2ba635fd..046c4eed4d80 100644
--- a/tests/cpp/data/test_sparse_page_dmatrix.cu
+++ b/tests/cpp/data/test_sparse_page_dmatrix.cu
@@ -154,13 +154,18 @@ TEST(SparsePageDMatrix, RetainEllpackPage) {
   for (auto it = begin; it != end; ++it) {
     iterators.push_back(it.Page());
     gidx_buffers.emplace_back();
-    gidx_buffers.back().Resize((*it).Impl()->gidx_buffer.Size());
-    gidx_buffers.back().Copy((*it).Impl()->gidx_buffer);
+    gidx_buffers.back().SetDevice(ctx.Device());
+    gidx_buffers.back().Resize((*it).Impl()->gidx_buffer.size());
+    auto d_dst = gidx_buffers.back().DevicePointer();
+    auto const& d_src = (*it).Impl()->gidx_buffer;
+    dh::safe_cuda(cudaMemcpyAsync(d_dst, d_src.data(), d_src.size_bytes(), cudaMemcpyDefault));
   }
   ASSERT_GE(iterators.size(), 2);
 
   for (size_t i = 0; i < iterators.size(); ++i) {
-    ASSERT_EQ((*iterators[i]).Impl()->gidx_buffer.HostVector(), gidx_buffers.at(i).HostVector());
+    std::vector<common::CompressedByteT> h_buf;
+    [[maybe_unused]] auto h_acc = (*iterators[i]).Impl()->GetHostAccessor(&ctx, &h_buf);
+    ASSERT_EQ(h_buf, gidx_buffers.at(i).HostVector());
     ASSERT_EQ(iterators[i].use_count(), 1);
   }
 
@@ -210,11 +215,11 @@ class TestEllpackPageExt : public ::testing::TestWithParam<std::tuple<bool, bool
     size_t offset = 0;
     for (auto& batch : p_ext_fmat->GetBatches<EllpackPage>(&ctx, param)) {
       if (!impl_ext) {
-        impl_ext = std::make_unique<EllpackPageImpl>(
-            batch.Impl()->gidx_buffer.Device(), batch.Impl()->CutsShared(), batch.Impl()->is_dense,
-            batch.Impl()->row_stride, kRows);
+        impl_ext = std::make_unique<EllpackPageImpl>(&ctx, batch.Impl()->CutsShared(),
+                                                     batch.Impl()->is_dense,
+                                                     batch.Impl()->row_stride, kRows);
       }
-      auto n_elems = impl_ext->Copy(ctx.Device(), batch.Impl(), offset);
+      auto n_elems = impl_ext->Copy(&ctx, batch.Impl(), offset);
       offset += n_elems;
     }
     ASSERT_EQ(impl_ext->base_rowid, 0);
@@ -223,8 +228,10 @@ class TestEllpackPageExt : public ::testing::TestWithParam<std::tuple<bool, bool
     ASSERT_EQ(impl_ext->row_stride, 2);
     ASSERT_EQ(impl_ext->Cuts().TotalBins(), 4);
 
-    std::vector<common::CompressedByteT> buffer(impl->gidx_buffer.HostVector());
-    std::vector<common::CompressedByteT> buffer_ext(impl_ext->gidx_buffer.HostVector());
+    std::vector<common::CompressedByteT> buffer;
+    [[maybe_unused]] auto h_acc = impl->GetHostAccessor(&ctx, &buffer);
+    std::vector<common::CompressedByteT> buffer_ext;
+    [[maybe_unused]] auto h_ext_acc = impl_ext->GetHostAccessor(&ctx, &buffer_ext);
     ASSERT_EQ(buffer, buffer_ext);
   }
 };
diff --git a/tests/cpp/filesystem.h b/tests/cpp/filesystem.h
index c8d144291b0f..fafc8c7d1bf9 100644
--- a/tests/cpp/filesystem.h
+++ b/tests/cpp/filesystem.h
@@ -1,13 +1,10 @@
-/*!
- * Copyright (c) 2022 by XGBoost Contributors
+/**
+ * Copyright 2022-2024, XGBoost Contributors
  */
 #ifndef XGBOOST_TESTS_CPP_FILESYSTEM_H
 #define XGBOOST_TESTS_CPP_FILESYSTEM_H
 
-// A macro used inside `windows.h` to avoid conflicts with `winsock2.h`
-#ifndef WIN32_LEAN_AND_MEAN
-#define WIN32_LEAN_AND_MEAN
-#endif  // WIN32_LEAN_AND_MEAN
+#include <xgboost/windefs.h>
 
 #include "dmlc/filesystem.h"
 
diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc
index 9b988f9605bd..eebbaf8ef795 100644
--- a/tests/cpp/helpers.cc
+++ b/tests/cpp/helpers.cc
@@ -763,4 +763,6 @@ void DeleteRMMResource(RMMAllocator*) {}
 
 RMMAllocatorPtr SetUpRMMResourceForCppTests(int, char**) { return {nullptr, DeleteRMMResource}; }
 #endif  // !defined(XGBOOST_USE_RMM) || XGBOOST_USE_RMM != 1
+
+std::int32_t DistGpuIdx() { return common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank(); }
 } // namespace xgboost
diff --git a/tests/cpp/helpers.h b/tests/cpp/helpers.h
index 2211b2d00cb2..2821a11380c8 100644
--- a/tests/cpp/helpers.h
+++ b/tests/cpp/helpers.h
@@ -21,14 +21,11 @@
 
 #if defined(__CUDACC__)
 #include "../../src/collective/communicator-inl.h"  // for GetRank
-#include "../../src/common/common.h"                // for AllVisibleGPUs
+#include "../../src/common/cuda_rt_utils.h"         // for AllVisibleGPUs
 #endif  // defined(__CUDACC__)
 
 #include "filesystem.h"  // dmlc::TemporaryDirectory
 #include "xgboost/linalg.h"
-#if !defined(_OPENMP)
-#include <thread>
-#endif
 
 #if defined(__CUDACC__)
 #define DeclareUnifiedTest(name) GPU ## name
@@ -529,6 +526,9 @@ inline std::int32_t AllThreadsForTest() { return Context{}.Threads(); }
 
 inline DeviceOrd FstCU() { return DeviceOrd::CUDA(0); }
 
+// GPU device ordinal for distributed tests
+std::int32_t DistGpuIdx();
+
 inline auto GMockThrow(StringView msg) {
   return ::testing::ThrowsMessage<dmlc::Error>(::testing::HasSubstr(msg));
 }
diff --git a/tests/cpp/histogram_helpers.h b/tests/cpp/histogram_helpers.h
index a33d6958ae3f..ff021e819821 100644
--- a/tests/cpp/histogram_helpers.h
+++ b/tests/cpp/histogram_helpers.h
@@ -23,7 +23,7 @@ class HistogramCutsWrapper : public common::HistogramCuts {
 };
 }  // namespace detail
 
-inline std::unique_ptr<EllpackPageImpl> BuildEllpackPage(int n_rows, int n_cols,
+inline std::unique_ptr<EllpackPageImpl> BuildEllpackPage(Context const* ctx, int n_rows, int n_cols,
                                                          bst_float sparsity = 0) {
   auto dmat = RandomDataGenerator(n_rows, n_cols, sparsity).Seed(3).GenerateDMatrix();
   const SparsePage& batch = *dmat->GetBatches<xgboost::SparsePage>().begin();
@@ -48,7 +48,7 @@ inline std::unique_ptr<EllpackPageImpl> BuildEllpackPage(int n_rows, int n_cols,
   }
 
   auto page = std::unique_ptr<EllpackPageImpl>(
-      new EllpackPageImpl(DeviceOrd::CUDA(0), cmat, batch, dmat->IsDense(), row_stride, {}));
+      new EllpackPageImpl(ctx, cmat, batch, dmat->IsDense(), row_stride, {}));
 
   return page;
 }
diff --git a/tests/cpp/objective/test_aft_obj.cc b/tests/cpp/objective/test_aft_obj.cc
index 972dfc53f58e..f31debb21af9 100644
--- a/tests/cpp/objective/test_aft_obj.cc
+++ b/tests/cpp/objective/test_aft_obj.cc
@@ -1,5 +1,5 @@
 /**
- * Copyright 2020-2023, XGBoost Contributors 
+ * Copyright 2020-2024, XGBoost Contributors
  */
 #include <gtest/gtest.h>
 #include <memory>
@@ -10,7 +10,6 @@
 #include "xgboost/objective.h"
 #include "xgboost/logging.h"
 #include "../helpers.h"
-#include "../../../src/common/survival_util.h"
 
 namespace xgboost::common {
 TEST(Objective, DeclareUnifiedTest(AFTObjConfiguration)) {
diff --git a/tests/cpp/plugin/federated/test_federated_coll.cu b/tests/cpp/plugin/federated/test_federated_coll.cu
index f3b9066133cc..31760a97f1fe 100644
--- a/tests/cpp/plugin/federated/test_federated_coll.cu
+++ b/tests/cpp/plugin/federated/test_federated_coll.cu
@@ -6,7 +6,7 @@
 #include <xgboost/collective/result.h>  // for Result
 
 #include "../../../../src/collective/allreduce.h"
-#include "../../../../src/common/common.h"            // for AllVisibleGPUs
+#include "../../../../src/common/cuda_rt_utils.h"     // for AllVisibleGPUs
 #include "../../../../src/common/device_helpers.cuh"  // for device_vector
 #include "../../../../src/common/type.h"              // for EraseType
 #include "../../collective/test_worker.h"             // for SocketTest
diff --git a/tests/cpp/plugin/federated/test_federated_comm_group.cc b/tests/cpp/plugin/federated/test_federated_comm_group.cc
index 9bfbdd3ae1b4..511b3d8d11a8 100644
--- a/tests/cpp/plugin/federated/test_federated_comm_group.cc
+++ b/tests/cpp/plugin/federated/test_federated_comm_group.cc
@@ -1,11 +1,11 @@
 /**
- * Copyright 2023, XGBoost Contributors
+ * Copyright 2023-2024, XGBoost Contributors
  */
 #include <gtest/gtest.h>
 #include <xgboost/json.h>  // for Json
 
 #include "../../../../src/collective/comm_group.h"
-#include "../../helpers.h"
+#include "../../../../src/common/cuda_rt_utils.h"  // for AllVisibleGPUs
 #include "test_worker.h"
 
 namespace xgboost::collective {
diff --git a/tests/cpp/plugin/federated/test_federated_comm_group.cu b/tests/cpp/plugin/federated/test_federated_comm_group.cu
index 747adb6fd87e..c6fd8921c0bb 100644
--- a/tests/cpp/plugin/federated/test_federated_comm_group.cu
+++ b/tests/cpp/plugin/federated/test_federated_comm_group.cu
@@ -1,10 +1,11 @@
 /**
- * Copyright 2023, XGBoost Contributors
+ * Copyright 2023-2024, XGBoost Contributors
  */
 #include <gtest/gtest.h>
 #include <xgboost/json.h>  // for Json
 
 #include "../../../../src/collective/comm_group.h"
+#include "../../../../src/common/cuda_rt_utils.h"  // for AllVisibleGPUs
 #include "../../helpers.h"
 #include "test_worker.h"
 
diff --git a/tests/cpp/test_context.cu b/tests/cpp/test_context.cu
index 7684ff4672cd..0776980353f1 100644
--- a/tests/cpp/test_context.cu
+++ b/tests/cpp/test_context.cu
@@ -1,5 +1,5 @@
 /**
- * Copyright 2023, XGBoost Contributors
+ * Copyright 2023-2024, XGBoost Contributors
  */
 #include <gtest/gtest.h>
 #include <xgboost/base.h>  // for Args
@@ -8,7 +8,7 @@
 
 #include <string>  // for string, to_string
 
-#include "../../src/common/common.h"  // for AllVisibleGPUs
+#include "../../src/common/cuda_rt_utils.h"  // for AllVisibleGPUs
 
 namespace xgboost {
 namespace {
diff --git a/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu b/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu
index 9a0304f87d58..85bea39c5f5c 100644
--- a/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu
+++ b/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu
@@ -1,5 +1,5 @@
 /**
- * Copyright 2020-2023, XGBoost Contributors
+ * Copyright 2020-2024, XGBoost Contributors
  */
 #include <gtest/gtest.h>
 
@@ -102,19 +102,17 @@ TEST(GradientBasedSampler, NoSamplingExternalMemory) {
   EXPECT_EQ(sample.gpair.data(), gpair.DevicePointer());
   EXPECT_EQ(sampled_page->n_rows, kRows);
 
-  std::vector<common::CompressedByteT> buffer(sampled_page->gidx_buffer.HostVector());
-  common::CompressedIterator<common::CompressedByteT>
-      ci(buffer.data(), sampled_page->NumSymbols());
+  std::vector<common::CompressedByteT> h_gidx_buffer;
+  auto h_accessor = sampled_page->GetHostAccessor(&ctx, &h_gidx_buffer);
 
-  size_t offset = 0;
+  std::size_t offset = 0;
   for (auto& batch : dmat->GetBatches<EllpackPage>(&ctx, param)) {
     auto page = batch.Impl();
-    std::vector<common::CompressedByteT> page_buffer(page->gidx_buffer.HostVector());
-    common::CompressedIterator<common::CompressedByteT>
-        page_ci(page_buffer.data(), page->NumSymbols());
+    std::vector<common::CompressedByteT> h_page_gidx_buffer;
+    auto page_accessor = page->GetHostAccessor(&ctx, &h_page_gidx_buffer);
     size_t num_elements = page->n_rows * page->row_stride;
     for (size_t i = 0; i < num_elements; i++) {
-      EXPECT_EQ(ci[i + offset], page_ci[i]);
+      EXPECT_EQ(h_accessor.gidx_iter[i + offset], page_accessor.gidx_iter[i]);
     }
     offset += num_elements;
   }
diff --git a/tests/cpp/tree/gpu_hist/test_histogram.cu b/tests/cpp/tree/gpu_hist/test_histogram.cu
index d1128446617b..c9320f616983 100644
--- a/tests/cpp/tree/gpu_hist/test_histogram.cu
+++ b/tests/cpp/tree/gpu_hist/test_histogram.cu
@@ -14,6 +14,46 @@
 #include "../../helpers.h"
 
 namespace xgboost::tree {
+TEST(Histogram, DeviceHistogramStorage) {
+  // Ensures that node allocates correctly after reaching `kStopGrowingSize`.
+  auto ctx = MakeCUDACtx(0);
+  constexpr size_t kNBins = 128;
+  constexpr int kNNodes = 4;
+  constexpr size_t kStopGrowing = kNNodes * kNBins * 2u;
+  DeviceHistogramStorage<kStopGrowing> histogram;
+  histogram.Init(FstCU(), kNBins);
+  for (int i = 0; i < kNNodes; ++i) {
+    histogram.AllocateHistograms(&ctx, {i});
+  }
+  histogram.Reset(&ctx);
+  ASSERT_EQ(histogram.Data().size(), kStopGrowing);
+
+  // Use allocated memory but do not erase nidx_map.
+  for (int i = 0; i < kNNodes; ++i) {
+    histogram.AllocateHistograms(&ctx, {i});
+  }
+  for (int i = 0; i < kNNodes; ++i) {
+    ASSERT_TRUE(histogram.HistogramExists(i));
+  }
+
+  // Add two new nodes
+  histogram.AllocateHistograms(&ctx, {kNNodes});
+  histogram.AllocateHistograms(&ctx, {kNNodes + 1});
+
+  // Old cached nodes should still exist
+  for (int i = 0; i < kNNodes; ++i) {
+    ASSERT_TRUE(histogram.HistogramExists(i));
+  }
+
+  // Should be deleted
+  ASSERT_FALSE(histogram.HistogramExists(kNNodes));
+  // Most recent node should exist
+  ASSERT_TRUE(histogram.HistogramExists(kNNodes + 1));
+
+  // Add same node again - should fail
+  EXPECT_ANY_THROW(histogram.AllocateHistograms(&ctx, {kNNodes + 1}););
+}
+
 void TestDeterministicHistogram(bool is_dense, int shm_size, bool force_global) {
   Context ctx = MakeCUDACtx(0);
   size_t constexpr kBins = 256, kCols = 120, kRows = 16384, kRounds = 16;
@@ -328,8 +368,7 @@ class HistogramExternalMemoryTest : public ::testing::TestWithParam<std::tuple<f
       for (auto const& page : p_fmat->GetBatches<SparsePage>()) {
         concat.Push(page);
       }
-      EllpackPageImpl page{
-          ctx.Device(), cuts, concat, p_fmat->IsDense(), p_fmat->Info().num_col_, {}};
+      EllpackPageImpl page{&ctx, cuts, concat, p_fmat->IsDense(), p_fmat->Info().num_col_, {}};
       auto ridx = partitioner.GetRows(0);
       auto d_histogram = dh::ToSpan(single_hist);
       DeviceHistogramBuilder builder;
diff --git a/tests/cpp/tree/test_approx.cc b/tests/cpp/tree/test_approx.cc
index 8f28bfa218c8..83e9243a2fa4 100644
--- a/tests/cpp/tree/test_approx.cc
+++ b/tests/cpp/tree/test_approx.cc
@@ -4,10 +4,12 @@
 #include <gtest/gtest.h>
 
 #include "../../../src/tree/common_row_partitioner.h"
+#include "../../../src/tree/param.h"    // for TrainParam
 #include "../collective/test_worker.h"  // for TestDistributedGlobal
 #include "../helpers.h"
 #include "test_column_split.h"  // for TestColumnSplit
 #include "test_partitioner.h"
+#include "xgboost/tree_model.h"  // for RegTree
 
 namespace xgboost::tree {
 namespace {
@@ -76,6 +78,53 @@ TEST(Approx, Partitioner) {
   }
 }
 
+TEST(Approx, InteractionConstraint) {
+  auto constexpr kRows = 32;
+  auto constexpr kCols = 16;
+  auto p_dmat = GenerateCatDMatrix(kRows, kCols, 0.6f, false);
+  Context ctx;
+
+  linalg::Matrix<GradientPair> gpair({kRows}, ctx.Device());
+  gpair.Data()->Copy(GenerateRandomGradients(kRows));
+
+  ObjInfo task{ObjInfo::kRegression};
+  {
+    // With constraints
+    RegTree tree{1, kCols};
+
+    std::unique_ptr<TreeUpdater> updater{TreeUpdater::Create("grow_histmaker", &ctx, &task)};
+    TrainParam param;
+    param.UpdateAllowUnknown(
+        Args{{"interaction_constraints", "[[0, 1]]"}, {"num_feature", std::to_string(kCols)}});
+    std::vector<HostDeviceVector<bst_node_t>> position(1);
+    updater->Configure(Args{});
+    updater->Update(&param, &gpair, p_dmat.get(), position, {&tree});
+
+    ASSERT_EQ(tree.NumExtraNodes(), 4);
+    ASSERT_EQ(tree[0].SplitIndex(), 1);
+
+    ASSERT_EQ(tree[tree[0].LeftChild()].SplitIndex(), 0);
+    ASSERT_EQ(tree[tree[0].RightChild()].SplitIndex(), 0);
+  }
+  {
+    // Without constraints
+    RegTree tree{1u, kCols};
+
+    std::unique_ptr<TreeUpdater> updater{TreeUpdater::Create("grow_histmaker", &ctx, &task)};
+    std::vector<HostDeviceVector<bst_node_t>> position(1);
+    TrainParam param;
+    param.Init(Args{});
+    updater->Configure(Args{});
+    updater->Update(&param, &gpair, p_dmat.get(), position, {&tree});
+
+    ASSERT_EQ(tree.NumExtraNodes(), 10);
+    ASSERT_EQ(tree[0].SplitIndex(), 1);
+
+    ASSERT_NE(tree[tree[0].LeftChild()].SplitIndex(), 0);
+    ASSERT_NE(tree[tree[0].RightChild()].SplitIndex(), 0);
+  }
+}
+
 namespace {
 void TestColumnSplitPartitioner(size_t n_samples, size_t base_rowid, std::shared_ptr<DMatrix> Xy,
                                 std::vector<float>* hess, float min_value, float mid_value,
diff --git a/tests/cpp/tree/test_column_split.h b/tests/cpp/tree/test_column_split.h
index b03597f38681..eba452a15a1c 100644
--- a/tests/cpp/tree/test_column_split.h
+++ b/tests/cpp/tree/test_column_split.h
@@ -23,9 +23,13 @@ inline std::shared_ptr<DMatrix> GenerateCatDMatrix(std::size_t rows, std::size_t
     for (size_t i = 0; i < ft.size(); ++i) {
       ft[i] = (i % 3 == 0) ? FeatureType::kNumerical : FeatureType::kCategorical;
     }
-    return RandomDataGenerator(rows, cols, 0.6f).Seed(3).Type(ft).MaxCategory(17).GenerateDMatrix();
+    return RandomDataGenerator(rows, cols, sparsity)
+        .Seed(3)
+        .Type(ft)
+        .MaxCategory(17)
+        .GenerateDMatrix();
   } else {
-    return RandomDataGenerator{rows, cols, 0.6f}.Seed(3).GenerateDMatrix();
+    return RandomDataGenerator{rows, cols, sparsity}.Seed(3).GenerateDMatrix();
   }
 }
 
diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu
index 200fb39fb4e9..edd129353bdf 100644
--- a/tests/cpp/tree/test_gpu_hist.cu
+++ b/tests/cpp/tree/test_gpu_hist.cu
@@ -6,14 +6,14 @@
 #include <thrust/host_vector.h>
 #include <xgboost/base.h>
 
-#include <random>
 #include <string>
 #include <vector>
 
 #include "../../../src/common/common.h"
-#include "../../../src/data/ellpack_page.cuh"  // for EllpackPageImpl
-#include "../../../src/data/ellpack_page.h"    // for EllpackPage
-#include "../../../src/tree/param.h"           // for TrainParam
+#include "../../../src/data/ellpack_page.cuh"        // for EllpackPageImpl
+#include "../../../src/data/ellpack_page.h"          // for EllpackPage
+#include "../../../src/tree/gpu_hist/histogram.cuh"  // for DeviceHistogramStorage
+#include "../../../src/tree/param.h"                 // for TrainParam
 #include "../../../src/tree/updater_gpu_hist.cu"
 #include "../collective/test_worker.h"  // for BaseMGPUTest
 #include "../filesystem.h"              // dmlc::TemporaryDirectory
@@ -23,46 +23,6 @@
 #include "xgboost/json.h"
 
 namespace xgboost::tree {
-TEST(GpuHist, DeviceHistogramStorage) {
-  // Ensures that node allocates correctly after reaching `kStopGrowingSize`.
-  dh::safe_cuda(cudaSetDevice(0));
-  constexpr size_t kNBins = 128;
-  constexpr int kNNodes = 4;
-  constexpr size_t kStopGrowing = kNNodes * kNBins * 2u;
-  DeviceHistogramStorage<kStopGrowing> histogram;
-  histogram.Init(FstCU(), kNBins);
-  for (int i = 0; i < kNNodes; ++i) {
-    histogram.AllocateHistograms({i});
-  }
-  histogram.Reset();
-  ASSERT_EQ(histogram.Data().size(), kStopGrowing);
-
-  // Use allocated memory but do not erase nidx_map.
-  for (int i = 0; i < kNNodes; ++i) {
-    histogram.AllocateHistograms({i});
-  }
-  for (int i = 0; i < kNNodes; ++i) {
-    ASSERT_TRUE(histogram.HistogramExists(i));
-  }
-
-  // Add two new nodes
-  histogram.AllocateHistograms({kNNodes});
-  histogram.AllocateHistograms({kNNodes + 1});
-
-  // Old cached nodes should still exist
-  for (int i = 0; i < kNNodes; ++i) {
-    ASSERT_TRUE(histogram.HistogramExists(i));
-  }
-
-  // Should be deleted
-  ASSERT_FALSE(histogram.HistogramExists(kNNodes));
-  // Most recent node should exist
-  ASSERT_TRUE(histogram.HistogramExists(kNNodes + 1));
-
-  // Add same node again - should fail
-  EXPECT_ANY_THROW(histogram.AllocateHistograms({kNNodes + 1}););
-}
-
 std::vector<GradientPairPrecise> GetHostHistGpair() {
   // 24 bins, 3 bins for each feature (column).
   std::vector<GradientPairPrecise> hist_gpair = {
@@ -81,6 +41,7 @@ std::vector<GradientPairPrecise> GetHostHistGpair() {
 template <typename GradientSumT>
 void TestBuildHist(bool use_shared_memory_histograms) {
   int const kNRows = 16, kNCols = 8;
+  Context ctx{MakeCUDACtx(0)};
 
   TrainParam param;
   Args args{
@@ -89,9 +50,8 @@ void TestBuildHist(bool use_shared_memory_histograms) {
   };
   param.Init(args);
 
-  auto page = BuildEllpackPage(kNRows, kNCols);
+  auto page = BuildEllpackPage(&ctx, kNRows, kNCols);
   BatchParam batch_param{};
-  Context ctx{MakeCUDACtx(0)};
   auto cs = std::make_shared<common::ColumnSampler>(0);
   GPUHistMakerDevice maker(&ctx, /*is_external_memory=*/false, {}, kNRows, param, cs, kNCols,
                            batch_param, MetaInfo());
@@ -105,11 +65,10 @@ void TestBuildHist(bool use_shared_memory_histograms) {
   }
   gpair.SetDevice(ctx.Device());
 
-  thrust::host_vector<common::CompressedByteT> h_gidx_buffer(page->gidx_buffer.HostVector());
   maker.row_partitioner = std::make_unique<RowPartitioner>(&ctx, kNRows, 0);
 
   maker.hist.Init(ctx.Device(), page->Cuts().TotalBins());
-  maker.hist.AllocateHistograms({0});
+  maker.hist.AllocateHistograms(&ctx, {0});
 
   maker.gpair = gpair.DeviceSpan();
   maker.quantiser = std::make_unique<GradientQuantiser>(&ctx, maker.gpair, MetaInfo());
@@ -198,14 +157,12 @@ void TestHistogramIndexImpl() {
   auto grad = GenerateRandomGradients(kNRows);
   grad.SetDevice(DeviceOrd::CUDA(0));
   maker->Reset(&grad, hist_maker_dmat.get(), kNCols);
-  std::vector<common::CompressedByteT> h_gidx_buffer(maker->page->gidx_buffer.HostVector());
 
   const auto &maker_ext = hist_maker_ext.maker;
   maker_ext->Reset(&grad, hist_maker_ext_dmat.get(), kNCols);
-  std::vector<common::CompressedByteT> h_gidx_buffer_ext(maker_ext->page->gidx_buffer.HostVector());
 
   ASSERT_EQ(maker->page->Cuts().TotalBins(), maker_ext->page->Cuts().TotalBins());
-  ASSERT_EQ(maker->page->gidx_buffer.Size(), maker_ext->page->gidx_buffer.Size());
+  ASSERT_EQ(maker->page->gidx_buffer.size(), maker_ext->page->gidx_buffer.size());
 }
 
 TEST(GpuHist, TestHistogramIndex) {
@@ -428,8 +385,8 @@ TEST(GpuHist, MaxDepth) {
 namespace {
 RegTree GetHistTree(Context const* ctx, DMatrix* dmat) {
   ObjInfo task{ObjInfo::kRegression};
-  GPUHistMaker hist_maker{ctx, &task};
-  hist_maker.Configure(Args{});
+  std::unique_ptr<TreeUpdater> hist_maker {TreeUpdater::Create("grow_gpu_hist", ctx, &task)};
+  hist_maker->Configure(Args{});
 
   TrainParam param;
   param.UpdateAllowUnknown(Args{});
@@ -439,8 +396,8 @@ RegTree GetHistTree(Context const* ctx, DMatrix* dmat) {
 
   std::vector<HostDeviceVector<bst_node_t>> position(1);
   RegTree tree;
-  hist_maker.Update(&param, &gpair, dmat, common::Span<HostDeviceVector<bst_node_t>>{position},
-                    {&tree});
+  hist_maker->Update(&param, &gpair, dmat, common::Span<HostDeviceVector<bst_node_t>>{position},
+                     {&tree});
   return tree;
 }
 
@@ -479,8 +436,8 @@ TEST_F(MGPUHistTest, HistColumnSplit) {
 namespace {
 RegTree GetApproxTree(Context const* ctx, DMatrix* dmat) {
   ObjInfo task{ObjInfo::kRegression};
-  GPUGlobalApproxMaker approx_maker{ctx, &task};
-  approx_maker.Configure(Args{});
+  std::unique_ptr<TreeUpdater> approx_maker{TreeUpdater::Create("grow_gpu_approx", ctx, &task)};
+  approx_maker->Configure(Args{});
 
   TrainParam param;
   param.UpdateAllowUnknown(Args{});
@@ -490,13 +447,13 @@ RegTree GetApproxTree(Context const* ctx, DMatrix* dmat) {
 
   std::vector<HostDeviceVector<bst_node_t>> position(1);
   RegTree tree;
-  approx_maker.Update(&param, &gpair, dmat, common::Span<HostDeviceVector<bst_node_t>>{position},
-                      {&tree});
+  approx_maker->Update(&param, &gpair, dmat, common::Span<HostDeviceVector<bst_node_t>>{position},
+                       {&tree});
   return tree;
 }
 
 void VerifyApproxColumnSplit(bst_idx_t rows, bst_feature_t cols, RegTree const& expected_tree) {
-  Context ctx(MakeCUDACtx(GPUIDX));
+  auto ctx = MakeCUDACtx(DistGpuIdx());
 
   auto Xy = RandomDataGenerator{rows, cols, 0}.GenerateDMatrix(true);
   auto const world_size = collective::GetWorldSize();
diff --git a/tests/cpp/tree/test_histmaker.cc b/tests/cpp/tree/test_histmaker.cc
deleted file mode 100644
index 888790aa7c3c..000000000000
--- a/tests/cpp/tree/test_histmaker.cc
+++ /dev/null
@@ -1,59 +0,0 @@
-/**
- * Copyright 2019-2024, XGBoost Contributors
- */
-#include <gtest/gtest.h>
-#include <xgboost/tree_model.h>
-#include <xgboost/tree_updater.h>
-
-#include "../../../src/tree/param.h"  // for TrainParam
-#include "../helpers.h"
-#include "test_column_split.h"  // for GenerateCatDMatrix
-
-namespace xgboost::tree {
-TEST(GrowHistMaker, InteractionConstraint) {
-  auto constexpr kRows = 32;
-  auto constexpr kCols = 16;
-  auto p_dmat = GenerateCatDMatrix(kRows, kCols, 0.0, false);
-  Context ctx;
-
-  linalg::Matrix<GradientPair> gpair({kRows}, ctx.Device());
-  gpair.Data()->Copy(GenerateRandomGradients(kRows));
-
-  ObjInfo task{ObjInfo::kRegression};
-  {
-    // With constraints
-    RegTree tree{1, kCols};
-
-    std::unique_ptr<TreeUpdater> updater{TreeUpdater::Create("grow_histmaker", &ctx, &task)};
-    TrainParam param;
-    param.UpdateAllowUnknown(
-        Args{{"interaction_constraints", "[[0, 1]]"}, {"num_feature", std::to_string(kCols)}});
-    std::vector<HostDeviceVector<bst_node_t>> position(1);
-    updater->Configure(Args{});
-    updater->Update(&param, &gpair, p_dmat.get(), position, {&tree});
-
-    ASSERT_EQ(tree.NumExtraNodes(), 4);
-    ASSERT_EQ(tree[0].SplitIndex(), 1);
-
-    ASSERT_EQ(tree[tree[0].LeftChild()].SplitIndex(), 0);
-    ASSERT_EQ(tree[tree[0].RightChild()].SplitIndex(), 0);
-  }
-  {
-    // Without constraints
-    RegTree tree{1u, kCols};
-
-    std::unique_ptr<TreeUpdater> updater{TreeUpdater::Create("grow_histmaker", &ctx, &task)};
-    std::vector<HostDeviceVector<bst_node_t>> position(1);
-    TrainParam param;
-    param.Init(Args{});
-    updater->Configure(Args{});
-    updater->Update(&param, &gpair, p_dmat.get(), position, {&tree});
-
-    ASSERT_EQ(tree.NumExtraNodes(), 10);
-    ASSERT_EQ(tree[0].SplitIndex(), 1);
-
-    ASSERT_NE(tree[tree[0].LeftChild()].SplitIndex(), 0);
-    ASSERT_NE(tree[tree[0].RightChild()].SplitIndex(), 0);
-  }
-}
-}  // namespace xgboost::tree