From 582ea104b5a655e0b24b9d03b33edc98c80cbc8c Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 15 Aug 2024 21:41:59 +0800 Subject: [PATCH 01/19] [EM] Enable prediction cache for GPU. (#10707) - Use `UpdatePosition` for all nodes and skip `FinalizePosition` when external memory is used. - Create `encode/decode` for node position, this is just as a refactor. - Reuse code between update position and finalization. --- src/common/categorical.h | 18 +- src/common/device_helpers.cuh | 28 +- src/common/device_vector.cuh | 11 + src/common/partition_builder.h | 8 +- src/common/quantile.cu | 1 + src/data/ellpack_page.cuh | 26 +- src/objective/adaptive.cc | 23 +- src/objective/adaptive.cu | 15 +- src/tree/gpu_hist/evaluate_splits.cu | 7 +- src/tree/gpu_hist/row_partitioner.cu | 1 + src/tree/gpu_hist/row_partitioner.cuh | 85 ++--- src/tree/param.h | 3 - src/tree/sample_position.h | 21 ++ src/tree/tree_model.cc | 10 +- src/tree/updater_colmaker.cc | 26 +- src/tree/updater_gpu_common.cuh | 18 +- src/tree/updater_gpu_hist.cu | 318 +++++++++--------- .../cpp/tree/gpu_hist/test_row_partitioner.cu | 6 +- tests/cpp/tree/hist/test_expand_entry.cc | 3 +- tests/cpp/tree/test_gpu_hist.cu | 73 ++-- 20 files changed, 376 insertions(+), 325 deletions(-) create mode 100644 src/tree/sample_position.h diff --git a/src/common/categorical.h b/src/common/categorical.h index 32b771ad68f2..de9ffe04b7c2 100644 --- a/src/common/categorical.h +++ b/src/common/categorical.h @@ -1,20 +1,17 @@ /** - * Copyright 2020-2023, XGBoost Contributors + * Copyright 2020-2024, XGBoost Contributors * \file categorical.h */ #ifndef XGBOOST_COMMON_CATEGORICAL_H_ #define XGBOOST_COMMON_CATEGORICAL_H_ -#include - #include "bitfield.h" #include "xgboost/base.h" #include "xgboost/data.h" #include "xgboost/span.h" +#include "xgboost/tree_model.h" -namespace xgboost { -namespace common { - +namespace xgboost::common { using CatBitField = LBitField32; using KCatBitField = CLBitField32; @@ -94,7 +91,12 @@ XGBOOST_DEVICE inline bool UseOneHot(uint32_t n_cats, uint32_t max_cat_to_onehot struct IsCatOp { XGBOOST_DEVICE bool operator()(FeatureType ft) { return ft == FeatureType::kCategorical; } }; -} // namespace common -} // namespace xgboost + +inline auto GetNodeCats(common::Span categories, + RegTree::CategoricalSplitMatrix::Segment seg) { + KCatBitField node_cats{categories.subspan(seg.beg, seg.size)}; + return node_cats; +} +} // namespace xgboost::common #endif // XGBOOST_COMMON_CATEGORICAL_H_ diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index 31ea232e59eb..6cd0cd76a47a 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -16,12 +16,9 @@ #include // for size_t #include #include // for UnitWord -#include -#include #include #include -#include "../collective/communicator-inl.h" #include "common.h" #include "device_vector.cuh" #include "xgboost/host_device_vector.h" @@ -375,19 +372,24 @@ void CopyDeviceSpanToVector(std::vector *dst, xgboost::common::Span cudaMemcpyDeviceToHost)); } -template -void CopyToD(HContainer const &h, DContainer *d) { - if (h.empty()) { - d->clear(); +template +void CopyTo(Src const &src, Dst *dst) { + if (src.empty()) { + dst->clear(); return; } - d->resize(h.size()); - using HVT = std::remove_cv_t; - using DVT = std::remove_cv_t; - static_assert(std::is_same::value, + dst->resize(src.size()); + using SVT = std::remove_cv_t; + using DVT = std::remove_cv_t; + static_assert(std::is_same::value, "Host and device containers must have same value type."); - 
dh::safe_cuda(cudaMemcpyAsync(d->data().get(), h.data(), h.size() * sizeof(HVT), - cudaMemcpyHostToDevice)); + dh::safe_cuda(cudaMemcpyAsync(thrust::raw_pointer_cast(dst->data()), src.data(), + src.size() * sizeof(SVT), cudaMemcpyDefault)); +} + +template +void CopyToD(HContainer const &h, DContainer *d) { + CopyTo(h, d); } // Keep track of pinned memory allocation diff --git a/src/common/device_vector.cuh b/src/common/device_vector.cuh index 2587ce719780..9abcbb1d1a8b 100644 --- a/src/common/device_vector.cuh +++ b/src/common/device_vector.cuh @@ -307,6 +307,7 @@ class DeviceUVector { public: DeviceUVector() = default; + explicit DeviceUVector(std::size_t n) { this->resize(n); } DeviceUVector(DeviceUVector const &that) = delete; DeviceUVector &operator=(DeviceUVector const &that) = delete; DeviceUVector(DeviceUVector &&that) = default; @@ -330,7 +331,17 @@ class DeviceUVector { data_.resize(n, v); #endif } + + void clear() { // NOLINT +#if defined(XGBOOST_USE_RMM) + this->data_.resize(0, rmm::cuda_stream_per_thread); +#else + this->data_.clear(); +#endif // defined(XGBOOST_USE_RMM) + } + [[nodiscard]] std::size_t size() const { return data_.size(); } // NOLINT + [[nodiscard]] bool empty() const { return this->size() == 0; } // NOLINT [[nodiscard]] auto begin() { return data_.begin(); } // NOLINT [[nodiscard]] auto end() { return data_.end(); } // NOLINT diff --git a/src/common/partition_builder.h b/src/common/partition_builder.h index 98c876e849a0..54febd750602 100644 --- a/src/common/partition_builder.h +++ b/src/common/partition_builder.h @@ -20,6 +20,7 @@ #include "column_matrix.h" #include "xgboost/context.h" #include "xgboost/tree_model.h" +#include "../tree/sample_position.h" // for SamplePosition namespace xgboost::common { // The builder is required for samples partition to left and rights children for set of nodes @@ -364,13 +365,14 @@ class PartitionBuilder { } // Copy row partitions into global cache for reuse in objective - template + template void LeafPartition(Context const* ctx, RegTree const& tree, RowSetCollection const& row_set, - std::vector* p_position, Sampledp sampledp) const { + std::vector* p_position, Invalidp invalidp) const { auto& h_pos = *p_position; h_pos.resize(row_set.Data()->size(), std::numeric_limits::max()); auto p_begin = row_set.Data()->data(); + // For each node, walk through all the samples that fall in this node. ParallelFor(row_set.Size(), ctx->Threads(), [&](size_t i) { auto const& node = row_set[i]; if (node.node_id < 0) { @@ -381,7 +383,7 @@ class PartitionBuilder { size_t ptr_offset = node.end() - p_begin; CHECK_LE(ptr_offset, row_set.Data()->size()) << node.node_id; for (auto idx = node.begin(); idx != node.end(); ++idx) { - h_pos[*idx] = sampledp(*idx) ? 
~node.node_id : node.node_id; + h_pos[*idx] = tree::SamplePosition::Encode(node.node_id, !invalidp(*idx)); } } }); diff --git a/src/common/quantile.cu b/src/common/quantile.cu index d0356ae421c7..eab37f45ed30 100644 --- a/src/common/quantile.cu +++ b/src/common/quantile.cu @@ -14,6 +14,7 @@ #include "../collective/allgather.h" #include "../collective/allreduce.h" +#include "../collective/communicator-inl.h" // for GetWorldSize, GetRank #include "categorical.h" #include "common.h" #include "device_helpers.cuh" diff --git a/src/data/ellpack_page.cuh b/src/data/ellpack_page.cuh index e494afb3e9a4..f11bdfae1d97 100644 --- a/src/data/ellpack_page.cuh +++ b/src/data/ellpack_page.cuh @@ -6,6 +6,8 @@ #include +#include // for numeric_limits + #include "../common/categorical.h" #include "../common/compressed_iterator.h" #include "../common/device_helpers.cuh" @@ -21,22 +23,26 @@ namespace xgboost { * Does not own underlying memory and may be trivially copied into kernels. */ struct EllpackDeviceAccessor { - /*! \brief Whether or not if the matrix is dense. */ + /** @brief Whether or not if the matrix is dense. */ bool is_dense; - /*! \brief Row length for ELLPACK, equal to number of features. */ + /** @brief Row length for ELLPACK, equal to number of features when the data is dense. */ bst_idx_t row_stride; - bst_idx_t base_rowid{0}; - bst_idx_t n_rows{0}; + /** @brief Starting index of the rows. Used for external memory. */ + bst_idx_t base_rowid; + /** @brief Number of rows in this batch. */ + bst_idx_t n_rows; + /** @brief Acessor for the gradient index. */ common::CompressedIterator gidx_iter; - /*! \brief Minimum value for each feature. Size equals to number of features. */ + /** @brief Minimum value for each feature. Size equals to number of features. */ common::Span min_fvalue; - /*! \brief Histogram cut pointers. Size equals to (number of features + 1). */ + /** @brief Histogram cut pointers. Size equals to (number of features + 1). */ common::Span feature_segments; - /*! \brief Histogram cut values. Size equals to (bins per feature * number of features). */ + /** @brief Histogram cut values. Size equals to (bins per feature * number of features). */ common::Span gidx_fvalue_map; - + /** @brief Type of each feature, categorical or numerical. 
*/ common::Span feature_types; + EllpackDeviceAccessor() = delete; EllpackDeviceAccessor(DeviceOrd device, std::shared_ptr cuts, bool is_dense, size_t row_stride, size_t base_rowid, size_t n_rows, common::CompressedIterator gidx_iter, @@ -108,10 +114,10 @@ struct EllpackDeviceAccessor { return idx; } - [[nodiscard]] __device__ bst_float GetFvalue(size_t ridx, size_t fidx) const { + [[nodiscard]] __device__ float GetFvalue(size_t ridx, size_t fidx) const { auto gidx = GetBinIndex(ridx, fidx); if (gidx == -1) { - return nan(""); + return std::numeric_limits::quiet_NaN(); } return gidx_fvalue_map[gidx]; } diff --git a/src/objective/adaptive.cc b/src/objective/adaptive.cc index e7778c464762..3c92013f4266 100644 --- a/src/objective/adaptive.cc +++ b/src/objective/adaptive.cc @@ -3,18 +3,18 @@ */ #include "adaptive.h" -#include // std::transform,std::find_if,std::copy,std::unique -#include // std::isnan -#include // std::size_t -#include // std::distance -#include // std::vector +#include // std::transform,std::find_if,std::copy,std::unique +#include // std::isnan +#include // std::size_t +#include // std::distance +#include // std::vector #include "../common/algorithm.h" // ArgSort -#include "../common/common.h" // AssertGPUSupport #include "../common/numeric.h" // RunLengthEncode #include "../common/stats.h" // Quantile,WeightedQuantile #include "../common/threading_utils.h" // ParallelFor #include "../common/transform_iterator.h" // MakeIndexTransformIter +#include "../tree/sample_position.h" // for SamplePosition #include "xgboost/base.h" // bst_node_t #include "xgboost/context.h" // Context #include "xgboost/data.h" // MetaInfo @@ -23,6 +23,10 @@ #include "xgboost/span.h" // Span #include "xgboost/tree_model.h" // RegTree +#if !defined(XGBOOST_USE_CUDA) +#include "../common/common.h" // AssertGPUSupport +#endif // !defined(XGBOOST_USE_CUDA) + namespace xgboost::obj::detail { void EncodeTreeLeafHost(Context const* ctx, RegTree const& tree, std::vector const& position, std::vector* p_nptr, @@ -37,9 +41,10 @@ void EncodeTreeLeafHost(Context const* ctx, RegTree const& tree, sorted_pos[i] = position[ridx[i]]; } // find the first non-sampled row - size_t begin_pos = - std::distance(sorted_pos.cbegin(), std::find_if(sorted_pos.cbegin(), sorted_pos.cend(), - [](bst_node_t nidx) { return nidx >= 0; })); + size_t begin_pos = std::distance( + sorted_pos.cbegin(), + std::find_if(sorted_pos.cbegin(), sorted_pos.cend(), + [](bst_node_t nidx) { return tree::SamplePosition::IsValid(nidx); })); CHECK_LE(begin_pos, sorted_pos.size()); std::vector leaf; diff --git a/src/objective/adaptive.cu b/src/objective/adaptive.cu index 235e284198f3..7f2a9175d91d 100644 --- a/src/objective/adaptive.cu +++ b/src/objective/adaptive.cu @@ -3,13 +3,14 @@ */ #include -#include // std::int32_t -#include // NOLINT +#include // std::int32_t +#include // NOLINT #include "../collective/aggregator.h" #include "../common/cuda_context.cuh" // CUDAContext #include "../common/device_helpers.cuh" #include "../common/stats.cuh" +#include "../tree/sample_position.h" // for SamplePosition #include "adaptive.h" #include "xgboost/context.h" @@ -30,10 +31,12 @@ void EncodeTreeLeafDevice(Context const* ctx, common::Span pos // sort row index according to node index thrust::stable_sort_by_key(cuctx->TP(), sorted_position.begin(), sorted_position.begin() + n_samples, p_ridx->begin()); - size_t beg_pos = - thrust::find_if(cuctx->CTP(), sorted_position.cbegin(), sorted_position.cend(), - [] XGBOOST_DEVICE(bst_node_t nidx) { return nidx >= 0; }) - 
- sorted_position.cbegin(); + // Find the first one that's not sampled (nidx not been negated). + size_t beg_pos = thrust::find_if(cuctx->CTP(), sorted_position.cbegin(), sorted_position.cend(), + [] XGBOOST_DEVICE(bst_node_t nidx) { + return tree::SamplePosition::IsValid(nidx); + }) - + sorted_position.cbegin(); if (beg_pos == sorted_position.size()) { auto& leaf = p_nidx->HostVector(); tree.WalkTree([&](bst_node_t nidx) { diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu index 5e225a13f142..631f2bd8f27c 100644 --- a/src/tree/gpu_hist/evaluate_splits.cu +++ b/src/tree/gpu_hist/evaluate_splits.cu @@ -1,13 +1,12 @@ /** * Copyright 2020-2024, XGBoost Contributors */ -#include // std::max -#include -#include +#include // for :max +#include // for numeric_limits #include "../../collective/allgather.h" +#include "../../collective/communicator-inl.h" // for GetWorldSize, GetRank #include "../../common/categorical.h" -#include "../../data/ellpack_page.cuh" #include "evaluate_splits.cuh" #include "expand_entry.cuh" diff --git a/src/tree/gpu_hist/row_partitioner.cu b/src/tree/gpu_hist/row_partitioner.cu index 61e42d909073..c768c89dfaea 100644 --- a/src/tree/gpu_hist/row_partitioner.cu +++ b/src/tree/gpu_hist/row_partitioner.cu @@ -15,6 +15,7 @@ void RowPartitioner::Reset(Context const* ctx, bst_idx_t n_samples, bst_idx_t ba ridx_.resize(n_samples); ridx_tmp_.resize(n_samples); tmp_.clear(); + n_nodes_ = 1; // Root CHECK_LE(n_samples, std::numeric_limits::max()); ridx_segments_.emplace_back( diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index a811155d4477..5f8f0a30b31a 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -19,7 +19,9 @@ namespace xgboost::tree { namespace cuda_impl { using RowIndexT = std::uint32_t; -} +// TODO(Rory): Can be larger. To be tuned alongside other batch operations. +static const std::int32_t kMaxUpdatePositionBatchSize = 32; +} // namespace cuda_impl /** * @brief Used to demarcate a contiguous set of row indices associated with some tree @@ -37,8 +39,6 @@ struct Segment { __host__ __device__ bst_idx_t Size() const { return end - begin; } }; -// TODO(Rory): Can be larger. To be tuned alongside other batch operations. 
-static const int kMaxUpdatePositionBatchSize = 32; template struct PerNodeData { Segment segment; @@ -46,10 +46,10 @@ struct PerNodeData { }; template -__device__ __forceinline__ void AssignBatch(BatchIterT batch_info, std::size_t global_thread_idx, - int* batch_idx, std::size_t* item_idx) { +XGBOOST_DEV_INLINE void AssignBatch(BatchIterT batch_info, std::size_t global_thread_idx, + int* batch_idx, std::size_t* item_idx) { cuda_impl::RowIndexT sum = 0; - for (int i = 0; i < kMaxUpdatePositionBatchSize; i++) { + for (int i = 0; i < cuda_impl::kMaxUpdatePositionBatchSize; i++) { if (sum + batch_info[i].segment.Size() > global_thread_idx) { *batch_idx = i; *item_idx = (global_thread_idx - sum) + batch_info[i].segment.begin; @@ -59,10 +59,10 @@ __device__ __forceinline__ void AssignBatch(BatchIterT batch_info, std::size_t g } } -template +template __global__ __launch_bounds__(kBlockSize) void SortPositionCopyKernel( - dh::LDGIterator> batch_info, common::Span d_ridx, - const common::Span ridx_tmp, std::size_t total_rows) { + dh::LDGIterator> batch_info, common::Span d_ridx, + const common::Span ridx_tmp, bst_idx_t total_rows) { for (auto idx : dh::GridStrideRange(0, total_rows)) { int batch_idx; std::size_t item_idx; @@ -92,6 +92,7 @@ struct IndexFlagOp { } }; +// Scatter from `ridx_in` to `ridx_out`. template struct WriteResultsFunctor { dh::LDGIterator> batch_info; @@ -99,10 +100,12 @@ struct WriteResultsFunctor { cuda_impl::RowIndexT* ridx_out; cuda_impl::RowIndexT* counts; - __device__ IndexFlagTuple operator()(const IndexFlagTuple& x) { - std::size_t scatter_address; + __device__ IndexFlagTuple operator()(IndexFlagTuple const& x) { + cuda_impl::RowIndexT scatter_address; + // Get the segment that this row belongs to. const Segment& segment = batch_info[x.batch_idx].segment; if (x.flag) { + // Go left. cuda_impl::RowIndexT num_previous_flagged = x.flag_scan - 1; // -1 because inclusive scan scatter_address = segment.begin + num_previous_flagged; } else { @@ -121,10 +124,14 @@ struct WriteResultsFunctor { } }; -template +/** + * @param d_batch_info Node data, with the size of the input number of nodes. 
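+ *
+ * A rough sketch of the batched partition (a summary, not a full specification): one
+ * inclusive scan over the per-row `go_left` flags tells each row how many left-going
+ * rows precede it within its node segment [begin, end). Left-going rows are then
+ * scattered to the front of the segment in stable order, the remaining rows fill the
+ * segment from the back, and `d_counts` receives the number of rows sent left per node.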
+ */ +template void SortPositionBatch(common::Span> d_batch_info, - common::Span ridx, common::Span ridx_tmp, - common::Span d_counts, std::size_t total_rows, OpT op, + common::Span ridx, + common::Span ridx_tmp, + common::Span d_counts, bst_idx_t total_rows, OpT op, dh::device_vector* tmp) { dh::LDGIterator> batch_info_itr(d_batch_info.data()); WriteResultsFunctor write_results{batch_info_itr, ridx.data(), ridx_tmp.data(), @@ -134,22 +141,23 @@ void SortPositionBatch(common::Span> d_batch_info, thrust::make_transform_output_iterator(dh::TypedDiscard(), write_results); auto counting = thrust::make_counting_iterator(0llu); auto input_iterator = - dh::MakeTransformIterator(counting, [=] __device__(size_t idx) { - int batch_idx; + dh::MakeTransformIterator(counting, [=] __device__(std::size_t idx) { + int nidx_in_batch; std::size_t item_idx; - AssignBatch(batch_info_itr, idx, &batch_idx, &item_idx); - auto op_res = op(ridx[item_idx], batch_idx, batch_info_itr[batch_idx].data); - return IndexFlagTuple{static_cast(item_idx), op_res, batch_idx, op_res}; + AssignBatch(batch_info_itr, idx, &nidx_in_batch, &item_idx); + auto go_left = op(ridx[item_idx], nidx_in_batch, batch_info_itr[nidx_in_batch].data); + return IndexFlagTuple{static_cast(item_idx), go_left, nidx_in_batch, + go_left}; }); - size_t temp_bytes = 0; + std::size_t temp_bytes = 0; if (tmp->empty()) { cub::DeviceScan::InclusiveScan(nullptr, temp_bytes, input_iterator, discard_write_iterator, - IndexFlagOp(), total_rows); + IndexFlagOp{}, total_rows); tmp->resize(temp_bytes); } temp_bytes = tmp->size(); cub::DeviceScan::InclusiveScan(tmp->data().get(), temp_bytes, input_iterator, - discard_write_iterator, IndexFlagOp(), total_rows); + discard_write_iterator, IndexFlagOp{}, total_rows); constexpr int kBlockSize = 256; @@ -157,7 +165,7 @@ void SortPositionBatch(common::Span> d_batch_info, const int kItemsThread = 12; const int grid_size = xgboost::common::DivRoundUp(total_rows, kBlockSize * kItemsThread); - SortPositionCopyKernel + SortPositionCopyKernel <<>>(batch_info_itr, ridx, ridx_tmp, total_rows); } @@ -168,8 +176,8 @@ struct NodePositionInfo { __device__ bool IsLeaf() { return left_child == -1; } }; -__device__ __forceinline__ int GetPositionFromSegments(std::size_t idx, - const NodePositionInfo* d_node_info) { +XGBOOST_DEV_INLINE int GetPositionFromSegments(std::size_t idx, + const NodePositionInfo* d_node_info) { int position = 0; NodePositionInfo node = d_node_info[position]; while (!node.IsLeaf()) { @@ -205,7 +213,6 @@ __global__ __launch_bounds__(kBlockSize) void FinalisePositionKernel( class RowPartitioner { public: using RowIndexT = cuda_impl::RowIndexT; - static constexpr bst_node_t kIgnoredTreePosition = -1; private: /** @@ -232,6 +239,7 @@ class RowPartitioner { dh::device_vector tmp_; dh::PinnedMemory pinned_; dh::PinnedMemory pinned2_; + bst_node_t n_nodes_{0}; // Counter for internal checks. public: /** @@ -255,6 +263,7 @@ class RowPartitioner { * \brief Gets all training rows in the set. 
*/ common::Span GetRows(); + [[nodiscard]] bst_node_t GetNumNodes() const { return n_nodes_; } /** * \brief Convenience method for testing @@ -280,10 +289,14 @@ class RowPartitioner { const std::vector& left_nidx, const std::vector& right_nidx, const std::vector& op_data, UpdatePositionOpT op) { - if (nidx.empty()) return; + if (nidx.empty()) { + return; + } + CHECK_EQ(nidx.size(), left_nidx.size()); CHECK_EQ(nidx.size(), right_nidx.size()); CHECK_EQ(nidx.size(), op_data.size()); + this->n_nodes_ += (left_nidx.size() + right_nidx.size()); auto h_batch_info = pinned2_.GetSpan>(nidx.size()); dh::TemporaryArray> d_batch_info(nidx.size()); @@ -302,9 +315,9 @@ class RowPartitioner { dh::TemporaryArray d_counts(nidx.size(), 0); // Partition the rows according to the operator - SortPositionBatch( - dh::ToSpan(d_batch_info), dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), dh::ToSpan(d_counts), - total_rows, op, &tmp_); + SortPositionBatch(dh::ToSpan(d_batch_info), dh::ToSpan(ridx_), + dh::ToSpan(ridx_tmp_), dh::ToSpan(d_counts), + total_rows, op, &tmp_); dh::safe_cuda(cudaMemcpyAsync(h_counts.data(), d_counts.data().get(), h_counts.size_bytes(), cudaMemcpyDefault)); // TODO(Rory): this synchronisation hurts performance a lot @@ -327,20 +340,16 @@ class RowPartitioner { } /** - * \brief Finalise the position of all training instances after tree construction is + * @brief Finalise the position of all training instances after tree construction is * complete. Does not update any other meta information in this data structure, so * should only be used at the end of training. * - * When the task requires update leaf, this function will copy the node index into - * p_out_position. The index is negated if it's being sampled in current iteration. - * - * \param p_out_position Node index for each row. - * \param op Device lambda. Should provide the row index and current position as an + * @param p_out_position Node index for each row. + * @param op Device lambda. Should provide the row index and current position as an * argument and return the new position for this training instance. - * \param sampled A device lambda to inform the partitioner whether a row is sampled. */ template - void FinalisePosition(common::Span d_out_position, FinalisePositionOpT op) { + void FinalisePosition(common::Span d_out_position, FinalisePositionOpT op) const { dh::TemporaryArray d_node_info_storage(ridx_segments_.size()); dh::safe_cuda(cudaMemcpyAsync(d_node_info_storage.data().get(), ridx_segments_.data(), sizeof(NodePositionInfo) * ridx_segments_.size(), diff --git a/src/tree/param.h b/src/tree/param.h index 5f32a786bc54..fab68f0c298e 100644 --- a/src/tree/param.h +++ b/src/tree/param.h @@ -10,14 +10,11 @@ #include #include #include -#include #include #include -#include "../common/categorical.h" #include "../common/linalg_op.h" #include "../common/math.h" -#include "xgboost/data.h" #include "xgboost/linalg.h" #include "xgboost/parameter.h" diff --git a/src/tree/sample_position.h b/src/tree/sample_position.h new file mode 100644 index 000000000000..a09e224df36b --- /dev/null +++ b/src/tree/sample_position.h @@ -0,0 +1,21 @@ +/** + * Copyright 2024, XGBoost Contributors + */ +#pragma once +#include "xgboost/base.h" // for bst_node_t + +namespace xgboost::tree { +// Utility for maniputing the node index. This is used by the tree methods and the +// adaptive objectives to share the node index. A row is invalid if it's not used in the +// last iteration (due to sampling). 
For these rows, the corresponding tree node index is +// negated. +struct SamplePosition { + [[nodiscard]] bst_node_t static XGBOOST_HOST_DEV_INLINE Encode(bst_node_t nidx, bool is_valid) { + return is_valid ? nidx : ~nidx; + } + [[nodiscard]] bst_node_t static XGBOOST_HOST_DEV_INLINE Decode(bst_node_t nidx) { + return IsValid(nidx) ? nidx : ~nidx; + } + [[nodiscard]] bool static XGBOOST_HOST_DEV_INLINE IsValid(bst_node_t nidx) { return nidx >= 0; } +}; +} // namespace xgboost::tree diff --git a/src/tree/tree_model.cc b/src/tree/tree_model.cc index 45834cc7755e..9b28a08e594b 100644 --- a/src/tree/tree_model.cc +++ b/src/tree/tree_model.cc @@ -8,15 +8,14 @@ #include #include -#include // for array #include #include #include #include #include -#include "../common/categorical.h" -#include "../common/common.h" // for EscapeU8 +#include "../common/categorical.h" // for GetNodeCats +#include "../common/common.h" // for EscapeU8 #include "../predictor/predict_fn.h" #include "io_utils.h" // for GetElem #include "param.h" @@ -1038,9 +1037,8 @@ void RegTree::SaveCategoricalSplit(Json* p_out) const { categories_nodes.GetArray().emplace_back(i); auto begin = categories.Size(); categories_segments.GetArray().emplace_back(begin); - auto segment = split_categories_segments_[i]; - auto node_categories = this->GetSplitCategories().subspan(segment.beg, segment.size); - common::KCatBitField const cat_bits(node_categories); + auto segment = this->split_categories_segments_[i]; + auto cat_bits = common::GetNodeCats(this->GetSplitCategories(), segment); for (size_t i = 0; i < cat_bits.Capacity(); ++i) { if (cat_bits.Check(i)) { categories.GetArray().emplace_back(i); diff --git a/src/tree/updater_colmaker.cc b/src/tree/updater_colmaker.cc index 45018da17adc..f71fd189db03 100644 --- a/src/tree/updater_colmaker.cc +++ b/src/tree/updater_colmaker.cc @@ -10,6 +10,7 @@ #include "../common/error_msg.h" // for NoCategorical #include "../common/random.h" +#include "sample_position.h" // for SamplePosition #include "constraints.h" #include "param.h" #include "split_evaluator.h" @@ -515,7 +516,7 @@ class ColMaker: public TreeUpdater { common::ParallelFor(p_fmat->Info().num_row_, this->ctx_->Threads(), [&](auto ridx) { CHECK_LT(ridx, position_.size()) << "ridx exceed bound " << "ridx=" << ridx << " pos=" << position_.size(); - const int nid = this->DecodePosition(ridx); + const int nid = SamplePosition::Decode(position_[ridx]); if (tree[nid].IsLeaf()) { // mark finish when it is not a fresh leaf if (tree[nid].RightChild() == -1) { @@ -560,14 +561,14 @@ class ColMaker: public TreeUpdater { auto col = page[fid]; common::ParallelFor(col.size(), this->ctx_->Threads(), [&](auto j) { const bst_uint ridx = col[j].index; - const int nid = this->DecodePosition(ridx); + bst_node_t nidx = SamplePosition::Decode(position_[ridx]); const bst_float fvalue = col[j].fvalue; // go back to parent, correct those who are not default - if (!tree[nid].IsLeaf() && tree[nid].SplitIndex() == fid) { - if (fvalue < tree[nid].SplitCond()) { - this->SetEncodePosition(ridx, tree[nid].LeftChild()); + if (!tree[nidx].IsLeaf() && tree[nidx].SplitIndex() == fid) { + if (fvalue < tree[nidx].SplitCond()) { + this->SetEncodePosition(ridx, tree[nidx].LeftChild()); } else { - this->SetEncodePosition(ridx, tree[nid].RightChild()); + this->SetEncodePosition(ridx, tree[nidx].RightChild()); } } }); @@ -576,17 +577,10 @@ class ColMaker: public TreeUpdater { } // utils to get/set position, with encoded format // return decoded position - inline int 
DecodePosition(bst_uint ridx) const { - const int pid = position_[ridx]; - return pid < 0 ? ~pid : pid; - } // encode the encoded position value for ridx - inline void SetEncodePosition(bst_uint ridx, int nid) { - if (position_[ridx] < 0) { - position_[ridx] = ~nid; - } else { - position_[ridx] = nid; - } + void SetEncodePosition(bst_idx_t ridx, bst_node_t nidx) { + bool is_invalid = position_[ridx] < 0; + position_[ridx] = SamplePosition::Encode(nidx, !is_invalid); } // --data fields-- const TrainParam& param_; diff --git a/src/tree/updater_gpu_common.cuh b/src/tree/updater_gpu_common.cuh index 5d999d6d6e01..f4224a30e52a 100644 --- a/src/tree/updater_gpu_common.cuh +++ b/src/tree/updater_gpu_common.cuh @@ -6,8 +6,9 @@ #include // for ostream #include "gpu_hist/histogram.cuh" -#include "param.h" +#include "param.h" // for TrainParam #include "xgboost/base.h" +#include "xgboost/task.h" // for ObjInfo namespace xgboost::tree { struct GPUTrainingParam { @@ -117,6 +118,21 @@ struct DeviceSplitCandidate { } }; +namespace cuda_impl { +inline BatchParam HistBatch(TrainParam const& param) { + return {param.max_bin, TrainParam::DftSparseThreshold()}; +} + +inline BatchParam HistBatch(bst_bin_t max_bin) { + return {max_bin, TrainParam::DftSparseThreshold()}; +} + +inline BatchParam ApproxBatch(TrainParam const& p, common::Span hess, + ObjInfo const& task) { + return BatchParam{p.max_bin, hess, !task.const_hess}; +} +} // namespace cuda_impl + template struct SumCallbackOp { // Running prefix diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 8ff0d61ab25f..573261f9c337 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -34,7 +34,8 @@ #include "gpu_hist/row_partitioner.cuh" #include "hist/param.h" #include "param.h" -#include "updater_gpu_common.cuh" +#include "sample_position.h" // for SamplePosition +#include "updater_gpu_common.cuh" // for HistBatch #include "xgboost/base.h" #include "xgboost/context.h" #include "xgboost/data.h" @@ -43,11 +44,15 @@ #include "xgboost/span.h" #include "xgboost/task.h" // for ObjInfo #include "xgboost/tree_model.h" +#include "xgboost/tree_updater.h" namespace xgboost::tree { DMLC_REGISTRY_FILE_TAG(updater_gpu_hist); -// Manage memory for a single GPU +using cuda_impl::ApproxBatch; +using cuda_impl::HistBatch; + +// GPU tree updater implementation. 
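+//
+// Rough per-tree flow, as driven by `UpdateTree` below (a summary, not a contract):
+//   Reset()             - sample gradients, reset the row partitioner and histograms
+//   per expand set:       UpdatePosition() partitions rows to the new children,
+//                         BuildHistLeftRight() builds/subtracts the child histograms,
+//                         EvaluateSplits() proposes the next candidates
+//   FinalisePosition()  - record the final (encoded) leaf index of every row for the
+//                         prediction cache and the adaptive objectives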
struct GPUHistMakerDevice { private: GPUHistEvaluator evaluator_; @@ -56,20 +61,29 @@ struct GPUHistMakerDevice { MetaInfo const& info_; DeviceHistogramBuilder histogram_; + // node idx for each sample + dh::device_vector positions_; + std::unique_ptr row_partitioner_; + + public: + // Extra data for each node that is passed to the update position function + struct NodeSplitData { + RegTree::Node split_node; + FeatureType split_type; + common::KCatBitField node_cats; + }; + static_assert(std::is_trivially_copyable_v); public: EllpackPageImpl const* page{nullptr}; common::Span feature_types; - std::unique_ptr row_partitioner; DeviceHistogramStorage<> hist{}; dh::device_vector d_gpair; // storage for gpair; common::Span gpair; dh::device_vector monotone_constraints; - // node idx for each sample - dh::device_vector positions; TrainParam param; @@ -143,10 +157,10 @@ struct GPUHistMakerDevice { quantiser = std::make_unique(ctx_, this->gpair, dmat->Info()); - if (!row_partitioner) { - row_partitioner = std::make_unique(); + if (!row_partitioner_) { + row_partitioner_ = std::make_unique(); } - row_partitioner->Reset(ctx_, sample.sample_rows, page->base_rowid); + row_partitioner_->Reset(ctx_, sample.sample_rows, page->base_rowid); CHECK_EQ(page->base_rowid, 0); // Init histogram @@ -182,7 +196,10 @@ struct GPUHistMakerDevice { void EvaluateSplits(const std::vector& candidates, const RegTree& tree, common::Span pinned_candidates_out) { - if (candidates.empty()) return; + if (candidates.empty()) { + return; + } + this->monitor.Start(__func__); dh::TemporaryArray d_node_inputs(2 * candidates.size()); dh::TemporaryArray splits_out(2 * candidates.size()); std::vector nidx(2 * candidates.size()); @@ -234,12 +251,12 @@ struct GPUHistMakerDevice { dh::safe_cuda(cudaMemcpyAsync(pinned_candidates_out.data(), entries.data().get(), sizeof(GPUExpandEntry) * entries.size(), cudaMemcpyDeviceToHost)); - dh::DefaultStream().Sync(); + this->monitor.Stop(__func__); } void BuildHist(int nidx) { auto d_node_hist = hist.GetNodeHistogram(nidx); - auto d_ridx = row_partitioner->GetRows(nidx); + auto d_ridx = row_partitioner_->GetRows(nidx); this->histogram_.BuildHistogram(ctx_->CUDACtx(), page->GetDeviceAccessor(ctx_->Device()), feature_groups->DeviceAccessor(ctx_->Device()), gpair, d_ridx, d_node_hist, *quantiser); @@ -262,14 +279,6 @@ struct GPUHistMakerDevice { return true; } - // Extra data for each node that is passed - // to the update position function - struct NodeSplitData { - RegTree::Node split_node; - FeatureType split_type; - common::KCatBitField node_cats; - }; - void UpdatePositionColumnSplit(EllpackDeviceAccessor d_matrix, std::vector const& split_data, std::vector const& nidx, @@ -321,10 +330,10 @@ struct GPUHistMakerDevice { }; collective::SafeColl(rc); - row_partitioner->UpdatePositionBatch( + row_partitioner_->UpdatePositionBatch( nidx, left_nidx, right_nidx, split_data, - [=] __device__(bst_uint ridx, int split_index, NodeSplitData const& data) { - auto const index = ridx * num_candidates + split_index; + [=] __device__(bst_uint ridx, int nidx_in_batch, NodeSplitData const& data) { + auto const index = ridx * num_candidates + nidx_in_batch; bool go_left; if (missing_bits.Check(index)) { go_left = data.split_node.DefaultLeft(); @@ -335,11 +344,35 @@ struct GPUHistMakerDevice { }); } + struct GoLeftOp { + EllpackDeviceAccessor d_matrix; + + __device__ bool operator()(cuda_impl::RowIndexT ridx, NodeSplitData const& data) const { + RegTree::Node const& node = data.split_node; + // given a row index, 
returns the node id it belongs to + float cut_value = d_matrix.GetFvalue(ridx, node.SplitIndex()); + // Missing value + bool go_left = true; + if (isnan(cut_value)) { + go_left = node.DefaultLeft(); + } else { + if (data.split_type == FeatureType::kCategorical) { + go_left = common::Decision(data.node_cats.Bits(), cut_value); + } else { + go_left = cut_value <= node.SplitCond(); + } + } + return go_left; + } + }; + void UpdatePosition(std::vector const& candidates, RegTree* p_tree) { if (candidates.empty()) { return; } + monitor.Start(__func__); + std::vector nidx(candidates.size()); std::vector left_nidx(candidates.size()); std::vector right_nidx(candidates.size()); @@ -347,12 +380,12 @@ struct GPUHistMakerDevice { for (size_t i = 0; i < candidates.size(); i++) { auto const& e = candidates[i]; - RegTree::Node split_node = (*p_tree)[e.nid]; + RegTree::Node const& split_node = (*p_tree)[e.nid]; auto split_type = p_tree->NodeSplitType(e.nid); - nidx.at(i) = e.nid; - left_nidx.at(i) = split_node.LeftChild(); - right_nidx.at(i) = split_node.RightChild(); - split_data.at(i) = NodeSplitData{split_node, split_type, evaluator_.GetDeviceNodeCats(e.nid)}; + nidx[i] = e.nid; + left_nidx[i] = split_node.LeftChild(); + right_nidx[i] = split_node.RightChild(); + split_data[i] = NodeSplitData{split_node, split_type, evaluator_.GetDeviceNodeCats(e.nid)}; CHECK_EQ(split_type == FeatureType::kCategorical, e.split.is_cat); } @@ -361,27 +394,15 @@ struct GPUHistMakerDevice { if (info_.IsColumnSplit()) { UpdatePositionColumnSplit(d_matrix, split_data, nidx, left_nidx, right_nidx); + monitor.Stop(__func__); return; } - - row_partitioner->UpdatePositionBatch( + auto go_left = GoLeftOp{d_matrix}; + row_partitioner_->UpdatePositionBatch( nidx, left_nidx, right_nidx, split_data, - [=] __device__(bst_uint ridx, int split_index, const NodeSplitData& data) { - // given a row index, returns the node id it belongs to - float cut_value = d_matrix.GetFvalue(ridx, data.split_node.SplitIndex()); - // Missing value - bool go_left = true; - if (isnan(cut_value)) { - go_left = data.split_node.DefaultLeft(); - } else { - if (data.split_type == FeatureType::kCategorical) { - go_left = common::Decision(data.node_cats.Bits(), cut_value); - } else { - go_left = cut_value <= data.split_node.SplitCond(); - } - } - return go_left; - }); + [=] __device__(cuda_impl::RowIndexT ridx, int /*nidx_in_batch*/, + const NodeSplitData& data) { return go_left(ridx, data); }); + monitor.Stop(__func__); } // After tree update is finished, update the position of all training @@ -389,101 +410,70 @@ struct GPUHistMakerDevice { // prediction cache void FinalisePosition(RegTree const* p_tree, DMatrix* p_fmat, ObjInfo task, HostDeviceVector* p_out_position) { - // Prediction cache will not be used with external memory - if (!p_fmat->SingleColBlock()) { - if (task.UpdateTreeLeaf()) { - LOG(FATAL) << "Current objective function can not be used with external memory."; - } + if (!p_fmat->SingleColBlock() && task.UpdateTreeLeaf()) { + LOG(FATAL) << "Current objective function can not be used with external memory."; + } + if (p_fmat->Info().num_row_ != row_partitioner_->GetRows().size()) { + // Subsampling with external memory. Not supported. 
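+      // Leaving `positions_` empty makes UpdatePredictionCache() below return false, so
+      // the prediction cache is simply skipped in this configuration.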
p_out_position->Resize(0); - positions.clear(); + positions_.clear(); return; } - dh::TemporaryArray d_nodes(p_tree->GetNodes().size()); - dh::safe_cuda(cudaMemcpyAsync(d_nodes.data().get(), p_tree->GetNodes().data(), - d_nodes.size() * sizeof(RegTree::Node), - cudaMemcpyHostToDevice)); - auto const& h_split_types = p_tree->GetSplitTypes(); - auto const& categories = p_tree->GetSplitCategories(); - auto const& categories_segments = p_tree->GetSplitCategoriesPtr(); + p_out_position->SetDevice(ctx_->Device()); + p_out_position->Resize(row_partitioner_->GetRows().size()); + auto d_out_position = p_out_position->DeviceSpan(); - dh::caching_device_vector d_split_types; - dh::caching_device_vector d_categories; - dh::caching_device_vector d_categories_segments; + auto d_gpair = this->gpair; + auto encode_op = [=] __device__(bst_idx_t row_id, bst_node_t nidx) { + bool is_invalid = d_gpair[row_id].GetHess() - .0f == 0.f; + return SamplePosition::Encode(nidx, !is_invalid); + }; // NOLINT - if (!categories.empty()) { - dh::CopyToD(h_split_types, &d_split_types); - dh::CopyToD(categories, &d_categories); - dh::CopyToD(categories_segments, &d_categories_segments); + if (!p_fmat->SingleColBlock()) { + CHECK_EQ(row_partitioner_->GetNumNodes(), p_tree->NumNodes()); + row_partitioner_->FinalisePosition(d_out_position, encode_op); + dh::CopyTo(d_out_position, &positions_); + return; } - FinalisePositionInPage(page, dh::ToSpan(d_nodes), dh::ToSpan(d_split_types), - dh::ToSpan(d_categories), dh::ToSpan(d_categories_segments), - p_out_position); - } + dh::caching_device_vector categories; + dh::CopyToD(p_tree->GetSplitCategories(), &categories); + auto const& cat_segments = p_tree->GetSplitCategoriesPtr(); + auto d_categories = dh::ToSpan(categories); - void FinalisePositionInPage( - EllpackPageImpl const* page, const common::Span d_nodes, - common::Span d_feature_types, common::Span categories, - common::Span categories_segments, - HostDeviceVector* p_out_position) { auto d_matrix = page->GetDeviceAccessor(ctx_->Device()); - auto d_gpair = this->gpair; - p_out_position->SetDevice(ctx_->Device()); - p_out_position->Resize(row_partitioner->GetRows().size()); - - auto new_position_op = [=] __device__(size_t row_id, int position) { - // What happens if user prune the tree? 
- if (!d_matrix.IsInRange(row_id)) { - return RowPartitioner::kIgnoredTreePosition; - } - auto node = d_nodes[position]; - while (!node.IsLeaf()) { - bst_float element = d_matrix.GetFvalue(row_id, node.SplitIndex()); - // Missing value - if (isnan(element)) { - position = node.DefaultChild(); - } else { - bool go_left = true; - if (common::IsCat(d_feature_types, position)) { - auto node_cats = categories.subspan(categories_segments[position].beg, - categories_segments[position].size); - go_left = common::Decision(node_cats, element); - } else { - go_left = element <= node.SplitCond(); - } - if (go_left) { - position = node.LeftChild(); - } else { - position = node.RightChild(); - } - } - - node = d_nodes[position]; - } - - return position; - }; // NOLINT + std::vector split_data(p_tree->NumNodes()); + auto const& tree = *p_tree; + for (std::size_t i = 0, n = split_data.size(); i < n; ++i) { + RegTree::Node split_node = tree[i]; + auto split_type = p_tree->NodeSplitType(i); + auto node_cats = common::GetNodeCats(d_categories, cat_segments[i]); + split_data[i] = NodeSplitData{std::move(split_node), split_type, node_cats}; + } - auto d_out_position = p_out_position->DeviceSpan(); - row_partitioner->FinalisePosition(d_out_position, new_position_op); - - auto s_position = p_out_position->ConstDeviceSpan(); - positions.resize(s_position.size()); - dh::safe_cuda(cudaMemcpyAsync(positions.data().get(), s_position.data(), - s_position.size_bytes(), cudaMemcpyDeviceToDevice, - ctx_->CUDACtx()->Stream())); - - dh::LaunchN(row_partitioner->GetRows().size(), [=] __device__(size_t idx) { - bst_node_t position = d_out_position[idx]; - bool is_row_sampled = d_gpair[idx].GetHess() - .0f == 0.f; - d_out_position[idx] = is_row_sampled ? ~position : position; - }); + auto go_left_op = GoLeftOp{d_matrix}; + dh::caching_device_vector d_split_data; + dh::CopyToD(split_data, &d_split_data); + auto s_split_data = dh::ToSpan(d_split_data); + + row_partitioner_->FinalisePosition(d_out_position, + [=] __device__(bst_idx_t row_id, bst_node_t nidx) { + auto split_data = s_split_data[nidx]; + auto node = split_data.split_node; + while (!node.IsLeaf()) { + auto go_left = go_left_op(row_id, split_data); + nidx = go_left ? node.LeftChild() : node.RightChild(); + node = s_split_data[nidx].split_node; + } + return encode_op(row_id, nidx); + }); + dh::CopyTo(d_out_position, &positions_); } bool UpdatePredictionCache(linalg::MatrixView out_preds_d, RegTree const* p_tree) { - if (positions.empty()) { + if (positions_.empty()) { return false; } @@ -491,20 +481,19 @@ struct GPUHistMakerDevice { CHECK(out_preds_d.Device().IsCUDA()); CHECK_EQ(out_preds_d.Device().ordinal, ctx_->Ordinal()); - dh::safe_cuda(cudaSetDevice(ctx_->Ordinal())); - auto d_position = dh::ToSpan(positions); + auto d_position = dh::ToSpan(positions_); CHECK_EQ(out_preds_d.Size(), d_position.size()); - auto const& h_nodes = p_tree->GetNodes(); - dh::caching_device_vector nodes(h_nodes.size()); - dh::safe_cuda(cudaMemcpyAsync(nodes.data().get(), h_nodes.data(), - h_nodes.size() * sizeof(RegTree::Node), cudaMemcpyHostToDevice, - ctx_->CUDACtx()->Stream())); - auto d_nodes = dh::ToSpan(nodes); + // Use the nodes from tree, the leaf value might be changed by the objective since the + // last update tree call. 
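+    // Positions are stored encoded by SamplePosition::Encode: a row that was not sampled
+    // in this iteration carries the bitwise complement of its leaf index (e.g. ~3 == -4),
+    // so the kernel below decodes the value before looking up the leaf weight.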
+ dh::caching_device_vector nodes; + dh::CopyTo(p_tree->GetNodes(), &nodes); + common::Span d_nodes = dh::ToSpan(nodes); CHECK_EQ(out_preds_d.Shape(1), 1); dh::LaunchN(d_position.size(), ctx_->CUDACtx()->Stream(), [=] XGBOOST_DEVICE(std::size_t idx) mutable { bst_node_t nidx = d_position[idx]; + nidx = SamplePosition::Decode(nidx); auto weight = d_nodes[nidx].LeafValue(); out_preds_d(idx, 0) += weight; }); @@ -512,7 +501,7 @@ struct GPUHistMakerDevice { } // num histograms is the number of contiguous histograms in memory to reduce over - void AllReduceHist(int nidx, int num_histograms) { + void AllReduceHist(bst_node_t nidx, int num_histograms) { monitor.Start("AllReduce"); auto d_node_hist = hist.GetNodeHistogram(nidx).data(); using ReduceT = typename std::remove_pointer::type::ValueT; @@ -529,7 +518,10 @@ struct GPUHistMakerDevice { * \brief Build GPU local histograms for the left and right child of some parent node */ void BuildHistLeftRight(std::vector const& candidates, const RegTree& tree) { - if (candidates.empty()) return; + if (candidates.empty()) { + return; + } + this->monitor.Start(__func__); // Some nodes we will manually compute histograms // others we will do by subtraction std::vector hist_nidx; @@ -572,14 +564,15 @@ struct GPUHistMakerDevice { this->AllReduceHist(subtraction_trick_nidx, 1); } } + this->monitor.Stop(__func__); } void ApplySplit(const GPUExpandEntry& candidate, RegTree* p_tree) { RegTree& tree = *p_tree; // Sanity check - have we created a leaf with no training instances? - if (!collective::IsDistributed() && row_partitioner) { - CHECK(row_partitioner->GetRows(candidate.nid).size() > 0) + if (!collective::IsDistributed() && row_partitioner_) { + CHECK(row_partitioner_->GetRows(candidate.nid).size() > 0) << "No training instances in this leaf!"; } @@ -659,6 +652,8 @@ struct GPUHistMakerDevice { void UpdateTree(HostDeviceVector* gpair_all, DMatrix* p_fmat, ObjInfo const* task, RegTree* p_tree, HostDeviceVector* p_out_position) { + bool const is_single_block = p_fmat->SingleColBlock(); + auto& tree = *p_tree; // Process maximum 32 nodes at a time Driver driver(param, 32); @@ -684,30 +679,29 @@ struct GPUHistMakerDevice { [&](const auto& e) { return driver.IsChildValid(e); }); auto new_candidates = - pinned.GetSpan(filtered_expand_set.size() * 2, GPUExpandEntry()); - - monitor.Start("UpdatePosition"); - // Update position is only run when child is valid, instead of right after apply - // split (as in approx tree method). Hense we have the finalise position call - // in GPU Hist. - this->UpdatePosition(filtered_expand_set, p_tree); - monitor.Stop("UpdatePosition"); + pinned.GetSpan(filtered_expand_set.size() * 2, GPUExpandEntry{}); + // Update all the nodes if working with external memory, this saves us from working + // with the finalize position call, which adds an additional iteration and requires + // special handling for row index. + this->UpdatePosition(is_single_block ? filtered_expand_set : expand_set, p_tree); - monitor.Start("BuildHist"); this->BuildHistLeftRight(filtered_expand_set, tree); - monitor.Stop("BuildHist"); - monitor.Start("EvaluateSplits"); this->EvaluateSplits(filtered_expand_set, *p_tree, new_candidates); - monitor.Stop("EvaluateSplits"); dh::DefaultStream().Sync(); + driver.Push(new_candidates.begin(), new_candidates.end()); expand_set = driver.Pop(); } - - monitor.Start("FinalisePosition"); + // Row partitioner can have lesser nodes than the tree since we skip some leaf + // nodes. These nodes are handled in the `FinalisePosition` call. 
However, a leaf can + // be spliable before evaluation but invalid after evaluation as we have more + // restrictions like min loss change after evalaution. Therefore, the check condition + // is greater than or equal to. + if (is_single_block) { + CHECK_GE(p_tree->NumNodes(), this->row_partitioner_->GetNumNodes()); + } this->FinalisePosition(p_tree, p_fmat, *task, p_out_position); - monitor.Stop("FinalisePosition"); } }; @@ -767,12 +761,11 @@ class GPUHistMaker : public TreeUpdater { SafeColl(rc); this->column_sampler_ = std::make_shared(column_sampling_seed); - auto batch_param = BatchParam{param->max_bin, TrainParam::DftSparseThreshold()}; dh::safe_cuda(cudaSetDevice(ctx_->Ordinal())); info_->feature_types.SetDevice(ctx_->Device()); maker = std::make_unique( ctx_, !dmat->SingleColBlock(), info_->feature_types.ConstDeviceSpan(), info_->num_row_, - *param, column_sampler_, info_->num_col_, batch_param, dmat->Info()); + *param, column_sampler_, info_->num_col_, HistBatch(*param), dmat->Info()); p_last_fmat_ = dmat; initialised_ = true; @@ -798,14 +791,13 @@ class GPUHistMaker : public TreeUpdater { maker->UpdateTree(gpair, p_fmat, task_, p_tree, p_out_position); } - bool UpdatePredictionCache(const DMatrix* data, - linalg::MatrixView p_out_preds) override { + bool UpdatePredictionCache(const DMatrix* data, linalg::MatrixView p_out_preds) override { if (maker == nullptr || p_last_fmat_ == nullptr || p_last_fmat_ != data) { return false; } - monitor_.Start("UpdatePredictionCache"); + monitor_.Start(__func__); bool result = maker->UpdatePredictionCache(p_out_preds, p_last_tree_); - monitor_.Stop("UpdatePredictionCache"); + monitor_.Stop(__func__); return result; } @@ -881,10 +873,9 @@ class GPUGlobalApproxMaker : public TreeUpdater { auto const& info = p_fmat->Info(); info.feature_types.SetDevice(ctx_->Device()); - auto batch = BatchParam{param->max_bin, hess, !task_->const_hess}; maker_ = std::make_unique( ctx_, !p_fmat->SingleColBlock(), info.feature_types.ConstDeviceSpan(), info.num_row_, - *param, column_sampler_, info.num_col_, batch, p_fmat->Info()); + *param, column_sampler_, info.num_col_, ApproxBatch(*param, hess, *task_), p_fmat->Info()); std::size_t t_idx{0}; for (xgboost::RegTree* tree : trees) { @@ -927,14 +918,13 @@ class GPUGlobalApproxMaker : public TreeUpdater { maker_->UpdateTree(gpair, p_fmat, task_, p_tree, p_out_position); } - bool UpdatePredictionCache(const DMatrix* data, - linalg::MatrixView p_out_preds) override { + bool UpdatePredictionCache(const DMatrix* data, linalg::MatrixView p_out_preds) override { if (maker_ == nullptr || p_last_fmat_ == nullptr || p_last_fmat_ != data) { return false; } - monitor_.Start("UpdatePredictionCache"); + monitor_.Start(__func__); bool result = maker_->UpdatePredictionCache(p_out_preds, p_last_tree_); - monitor_.Stop("UpdatePredictionCache"); + monitor_.Stop(__func__); return result; } diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index f891d73f502f..86080a797beb 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -67,9 +67,9 @@ void TestSortPositionBatch(const std::vector& ridx_in, const std::vector), cudaMemcpyDefault, nullptr)); dh::device_vector tmp; - SortPositionBatch(dh::ToSpan(d_batch_info), dh::ToSpan(ridx), - dh::ToSpan(ridx_tmp), dh::ToSpan(counts), - total_rows, op, &tmp); + SortPositionBatch(dh::ToSpan(d_batch_info), dh::ToSpan(ridx), + dh::ToSpan(ridx_tmp), dh::ToSpan(counts), total_rows, 
op, + &tmp); auto op_without_data = [=] __device__(auto ridx) { return ridx % 2 == 0; }; for (size_t i = 0; i < segments.size(); i++) { diff --git a/tests/cpp/tree/hist/test_expand_entry.cc b/tests/cpp/tree/hist/test_expand_entry.cc index c47615688d06..23da825205bd 100644 --- a/tests/cpp/tree/hist/test_expand_entry.cc +++ b/tests/cpp/tree/hist/test_expand_entry.cc @@ -1,10 +1,11 @@ /** - * Copyright 2023, XGBoost Contributors + * Copyright 2023-2024, XGBoost Contributors */ #include #include // for Json #include // for RegTree +#include "../../../../src/common/categorical.h" // for CatBitField #include "../../../../src/tree/hist/expand_entry.h" namespace xgboost::tree { diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu index 5d1f435de533..570ebe76c3da 100644 --- a/tests/cpp/tree/test_gpu_hist.cu +++ b/tests/cpp/tree/test_gpu_hist.cu @@ -5,7 +5,7 @@ #include // for Args #include // for Context #include // for HostDeviceVector -#include // for Jons +#include // for Json #include // for ObjInfo #include // for RegTree #include // for TreeUpdater @@ -14,32 +14,17 @@ #include // for string #include // for vector -#include "../../../src/common/random.h" // for GlobalRandom -#include "../../../src/data/ellpack_page.h" // for EllpackPage -#include "../../../src/tree/param.h" // for TrainParam -#include "../collective/test_worker.h" // for BaseMGPUTest -#include "../filesystem.h" // dmlc::TemporaryDirectory +#include "../../../src/common/random.h" // for GlobalRandom +#include "../../../src/tree/param.h" // for TrainParam +#include "../collective/test_worker.h" // for BaseMGPUTest +#include "../filesystem.h" // dmlc::TemporaryDirectory #include "../helpers.h" namespace xgboost::tree { -void UpdateTree(Context const* ctx, linalg::Matrix* gpair, DMatrix* dmat, - size_t gpu_page_size, RegTree* tree, HostDeviceVector* preds, - float subsample = 1.0f, const std::string& sampling_method = "uniform", - int max_bin = 2) { - if (gpu_page_size > 0) { - // Loop over the batches and count the records - int64_t batch_count = 0; - int64_t row_count = 0; - for (const auto& batch : dmat->GetBatches( - ctx, BatchParam{max_bin, TrainParam::DftSparseThreshold()})) { - EXPECT_LT(batch.Size(), dmat->Info().num_row_); - batch_count++; - row_count += batch.Size(); - } - EXPECT_GE(batch_count, 2); - EXPECT_EQ(row_count, dmat->Info().num_row_); - } - +namespace { +void UpdateTree(Context const* ctx, linalg::Matrix* gpair, DMatrix* dmat, bool is_ext, + RegTree* tree, HostDeviceVector* preds, float subsample, + const std::string& sampling_method, bst_bin_t max_bin) { Args args{ {"max_depth", "2"}, {"max_bin", std::to_string(max_bin)}, @@ -60,8 +45,13 @@ void UpdateTree(Context const* ctx, linalg::Matrix* gpair, DMatrix hist_maker->Update(¶m, gpair, dmat, common::Span>{position}, {tree}); auto cache = linalg::MakeTensorView(ctx, preds->DeviceSpan(), preds->Size(), 1); - hist_maker->UpdatePredictionCache(dmat, cache); + if (subsample < 1.0 && is_ext) { + ASSERT_FALSE(hist_maker->UpdatePredictionCache(dmat, cache)); + } else { + ASSERT_TRUE(hist_maker->UpdatePredictionCache(dmat, cache)); + } } +} // anonymous namespace TEST(GpuHist, UniformSampling) { constexpr size_t kRows = 4096; @@ -79,11 +69,11 @@ TEST(GpuHist, UniformSampling) { RegTree tree; HostDeviceVector preds(kRows, 0.0, DeviceOrd::CUDA(0)); Context ctx(MakeCUDACtx(0)); - UpdateTree(&ctx, &gpair, dmat.get(), 0, &tree, &preds, 1.0, "uniform", kRows); + UpdateTree(&ctx, &gpair, dmat.get(), false, &tree, &preds, 1.0, "uniform", kRows); // 
Build another tree using sampling. RegTree tree_sampling; HostDeviceVector preds_sampling(kRows, 0.0, DeviceOrd::CUDA(0)); - UpdateTree(&ctx, &gpair, dmat.get(), 0, &tree_sampling, &preds_sampling, kSubsample, "uniform", + UpdateTree(&ctx, &gpair, dmat.get(), false, &tree_sampling, &preds_sampling, kSubsample, "uniform", kRows); // Make sure the predictions are the same. @@ -110,12 +100,12 @@ TEST(GpuHist, GradientBasedSampling) { RegTree tree; HostDeviceVector preds(kRows, 0.0, DeviceOrd::CUDA(0)); Context ctx(MakeCUDACtx(0)); - UpdateTree(&ctx, &gpair, dmat.get(), 0, &tree, &preds, 1.0, "uniform", kRows); + UpdateTree(&ctx, &gpair, dmat.get(), false, &tree, &preds, 1.0, "uniform", kRows); // Build another tree using sampling. RegTree tree_sampling; HostDeviceVector preds_sampling(kRows, 0.0, DeviceOrd::CUDA(0)); - UpdateTree(&ctx, &gpair, dmat.get(), 0, &tree_sampling, &preds_sampling, kSubsample, + UpdateTree(&ctx, &gpair, dmat.get(), false, &tree_sampling, &preds_sampling, kSubsample, "gradient_based", kRows); // Make sure the predictions are the same. @@ -147,11 +137,11 @@ TEST(GpuHist, ExternalMemory) { // Build a tree using the in-memory DMatrix. RegTree tree; HostDeviceVector preds(kRows, 0.0, DeviceOrd::CUDA(0)); - UpdateTree(&ctx, &gpair, dmat.get(), 0, &tree, &preds, 1.0, "uniform", kRows); + UpdateTree(&ctx, &gpair, dmat.get(), false, &tree, &preds, 1.0, "uniform", kRows); // Build another tree using multiple ELLPACK pages. RegTree tree_ext; HostDeviceVector preds_ext(kRows, 0.0, DeviceOrd::CUDA(0)); - UpdateTree(&ctx, &gpair, dmat_ext.get(), kPageSize, &tree_ext, &preds_ext, 1.0, "uniform", kRows); + UpdateTree(&ctx, &gpair, dmat_ext.get(), true, &tree_ext, &preds_ext, 1.0, "uniform", kRows); // Make sure the predictions are the same. auto preds_h = preds.ConstHostVector(); @@ -162,23 +152,26 @@ TEST(GpuHist, ExternalMemory) { } TEST(GpuHist, ExternalMemoryWithSampling) { - constexpr size_t kRows = 4096; - constexpr size_t kCols = 2; - constexpr size_t kPageSize = 1024; + constexpr size_t kRows = 4096, kCols = 2; constexpr float kSubsample = 0.5; const std::string kSamplingMethod = "gradient_based"; common::GlobalRandom().seed(0); dmlc::TemporaryDirectory tmpdir; + Context ctx(MakeCUDACtx(0)); // Create a single batch DMatrix. - std::unique_ptr dmat(CreateSparsePageDMatrix(kRows, kCols, 1, tmpdir.path + "/cache")); + auto p_fmat = RandomDataGenerator{kRows, kCols, 0.0f} + .Device(ctx.Device()) + .Batches(1) + .GenerateSparsePageDMatrix("temp", true); // Create a DMatrix with multiple batches. - std::unique_ptr dmat_ext( - CreateSparsePageDMatrix(kRows, kCols, kRows / kPageSize, tmpdir.path + "/cache")); + auto p_fmat_ext = RandomDataGenerator{kRows, kCols, 0.0f} + .Device(ctx.Device()) + .Batches(4) + .GenerateSparsePageDMatrix("temp", true); - Context ctx(MakeCUDACtx(0)); linalg::Matrix gpair({kRows}, ctx.Device()); gpair.Data()->Copy(GenerateRandomGradients(kRows)); @@ -187,13 +180,13 @@ TEST(GpuHist, ExternalMemoryWithSampling) { RegTree tree; HostDeviceVector preds(kRows, 0.0, DeviceOrd::CUDA(0)); - UpdateTree(&ctx, &gpair, dmat.get(), 0, &tree, &preds, kSubsample, kSamplingMethod, kRows); + UpdateTree(&ctx, &gpair, p_fmat.get(), true, &tree, &preds, kSubsample, kSamplingMethod, kRows); // Build another tree using multiple ELLPACK pages. 
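  // Reset the global RNG so the external-memory run draws the same subsample as the
  // in-memory run above; otherwise the two sets of predictions would not be comparable.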
common::GlobalRandom() = rng; RegTree tree_ext; HostDeviceVector preds_ext(kRows, 0.0, DeviceOrd::CUDA(0)); - UpdateTree(&ctx, &gpair, dmat_ext.get(), kPageSize, &tree_ext, &preds_ext, kSubsample, + UpdateTree(&ctx, &gpair, p_fmat_ext.get(), true, &tree_ext, &preds_ext, kSubsample, kSamplingMethod, kRows); // Make sure the predictions are the same. From 2258bc870d2f73c78b77da5b105c77c6a8692755 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Fri, 16 Aug 2024 23:30:04 +0800 Subject: [PATCH 02/19] Add more tests and doc for QDM. (#10692) --- python-package/xgboost/core.py | 14 ++++++++ .../xgboost/testing/quantile_dmatrix.py | 35 +++++++++++++++++++ python-package/xgboost/testing/updater.py | 4 +-- .../test_device_quantile_dmatrix.py | 4 +++ tests/python-gpu/test_gpu_updaters.py | 2 +- tests/python/test_quantile_dmatrix.py | 4 +++ tests/python/test_updaters.py | 2 +- 7 files changed, 61 insertions(+), 4 deletions(-) create mode 100644 python-package/xgboost/testing/quantile_dmatrix.py diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 87465b8fb790..b65154cadc78 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -1522,6 +1522,20 @@ class QuantileDMatrix(DMatrix): .. versionadded:: 1.7.0 + Examples + -------- + + .. code-block:: + + from sklearn.datasets import make_regression + from sklearn.model_selection import train_test_split + + X, y = make_regression() + X_train, X_test, y_train, y_test = train_test_split(X, y) + Xy_train = xgb.QuantileDMatrix(X_train, y_train) + # It's necessary to have the training DMatrix as a reference for valid quantiles. + Xy_test = xgb.QuantileDMatrix(X_test, y_test, ref=Xy_train) + Parameters ---------- max_bin : diff --git a/python-package/xgboost/testing/quantile_dmatrix.py b/python-package/xgboost/testing/quantile_dmatrix.py new file mode 100644 index 000000000000..b06cb550198f --- /dev/null +++ b/python-package/xgboost/testing/quantile_dmatrix.py @@ -0,0 +1,35 @@ +"""QuantileDMatrix related tests.""" + +import numpy as np +from sklearn.model_selection import train_test_split + +import xgboost as xgb + +from .data import make_batches + + +def check_ref_quantile_cut(device: str) -> None: + """Check obtaining the same cut values given a reference.""" + X, y, _ = ( + data[0] + for data in make_batches( + n_samples_per_batch=8192, + n_features=16, + n_batches=1, + use_cupy=device.startswith("cuda"), + ) + ) + + X_train, X_valid, y_train, y_valid = train_test_split(X, y) + Xy_train = xgb.QuantileDMatrix(X_train, y_train) + Xy_valid = xgb.QuantileDMatrix(X_valid, y_valid, ref=Xy_train) + + cut_train = Xy_train.get_quantile_cut() + cut_valid = Xy_valid.get_quantile_cut() + + np.testing.assert_allclose(cut_train[0], cut_valid[0]) + np.testing.assert_allclose(cut_train[1], cut_valid[1]) + + Xy_valid = xgb.QuantileDMatrix(X_valid, y_valid) + cut_valid = Xy_valid.get_quantile_cut() + assert not np.allclose(cut_train[1], cut_valid[1]) diff --git a/python-package/xgboost/testing/updater.py b/python-package/xgboost/testing/updater.py index c6ba8256d682..7e360d42b252 100644 --- a/python-package/xgboost/testing/updater.py +++ b/python-package/xgboost/testing/updater.py @@ -250,10 +250,10 @@ def check_get_quantile_cut_device(tree_method: str, use_cupy: bool) -> None: check_cut(n_entries, indptr, data, X.dtypes) -def check_get_quantile_cut(tree_method: str) -> None: +def check_get_quantile_cut(tree_method: str, device: str) -> None: """Check the quantile cut getter.""" - use_cupy = tree_method == "gpu_hist" + 
use_cupy = device.startswith("cuda") check_get_quantile_cut_device(tree_method, False) if use_cupy: check_get_quantile_cut_device(tree_method, True) diff --git a/tests/python-gpu/test_device_quantile_dmatrix.py b/tests/python-gpu/test_device_quantile_dmatrix.py index acb275233973..d789dfab25e2 100644 --- a/tests/python-gpu/test_device_quantile_dmatrix.py +++ b/tests/python-gpu/test_device_quantile_dmatrix.py @@ -8,6 +8,7 @@ from xgboost import testing as tm from xgboost.testing.data import check_inf from xgboost.testing.data_iter import run_mixed_sparsity +from xgboost.testing.quantile_dmatrix import check_ref_quantile_cut sys.path.append("tests/python") import test_quantile_dmatrix as tqd @@ -142,6 +143,9 @@ def test_interoperability(self, tree_method: str, max_bin: int) -> None: {"tree_method": "approx", "max_bin": max_bin}, Xy, num_boost_round=4 ) + def test_ref_quantile_cut(self) -> None: + check_ref_quantile_cut("cuda") + @pytest.mark.skipif(**tm.no_cupy()) def test_metainfo(self) -> None: import cupy as cp diff --git a/tests/python-gpu/test_gpu_updaters.py b/tests/python-gpu/test_gpu_updaters.py index 587210cf2d6b..91e76a06f263 100644 --- a/tests/python-gpu/test_gpu_updaters.py +++ b/tests/python-gpu/test_gpu_updaters.py @@ -321,4 +321,4 @@ def test_issue8824(self): @pytest.mark.skipif(**tm.no_cudf()) def test_get_quantile_cut(self) -> None: - check_get_quantile_cut("gpu_hist") + check_get_quantile_cut("hist", "cuda") diff --git a/tests/python/test_quantile_dmatrix.py b/tests/python/test_quantile_dmatrix.py index 2d9c15c8502f..7d06d8608cae 100644 --- a/tests/python/test_quantile_dmatrix.py +++ b/tests/python/test_quantile_dmatrix.py @@ -17,6 +17,7 @@ ) from xgboost.testing.data import check_inf, np_dtypes from xgboost.testing.data_iter import run_mixed_sparsity +from xgboost.testing.quantile_dmatrix import check_ref_quantile_cut class TestQuantileDMatrix: @@ -266,6 +267,9 @@ def run_ref_dmatrix(self, rng: Any, tree_method: str, enable_cat: bool) -> None: dm_results["dvalid"]["rmse"], qdm_results["valid"]["rmse"] ) + def test_ref_quantile_cut(self) -> None: + check_ref_quantile_cut("cpu") + def test_ref_dmatrix(self) -> None: rng = np.random.RandomState(1994) self.run_ref_dmatrix(rng, "hist", True) diff --git a/tests/python/test_updaters.py b/tests/python/test_updaters.py index 8ec1fdd9d395..f4de8896866b 100644 --- a/tests/python/test_updaters.py +++ b/tests/python/test_updaters.py @@ -412,4 +412,4 @@ def test_quantile_loss(self, weighted: bool) -> None: @pytest.mark.skipif(**tm.no_pandas()) @pytest.mark.parametrize("tree_method", ["hist"]) def test_get_quantile_cut(self, tree_method: str) -> None: - check_get_quantile_cut(tree_method) + check_get_quantile_cut(tree_method, "cpu") From abe65e376947a6e23bc4f8e17dc98d3ff5f122ee Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sat, 17 Aug 2024 01:00:32 +0800 Subject: [PATCH 03/19] Reduce thread contention in column split histogram test. 
(#10708) --- tests/cpp/tree/hist/test_histogram.cc | 32 +++++++++++++++------------ 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/tests/cpp/tree/hist/test_histogram.cc b/tests/cpp/tree/hist/test_histogram.cc index 88c8d1cf6e64..11bdbd859b1c 100644 --- a/tests/cpp/tree/hist/test_histogram.cc +++ b/tests/cpp/tree/hist/test_histogram.cc @@ -222,10 +222,9 @@ TEST(CPUHistogram, SyncHist) { TestSyncHist(false); } -void TestBuildHistogram(bool is_distributed, bool force_read_by_column, bool is_col_split) { +void TestBuildHistogram(Context const* ctx, bool is_distributed, bool force_read_by_column, bool is_col_split) { size_t constexpr kNRows = 8, kNCols = 16; int32_t constexpr kMaxBins = 4; - Context ctx; auto p_fmat = RandomDataGenerator(kNRows, kNCols, 0.8).Seed(3).GenerateDMatrix(); if (is_col_split) { @@ -233,7 +232,7 @@ void TestBuildHistogram(bool is_distributed, bool force_read_by_column, bool is_ p_fmat->SliceCol(collective::GetWorldSize(), collective::GetRank())}; } auto const &gmat = - *(p_fmat->GetBatches(&ctx, BatchParam{kMaxBins, 0.5}).begin()); + *(p_fmat->GetBatches(ctx, BatchParam{kMaxBins, 0.5}).begin()); uint32_t total_bins = gmat.cut.Ptrs().back(); static double constexpr kEps = 1e-6; @@ -244,7 +243,7 @@ void TestBuildHistogram(bool is_distributed, bool force_read_by_column, bool is_ bst_node_t nid = 0; HistogramBuilder histogram; HistMakerTrainParam hist_param; - histogram.Reset(&ctx, total_bins, {kMaxBins, 0.5}, is_distributed, is_col_split, &hist_param); + histogram.Reset(ctx, total_bins, {kMaxBins, 0.5}, is_distributed, is_col_split, &hist_param); RegTree tree; @@ -262,11 +261,11 @@ void TestBuildHistogram(bool is_distributed, bool force_read_by_column, bool is_ histogram.AddHistRows(&tree, &nodes_to_build, &dummy_sub, false); common::BlockedSpace2d space{ 1, [&](std::size_t nidx_in_set) { return row_set_collection[nidx_in_set].Size(); }, 256}; - for (auto const &gidx : p_fmat->GetBatches(&ctx, {kMaxBins, 0.5})) { + for (auto const &gidx : p_fmat->GetBatches(ctx, {kMaxBins, 0.5})) { histogram.BuildHist(0, space, gidx, row_set_collection, nodes_to_build, - linalg::MakeTensorView(&ctx, gpair, gpair.size()), force_read_by_column); + linalg::MakeTensorView(ctx, gpair, gpair.size()), force_read_by_column); } - histogram.SyncHistogram(&ctx, &tree, nodes_to_build, {}); + histogram.SyncHistogram(ctx, &tree, nodes_to_build, {}); // Check if number of histogram bins is correct ASSERT_EQ(histogram.Histogram()[nid].size(), gmat.cut.Ptrs().back()); @@ -292,16 +291,21 @@ void TestBuildHistogram(bool is_distributed, bool force_read_by_column, bool is_ } TEST(CPUHistogram, BuildHist) { - TestBuildHistogram(true, false, false); - TestBuildHistogram(false, false, false); - TestBuildHistogram(true, true, false); - TestBuildHistogram(false, true, false); + Context ctx; + TestBuildHistogram(&ctx, true, false, false); + TestBuildHistogram(&ctx, false, false, false); + TestBuildHistogram(&ctx, true, true, false); + TestBuildHistogram(&ctx, false, true, false); } -TEST(CPUHistogram, BuildHistColSplit) { +TEST(CPUHistogram, BuildHistColumnSplit) { auto constexpr kWorkers = 4; - collective::TestDistributedGlobal(kWorkers, [] { TestBuildHistogram(true, true, true); }); - collective::TestDistributedGlobal(kWorkers, [] { TestBuildHistogram(true, false, true); }); + Context ctx; + std::int32_t n_total_threads = std::thread::hardware_concurrency(); + auto n_threads = std::max(n_total_threads / kWorkers, 1); + ctx.UpdateAllowUnknown(Args{{"nthread", std::to_string(n_threads)}}); + 
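+  // Give each of the kWorkers simulated workers its own share of the hardware threads
+  // (at least one), so the concurrent histogram builders do not oversubscribe the CPU;
+  // this is the thread contention the patch subject refers to.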
collective::TestDistributedGlobal(kWorkers, [&] { TestBuildHistogram(&ctx, true, true, true); }); + collective::TestDistributedGlobal(kWorkers, [&] { TestBuildHistogram(&ctx, true, false, true); }); } namespace { From 033a666900a2b58f72a3050e63b02f1681e4f609 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sat, 17 Aug 2024 01:35:47 +0800 Subject: [PATCH 04/19] [EM] Log the page size of ellpack. (#10713) --- src/common/timer.h | 14 +++++--------- src/data/ellpack_page.cu | 11 +---------- src/data/ellpack_page.cuh | 3 +-- src/data/ellpack_page_source.cu | 2 ++ 4 files changed, 9 insertions(+), 21 deletions(-) diff --git a/src/common/timer.h b/src/common/timer.h index 80748e7b8264..3308dfb653dd 100644 --- a/src/common/timer.h +++ b/src/common/timer.h @@ -1,18 +1,15 @@ -/*! - * Copyright by Contributors 2017-2019 +/** + * Copyright 2017-2024, XGBoost Contributors */ #pragma once #include + #include -#include #include #include #include -#include - -namespace xgboost { -namespace common { +namespace xgboost::common { struct Timer { using ClockT = std::chrono::high_resolution_clock; using TimePointT = std::chrono::high_resolution_clock::time_point; @@ -82,5 +79,4 @@ struct Monitor { void Start(const std::string &name); void Stop(const std::string &name); }; -} // namespace common -} // namespace xgboost +} // namespace xgboost::common diff --git a/src/data/ellpack_page.cu b/src/data/ellpack_page.cu index 575bcd5ce9a5..b7ec72ad393c 100644 --- a/src/data/ellpack_page.cu +++ b/src/data/ellpack_page.cu @@ -356,7 +356,6 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag : is_dense{page.IsDense()}, base_rowid{page.base_rowid}, n_rows{page.Size()}, - // This makes a copy of the cut values. cuts_{std::make_shared(page.cut)} { auto it = common::MakeIndexTransformIter( [&](size_t i) { return page.row_ptr[i + 1] - page.row_ptr[i]; }); @@ -540,15 +539,7 @@ void EllpackPageImpl::CreateHistIndices(DeviceOrd device, // Return the number of rows contained in this page. [[nodiscard]] bst_idx_t EllpackPageImpl::Size() const { return n_rows; } -// Return the memory cost for storing the compressed features. -size_t EllpackPageImpl::MemCostBytes(size_t num_rows, size_t row_stride, - const common::HistogramCuts& cuts) { - // Required buffer size for storing data matrix in EtoLLPack format. - size_t compressed_size_bytes = - common::CompressedBufferWriter::CalculateBufferSize(row_stride * num_rows, - cuts.TotalBins() + 1); - return compressed_size_bytes; -} +std::size_t EllpackPageImpl::MemCostBytes() const { return this->gidx_buffer.size_bytes(); } EllpackDeviceAccessor EllpackPageImpl::GetDeviceAccessor( DeviceOrd device, common::Span feature_types) const { diff --git a/src/data/ellpack_page.cuh b/src/data/ellpack_page.cuh index f11bdfae1d97..af11dec3fc15 100644 --- a/src/data/ellpack_page.cuh +++ b/src/data/ellpack_page.cuh @@ -217,8 +217,7 @@ class EllpackPageImpl { [[nodiscard]] bool IsDense() const { return is_dense; } /** @return Estimation of memory cost of this page. 
*/ - static size_t MemCostBytes(size_t num_rows, size_t row_stride, const common::HistogramCuts&cuts) ; - + std::size_t MemCostBytes() const; /** * @brief Return the total number of symbols (total number of bins plus 1 for not diff --git a/src/data/ellpack_page_source.cu b/src/data/ellpack_page_source.cu index 7ab4819e14e1..2927d028cf79 100644 --- a/src/data/ellpack_page_source.cu +++ b/src/data/ellpack_page_source.cu @@ -172,6 +172,8 @@ void EllpackPageSourceImpl::Fetch() { Context ctx = Context{}.MakeCUDA(this->Device().ordinal); *impl = EllpackPageImpl{&ctx, this->GetCuts(), *csr, is_dense_, row_stride_, feature_types_}; this->page_->SetBaseRowId(csr->base_rowid); + LOG(INFO) << "Generated an Ellpack page with size: " << impl->MemCostBytes() + << " from a SparsePage with size:" << csr->MemCostBytes(); this->WriteCache(); } } From 8d7fe262d98ae00ca941a478d4671825541942b3 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sat, 17 Aug 2024 02:59:45 +0800 Subject: [PATCH 05/19] [EM] Enable access to the number of batches. (#10691) - Expose `NumBatches` in `DMatrix`. - Small cleanup for removing legacy CUDA stream and ~force CUDA context initialization~. - Purge old external memory data generation code. --- include/xgboost/data.h | 7 +- src/common/device_helpers.cuh | 26 +--- src/data/extmem_quantile_dmatrix.cc | 2 + src/data/extmem_quantile_dmatrix.h | 3 +- src/data/iterative_dmatrix.h | 2 - src/data/proxy_dmatrix.h | 1 - src/data/simple_dmatrix.h | 1 - src/data/sparse_page_dmatrix.h | 3 +- src/data/sparse_page_source.cc | 8 +- src/data/sparse_page_source.cu | 10 ++ src/data/sparse_page_source.h | 11 +- tests/cpp/data/test_data.cc | 15 +- tests/cpp/data/test_ellpack_page.cu | 16 +- tests/cpp/data/test_sparse_page_dmatrix.cc | 18 +-- tests/cpp/data/test_sparse_page_dmatrix.cu | 39 ++--- tests/cpp/gbm/test_gbtree.cc | 2 +- tests/cpp/helpers.cc | 144 +++--------------- tests/cpp/helpers.cu | 13 -- tests/cpp/helpers.h | 42 ----- tests/cpp/plugin/test_sycl_predictor.cc | 7 +- tests/cpp/predictor/test_cpu_predictor.cc | 9 +- tests/cpp/predictor/test_gpu_predictor.cu | 6 +- tests/cpp/test_learner.cc | 23 +-- .../gpu_hist/test_gradient_based_sampler.cu | 24 ++- tests/cpp/tree/hist/test_histogram.cc | 3 +- tests/cpp/tree/test_gpu_hist.cu | 84 +++++----- 26 files changed, 168 insertions(+), 351 deletions(-) diff --git a/include/xgboost/data.h b/include/xgboost/data.h index 10329f87b074..87d3be1fe34b 100644 --- a/include/xgboost/data.h +++ b/include/xgboost/data.h @@ -541,9 +541,12 @@ class DMatrix { [[nodiscard]] bool PageExists() const; /** - * @return Whether the data columns single column block. + * @return Whether the contains a single batch. + * + * The naming is legacy. 
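+   *
+   * A minimal usage sketch (assuming a DMatrix pointer p_fmat obtained elsewhere):
+   *
+   * @code
+   *   if (p_fmat->SingleColBlock()) {
+   *     // All data lives in a single in-memory batch.
+   *   } else {
+   *     LOG(INFO) << "External memory with " << p_fmat->NumBatches() << " batches.";
+   *   }
+   * @endcode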
*/ - [[nodiscard]] virtual bool SingleColBlock() const = 0; + [[nodiscard]] bool SingleColBlock() const { return this->NumBatches() == 1; } + [[nodiscard]] virtual std::int32_t NumBatches() const { return 1; } virtual ~DMatrix(); diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index 6cd0cd76a47a..3adc39e73777 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -486,24 +486,20 @@ class TypedDiscard : public thrust::discard_iterator { } // namespace detail template -using TypedDiscard = - std::conditional_t(), detail::TypedDiscardCTK114, - detail::TypedDiscard>; +using TypedDiscard = std::conditional_t(), detail::TypedDiscardCTK114, + detail::TypedDiscard>; template ::index_type> -xgboost::common::Span ToSpan( - VectorT &vec, - IndexT offset = 0, - IndexT size = std::numeric_limits::max()) { + typename IndexT = typename xgboost::common::Span::index_type> +xgboost::common::Span ToSpan(VectorT &vec, IndexT offset = 0, + IndexT size = std::numeric_limits::max()) { size = size == std::numeric_limits::max() ? vec.size() : size; CHECK_LE(offset + size, vec.size()); - return {vec.data().get() + offset, size}; + return {thrust::raw_pointer_cast(vec.data()) + offset, size}; } template -xgboost::common::Span ToSpan(thrust::device_vector& vec, - size_t offset, size_t size) { +xgboost::common::Span ToSpan(thrust::device_vector &vec, size_t offset, size_t size) { return ToSpan(vec, offset, size); } @@ -874,13 +870,7 @@ inline void CUDAEvent::Record(CUDAStreamView stream) { // NOLINT // Changing this has effect on prediction return, where we need to pass the pointer to // third-party libraries like cuPy -inline CUDAStreamView DefaultStream() { -#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM - return CUDAStreamView{cudaStreamPerThread}; -#else - return CUDAStreamView{cudaStreamLegacy}; -#endif -} +inline CUDAStreamView DefaultStream() { return CUDAStreamView{cudaStreamPerThread}; } class CUDAStream { cudaStream_t stream_; diff --git a/src/data/extmem_quantile_dmatrix.cc b/src/data/extmem_quantile_dmatrix.cc index 0d17fcf55ae8..96e88a55a0e3 100644 --- a/src/data/extmem_quantile_dmatrix.cc +++ b/src/data/extmem_quantile_dmatrix.cc @@ -74,6 +74,8 @@ void ExtMemQuantileDMatrix::InitFromCPU( cpu_impl::GetDataShape(ctx, proxy, *iter, missing, &ext_info); ext_info.SetInfo(ctx, &this->info_); + this->n_batches_ = ext_info.n_batches; + /** * Generate quantiles */ diff --git a/src/data/extmem_quantile_dmatrix.h b/src/data/extmem_quantile_dmatrix.h index d3b9f5a7820a..33a80f5cda92 100644 --- a/src/data/extmem_quantile_dmatrix.h +++ b/src/data/extmem_quantile_dmatrix.h @@ -33,7 +33,7 @@ class ExtMemQuantileDMatrix : public QuantileDMatrix { std::string cache, bst_bin_t max_bin, bool on_host); ~ExtMemQuantileDMatrix() override; - [[nodiscard]] bool SingleColBlock() const override { return false; } + [[nodiscard]] std::int32_t NumBatches() const override { return n_batches_; } private: void InitFromCPU( @@ -63,6 +63,7 @@ class ExtMemQuantileDMatrix : public QuantileDMatrix { std::string cache_prefix_; bool on_host_; BatchParam batch_; + bst_idx_t n_batches_{0}; using EllpackDiskPtr = std::shared_ptr; using EllpackHostPtr = std::shared_ptr; diff --git a/src/data/iterative_dmatrix.h b/src/data/iterative_dmatrix.h index 33350d372ac2..acec4708e634 100644 --- a/src/data/iterative_dmatrix.h +++ b/src/data/iterative_dmatrix.h @@ -57,8 +57,6 @@ class IterativeDMatrix : public QuantileDMatrix { BatchSet GetEllpackBatches(Context const *ctx, const BatchParam ¶m) override; 
BatchSet GetExtBatches(Context const *ctx, BatchParam const ¶m) override; - - bool SingleColBlock() const override { return true; } }; } // namespace data } // namespace xgboost diff --git a/src/data/proxy_dmatrix.h b/src/data/proxy_dmatrix.h index 8e62802c38aa..221e13fb32fc 100644 --- a/src/data/proxy_dmatrix.h +++ b/src/data/proxy_dmatrix.h @@ -94,7 +94,6 @@ class DMatrixProxy : public DMatrix { MetaInfo const& Info() const override { return info_; } Context const* Ctx() const override { return &ctx_; } - bool SingleColBlock() const override { return false; } bool EllpackExists() const override { return false; } bool GHistIndexExists() const override { return false; } bool SparsePageExists() const override { return false; } diff --git a/src/data/simple_dmatrix.h b/src/data/simple_dmatrix.h index 5b5bb2bfb2ba..ac757591cdb2 100644 --- a/src/data/simple_dmatrix.h +++ b/src/data/simple_dmatrix.h @@ -33,7 +33,6 @@ class SimpleDMatrix : public DMatrix { const MetaInfo& Info() const override; Context const* Ctx() const override { return &fmat_ctx_; } - bool SingleColBlock() const override { return true; } DMatrix* Slice(common::Span ridxs) override; DMatrix* SliceCol(int num_slices, int slice_id) override; diff --git a/src/data/sparse_page_dmatrix.h b/src/data/sparse_page_dmatrix.h index 245ec0e4b5dc..f40c16f72488 100644 --- a/src/data/sparse_page_dmatrix.h +++ b/src/data/sparse_page_dmatrix.h @@ -90,8 +90,7 @@ class SparsePageDMatrix : public DMatrix { [[nodiscard]] MetaInfo &Info() override; [[nodiscard]] const MetaInfo &Info() const override; [[nodiscard]] Context const *Ctx() const override { return &fmat_ctx_; } - // The only DMatrix implementation that returns false. - [[nodiscard]] bool SingleColBlock() const override { return false; } + [[nodiscard]] std::int32_t NumBatches() const override { return n_batches_; } DMatrix *Slice(common::Span) override { LOG(FATAL) << "Slicing DMatrix is not supported for external memory."; return nullptr; diff --git a/src/data/sparse_page_source.cc b/src/data/sparse_page_source.cc index 363c46f2d413..6247d66b37fc 100644 --- a/src/data/sparse_page_source.cc +++ b/src/data/sparse_page_source.cc @@ -3,10 +3,10 @@ */ #include "sparse_page_source.h" -#include // for exists -#include // for string #include // for remove +#include // for exists #include // for partial_sum +#include // for string namespace xgboost::data { void Cache::Commit() { @@ -27,4 +27,8 @@ void TryDeleteCacheFile(const std::string& file) { << "; you may want to remove it manually"; } } + +#if !defined(XGBOOST_USE_CUDA) +void InitNewThread::operator()() const { *GlobalConfigThreadLocalStore::Get() = config; } +#endif } // namespace xgboost::data diff --git a/src/data/sparse_page_source.cu b/src/data/sparse_page_source.cu index 125b7f261616..84d6197e689c 100644 --- a/src/data/sparse_page_source.cu +++ b/src/data/sparse_page_source.cu @@ -18,4 +18,14 @@ void DevicePush(DMatrixProxy *proxy, float missing, SparsePage *page) { cuda_impl::Dispatch(proxy, [&](auto const &value) { CopyToSparsePage(value, device, missing, page); }); } + +void InitNewThread::operator()() const { + *GlobalConfigThreadLocalStore::Get() = config; + // For CUDA 12.2, we need to force initialize the CUDA context by synchronizing the + // stream when creating a new thread in the thread pool. While for CUDA 11.8, this + // action might cause an insufficient driver version error for some reason. Lastly, it + // should work with CUDA 12.5 without any action being taken. 
+ + // dh::DefaultStream().Sync(); +} } // namespace xgboost::data diff --git a/src/data/sparse_page_source.h b/src/data/sparse_page_source.h index e750f00fccdd..ca04e969fddf 100644 --- a/src/data/sparse_page_source.h +++ b/src/data/sparse_page_source.h @@ -210,6 +210,12 @@ class DefaultFormatPolicy { } }; +struct InitNewThread { + GlobalConfiguration config = *GlobalConfigThreadLocalStore::Get(); + + void operator()() const; +}; + /** * @brief Base class for all page sources. Handles fetching, writing, and iteration. * @@ -330,10 +336,7 @@ class SparsePageSourceImpl : public BatchIteratorImpl, public FormatStreamPol public: SparsePageSourceImpl(float missing, int nthreads, bst_feature_t n_features, bst_idx_t n_batches, std::shared_ptr cache) - : workers_{std::max(2, std::min(nthreads, 16)), - [config = *GlobalConfigThreadLocalStore::Get()] { - *GlobalConfigThreadLocalStore::Get() = config; - }}, + : workers_{std::max(2, std::min(nthreads, 16)), InitNewThread{}}, missing_{missing}, nthreads_{nthreads}, n_features_{n_features}, diff --git a/tests/cpp/data/test_data.cc b/tests/cpp/data/test_data.cc index f9e34790d4a3..49e43d8340e0 100644 --- a/tests/cpp/data/test_data.cc +++ b/tests/cpp/data/test_data.cc @@ -63,26 +63,27 @@ TEST(SparsePage, PushCSC) { } TEST(SparsePage, PushCSCAfterTranspose) { - size_t constexpr kPageSize = 1024, kEntriesPerCol = 3; - size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2; - std::unique_ptr dmat = CreateSparsePageDMatrix(kEntries); + bst_idx_t constexpr kRows = 1024, kCols = 21; + + auto dmat = + RandomDataGenerator{kRows, kCols, 0.0f}.Batches(4).GenerateSparsePageDMatrix("temp", true); const int ncols = dmat->Info().num_col_; - SparsePage page; // Consolidated sparse page - for (const auto &batch : dmat->GetBatches()) { + SparsePage page; // Consolidated sparse page + for (const auto& batch : dmat->GetBatches()) { // Transpose each batch and push SparsePage tmp = batch.GetTranspose(ncols, AllThreadsForTest()); page.PushCSC(tmp); } // Make sure that the final sparse page has the right number of entries - ASSERT_EQ(kEntries, page.data.Size()); + ASSERT_EQ(kRows * kCols, page.data.Size()); page.SortRows(AllThreadsForTest()); auto v = page.GetView(); for (size_t i = 0; i < v.Size(); ++i) { auto column = v[i]; for (size_t j = 1; j < column.size(); ++j) { - ASSERT_GE(column[j].fvalue, column[j-1].fvalue); + ASSERT_GE(column[j].fvalue, column[j - 1].fvalue); } } } diff --git a/tests/cpp/data/test_ellpack_page.cu b/tests/cpp/data/test_ellpack_page.cu index 8aab51b7202e..f3957a002279 100644 --- a/tests/cpp/data/test_ellpack_page.cu +++ b/tests/cpp/data/test_ellpack_page.cu @@ -140,13 +140,11 @@ struct ReadRowFunction { TEST(EllpackPage, Copy) { constexpr size_t kRows = 1024; constexpr size_t kCols = 16; - constexpr size_t kPageSize = 1024; // Create a DMatrix with multiple batches. - dmlc::TemporaryDirectory tmpdir; - std::unique_ptr - dmat(CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir)); - Context ctx{MakeCUDACtx(0)}; + auto dmat = + RandomDataGenerator{kRows, kCols, 0.0f}.Batches(4).GenerateSparsePageDMatrix("temp", true); + auto ctx = MakeCUDACtx(0); auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()}; auto page = (*dmat->GetBatches(&ctx, param).begin()).Impl(); @@ -187,14 +185,12 @@ TEST(EllpackPage, Copy) { TEST(EllpackPage, Compact) { constexpr size_t kRows = 16; constexpr size_t kCols = 2; - constexpr size_t kPageSize = 1; constexpr size_t kCompactedRows = 8; // Create a DMatrix with multiple batches. 
- dmlc::TemporaryDirectory tmpdir; - std::unique_ptr dmat( - CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir)); - Context ctx{MakeCUDACtx(0)}; + auto dmat = + RandomDataGenerator{kRows, kCols, 0.0f}.Batches(2).GenerateSparsePageDMatrix("temp", true); + auto ctx = MakeCUDACtx(0); auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()}; auto page = (*dmat->GetBatches(&ctx, param).begin()).Impl(); diff --git a/tests/cpp/data/test_sparse_page_dmatrix.cc b/tests/cpp/data/test_sparse_page_dmatrix.cc index 3aeb42abce2b..b52d49176ef6 100644 --- a/tests/cpp/data/test_sparse_page_dmatrix.cc +++ b/tests/cpp/data/test_sparse_page_dmatrix.cc @@ -214,15 +214,15 @@ TEST(SparsePageDMatrix, MetaInfo) { } TEST(SparsePageDMatrix, RowAccess) { - std::unique_ptr dmat = xgboost::CreateSparsePageDMatrix(24); + auto dmat = RandomDataGenerator{12, 6, 0.8f}.Batches(2).GenerateSparsePageDMatrix("temp", false); // Test the data read into the first row auto &batch = *dmat->GetBatches().begin(); auto page = batch.GetView(); auto first_row = page[0]; - ASSERT_EQ(first_row.size(), 3ul); - EXPECT_EQ(first_row[2].index, 2u); - EXPECT_NEAR(first_row[2].fvalue, 0.986566, 1e-4); + ASSERT_EQ(first_row.size(), 1ul); + EXPECT_EQ(first_row[0].index, 5u); + EXPECT_NEAR(first_row[0].fvalue, 0.1805125, 1e-4); } TEST(SparsePageDMatrix, ColAccess) { @@ -268,11 +268,10 @@ TEST(SparsePageDMatrix, ColAccess) { } TEST(SparsePageDMatrix, ThreadSafetyException) { - size_t constexpr kEntriesPerCol = 3; - size_t constexpr kEntries = 64 * kEntriesPerCol * 2; Context ctx; - std::unique_ptr dmat = xgboost::CreateSparsePageDMatrix(kEntries); + auto dmat = + RandomDataGenerator{4096, 12, 0.0f}.Batches(8).GenerateSparsePageDMatrix("temp", true); int threads = 1000; @@ -304,10 +303,9 @@ TEST(SparsePageDMatrix, ThreadSafetyException) { // Multi-batches access TEST(SparsePageDMatrix, ColAccessBatches) { - size_t constexpr kPageSize = 1024, kEntriesPerCol = 3; - size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2; // Create multiple sparse pages - std::unique_ptr dmat{xgboost::CreateSparsePageDMatrix(kEntries)}; + auto dmat = + RandomDataGenerator{1024, 32, 0.4f}.Batches(3).GenerateSparsePageDMatrix("temp", true); ASSERT_EQ(dmat->Ctx()->Threads(), AllThreadsForTest()); Context ctx; for (auto const &page : dmat->GetBatches(&ctx)) { diff --git a/tests/cpp/data/test_sparse_page_dmatrix.cu b/tests/cpp/data/test_sparse_page_dmatrix.cu index 046c4eed4d80..f74ca28eb85e 100644 --- a/tests/cpp/data/test_sparse_page_dmatrix.cu +++ b/tests/cpp/data/test_sparse_page_dmatrix.cu @@ -115,13 +115,10 @@ TEST(SparsePageDMatrix, EllpackSkipSparsePage) { } TEST(SparsePageDMatrix, MultipleEllpackPages) { - Context ctx{MakeCUDACtx(0)}; + auto ctx = MakeCUDACtx(0); auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()}; dmlc::TemporaryDirectory tmpdir; - std::string filename = tmpdir.path + "/big.libsvm"; - size_t constexpr kPageSize = 64, kEntriesPerCol = 3; - size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2; - std::unique_ptr dmat = CreateSparsePageDMatrix(kEntries, filename); + auto dmat = RandomDataGenerator{1024, 2, 0.5f}.Batches(2).GenerateSparsePageDMatrix("temp", true); // Loop over the batches and count the records std::int64_t batch_count = 0; @@ -135,15 +132,13 @@ TEST(SparsePageDMatrix, MultipleEllpackPages) { EXPECT_EQ(row_count, dmat->Info().num_row_); auto path = - data::MakeId(filename, - dynamic_cast(dmat.get())) + - ".ellpack.page"; + data::MakeId("tmep", dynamic_cast(dmat.get())) + 
".ellpack.page"; } TEST(SparsePageDMatrix, RetainEllpackPage) { - Context ctx{MakeCUDACtx(0)}; + auto ctx = MakeCUDACtx(0); auto param = BatchParam{32, tree::TrainParam::DftSparseThreshold()}; - auto m = CreateSparsePageDMatrix(10000); + auto m = RandomDataGenerator{2048, 4, 0.0f}.Batches(8).GenerateSparsePageDMatrix("temp", true); auto batches = m->GetBatches(&ctx, param); auto begin = batches.begin(); @@ -278,20 +273,19 @@ struct ReadRowFunction { }; TEST(SparsePageDMatrix, MultipleEllpackPageContent) { - constexpr size_t kRows = 6; + constexpr size_t kRows = 16; constexpr size_t kCols = 2; constexpr int kMaxBins = 256; - constexpr size_t kPageSize = 1; // Create an in-memory DMatrix. - std::unique_ptr dmat(CreateSparsePageDMatrixWithRC(kRows, kCols, 0, true)); + auto dmat = + RandomDataGenerator{kRows, kCols, 0.0f}.Batches(1).GenerateSparsePageDMatrix("temp", true); // Create a DMatrix with multiple batches. - dmlc::TemporaryDirectory tmpdir; - std::unique_ptr - dmat_ext(CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir)); + auto dmat_ext = + RandomDataGenerator{kRows, kCols, 0.0f}.Batches(2).GenerateSparsePageDMatrix("temp", true); - Context ctx{MakeCUDACtx(0)}; + auto ctx = MakeCUDACtx(0); auto param = BatchParam{kMaxBins, tree::TrainParam::DftSparseThreshold()}; auto impl = (*dmat->GetBatches(&ctx, param).begin()).Impl(); EXPECT_EQ(impl->base_rowid, 0); @@ -325,17 +319,16 @@ TEST(SparsePageDMatrix, EllpackPageMultipleLoops) { constexpr size_t kRows = 1024; constexpr size_t kCols = 16; constexpr int kMaxBins = 256; - constexpr size_t kPageSize = 4096; // Create an in-memory DMatrix. - std::unique_ptr dmat(CreateSparsePageDMatrixWithRC(kRows, kCols, 0, true)); + auto dmat = + RandomDataGenerator{kRows, kCols, 0.0f}.Batches(1).GenerateSparsePageDMatrix("temp", true); // Create a DMatrix with multiple batches. 
- dmlc::TemporaryDirectory tmpdir; - std::unique_ptr - dmat_ext(CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir)); + auto dmat_ext = + RandomDataGenerator{kRows, kCols, 0.0f}.Batches(8).GenerateSparsePageDMatrix("temp", true); - Context ctx{MakeCUDACtx(0)}; + auto ctx = MakeCUDACtx(0); auto param = BatchParam{kMaxBins, tree::TrainParam::DftSparseThreshold()}; size_t current_row = 0; diff --git a/tests/cpp/gbm/test_gbtree.cc b/tests/cpp/gbm/test_gbtree.cc index dcb89b97189c..8a5383ad4d34 100644 --- a/tests/cpp/gbm/test_gbtree.cc +++ b/tests/cpp/gbm/test_gbtree.cc @@ -715,7 +715,7 @@ TEST(GBTree, InplacePredictionError) { p_fmat = rng.GenerateQuantileDMatrix(true); } else { #if defined(XGBOOST_USE_CUDA) - p_fmat = rng.GenerateDeviceDMatrix(true); + p_fmat = rng.Device(ctx->Device()).GenerateQuantileDMatrix(true); #else CHECK(p_fmat); #endif // defined(XGBOOST_USE_CUDA) diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc index 05f84316467c..ae5698d2cc6e 100644 --- a/tests/cpp/helpers.cc +++ b/tests/cpp/helpers.cc @@ -13,7 +13,6 @@ #include #include // for numeric_limits -#include #include "../../src/collective/communicator-inl.h" // for GetRank #include "../../src/data/adapter.h" @@ -21,8 +20,6 @@ #include "../../src/data/simple_dmatrix.h" #include "../../src/data/sparse_page_dmatrix.h" #include "../../src/gbm/gbtree_model.h" -#include "../../src/tree/param.h" // for TrainParam -#include "filesystem.h" // dmlc::TemporaryDirectory #include "xgboost/c_api.h" #include "xgboost/predictor.h" @@ -456,6 +453,7 @@ void RandomDataGenerator::GenerateCSR( } EXPECT_EQ(batch_count, n_batches_); + EXPECT_EQ(dmat->NumBatches(), n_batches_); EXPECT_EQ(row_count, dmat->Info().num_row_); if (with_label) { @@ -503,13 +501,24 @@ void RandomDataGenerator::GenerateCSR( } std::shared_ptr RandomDataGenerator::GenerateQuantileDMatrix(bool with_label) { - NumpyArrayIterForTest iter{this->sparsity_, this->rows_, this->cols_, 1}; - auto m = std::make_shared( - &iter, iter.Proxy(), nullptr, Reset, Next, std::numeric_limits::quiet_NaN(), 0, bins_); + std::shared_ptr p_fmat; + + if (this->device_.IsCPU()) { + NumpyArrayIterForTest iter{this->sparsity_, this->rows_, this->cols_, 1}; + p_fmat = + std::make_shared(&iter, iter.Proxy(), nullptr, Reset, Next, + std::numeric_limits::quiet_NaN(), 0, bins_); + } else { + CudaArrayIterForTest iter{this->sparsity_, this->rows_, this->cols_, 1}; + p_fmat = + std::make_shared(&iter, iter.Proxy(), nullptr, Reset, Next, + std::numeric_limits::quiet_NaN(), 0, bins_); + } + if (with_label) { - this->GenerateLabels(m); + this->GenerateLabels(p_fmat); } - return m; + return p_fmat; } #if !defined(XGBOOST_USE_CUDA) @@ -551,125 +560,6 @@ std::shared_ptr GetDMatrixFromData(const std::vector& x, std::si return p_fmat; } -std::unique_ptr CreateSparsePageDMatrix(bst_idx_t n_samples, bst_feature_t n_features, - size_t n_batches, std::string prefix) { - CHECK_GE(n_samples, n_batches); - NumpyArrayIterForTest iter(0, n_samples, n_features, n_batches); - - std::unique_ptr dmat{DMatrix::Create( - static_cast(&iter), iter.Proxy(), Reset, Next, - std::numeric_limits::quiet_NaN(), omp_get_max_threads(), prefix, false)}; - - auto row_page_path = - data::MakeId(prefix, dynamic_cast(dmat.get())) + ".row.page"; - EXPECT_TRUE(FileExists(row_page_path)) << row_page_path; - - // Loop over the batches and count the number of pages - int64_t batch_count = 0; - int64_t row_count = 0; - for (const auto& batch : dmat->GetBatches()) { - batch_count++; - row_count += batch.Size(); - } - - 
EXPECT_GE(batch_count, n_batches); - EXPECT_EQ(row_count, dmat->Info().num_row_); - return dmat; -} - -std::unique_ptr CreateSparsePageDMatrix(size_t n_entries, - std::string prefix) { - size_t n_columns = 3; - size_t n_rows = n_entries / n_columns; - NumpyArrayIterForTest iter(0, n_rows, n_columns, 2); - - std::unique_ptr dmat{ - DMatrix::Create(static_cast(&iter), iter.Proxy(), Reset, Next, - std::numeric_limits::quiet_NaN(), 0, prefix, false)}; - auto row_page_path = - data::MakeId(prefix, - dynamic_cast(dmat.get())) + - ".row.page"; - EXPECT_TRUE(FileExists(row_page_path)) << row_page_path; - - // Loop over the batches and count the records - int64_t batch_count = 0; - int64_t row_count = 0; - for (const auto &batch : dmat->GetBatches()) { - batch_count++; - row_count += batch.Size(); - } - EXPECT_GE(batch_count, 2); - EXPECT_EQ(row_count, dmat->Info().num_row_); - return dmat; -} - -std::unique_ptr CreateSparsePageDMatrixWithRC(size_t n_rows, size_t n_cols, - size_t page_size, bool deterministic, - const dmlc::TemporaryDirectory& tempdir) { - if (!n_rows || !n_cols) { - return nullptr; - } - - // Create the svm file in a temp dir - const std::string tmp_file = tempdir.path + "/big.libsvm"; - - std::ofstream fo(tmp_file.c_str()); - size_t cols_per_row = ((std::max(n_rows, n_cols) - 1) / std::min(n_rows, n_cols)) + 1; - int64_t rem_cols = n_cols; - size_t col_idx = 0; - - // Random feature id generator - std::random_device rdev; - std::unique_ptr gen; - if (deterministic) { - // Seed it with a constant value for this configuration - without getting too fancy - // like ordered pairing functions and its likes to make it truely unique - gen.reset(new std::mt19937(n_rows * n_cols)); - } else { - gen.reset(new std::mt19937(rdev())); - } - std::uniform_int_distribution label(0, 1); - std::uniform_int_distribution dis(1, n_cols); - - for (size_t i = 0; i < n_rows; ++i) { - // Make sure that all cols are slotted in the first few rows; randomly distribute the - // rest - std::stringstream row_data; - size_t j = 0; - if (rem_cols > 0) { - for (; j < std::min(static_cast(rem_cols), cols_per_row); ++j) { - row_data << label(*gen) << " " << (col_idx + j) << ":" - << (col_idx + j + 1) * 10 * i; - } - rem_cols -= cols_per_row; - } else { - // Take some random number of colums in [1, n_cols] and slot them here - std::vector random_columns; - size_t ncols = dis(*gen); - for (; j < ncols; ++j) { - size_t fid = (col_idx + j) % n_cols; - random_columns.push_back(fid); - } - std::sort(random_columns.begin(), random_columns.end()); - for (auto fid : random_columns) { - row_data << label(*gen) << " " << fid << ":" << (fid + 1) * 10 * i; - } - } - col_idx += j; - - fo << row_data.str() << "\n"; - } - fo.close(); - - std::string uri = tmp_file + "?format=libsvm"; - if (page_size > 0) { - uri += "#" + tmp_file + ".cache"; - } - std::unique_ptr dmat(DMatrix::Load(uri)); - return dmat; -} - std::unique_ptr CreateTrainedGBM(std::string name, Args kwargs, size_t kRows, size_t kCols, LearnerModelParam const* learner_model_param, diff --git a/tests/cpp/helpers.cu b/tests/cpp/helpers.cu index f756289538ab..ef6beb33687b 100644 --- a/tests/cpp/helpers.cu +++ b/tests/cpp/helpers.cu @@ -3,12 +3,9 @@ */ #include -#include "../../src/data/device_adapter.cuh" -#include "../../src/data/iterative_dmatrix.h" #include "helpers.h" namespace xgboost { - CudaArrayIterForTest::CudaArrayIterForTest(float sparsity, size_t rows, size_t cols, size_t batches) : ArrayIterForTest{sparsity, rows, cols, batches} { @@ -26,14 +23,4 @@ int 
CudaArrayIterForTest::Next() { iter_++; return 1; } - -std::shared_ptr RandomDataGenerator::GenerateDeviceDMatrix(bool with_label) { - CudaArrayIterForTest iter{this->sparsity_, this->rows_, this->cols_, 1}; - auto m = std::make_shared( - &iter, iter.Proxy(), nullptr, Reset, Next, std::numeric_limits::quiet_NaN(), 0, bins_); - if (with_label) { - this->GenerateLabels(m); - } - return m; -} } // namespace xgboost diff --git a/tests/cpp/helpers.h b/tests/cpp/helpers.h index 50ae8bce076e..a8d5f370f3a2 100644 --- a/tests/cpp/helpers.h +++ b/tests/cpp/helpers.h @@ -324,9 +324,6 @@ class RandomDataGenerator { [[nodiscard]] std::shared_ptr GenerateExtMemQuantileDMatrix(std::string prefix, bool with_label) const; -#if defined(XGBOOST_USE_CUDA) - std::shared_ptr GenerateDeviceDMatrix(bool with_label); -#endif std::shared_ptr GenerateQuantileDMatrix(bool with_label); }; @@ -350,45 +347,6 @@ inline std::vector GenerateRandomCategoricalSingleColumn(int n, size_t nu std::shared_ptr GetDMatrixFromData(const std::vector& x, std::size_t num_rows, bst_feature_t num_columns); -/** - * \brief Create Sparse Page using data iterator. - * - * \param n_samples Total number of rows for all batches combined. - * \param n_features Number of features - * \param n_batches Number of batches - * \param prefix Cache prefix, can be used for specifying file path. - * - * \return A Sparse DMatrix with n_batches. - */ -std::unique_ptr CreateSparsePageDMatrix(bst_idx_t n_samples, bst_feature_t n_features, - size_t n_batches, std::string prefix = "cache"); - -/** - * Deprecated, stop using it - */ -std::unique_ptr CreateSparsePageDMatrix(size_t n_entries, std::string prefix = "cache"); - -/** - * Deprecated, stop using it - * - * \brief Creates dmatrix with some records, each record containing random number of - * features in [1, n_cols] - * - * \param n_rows Number of records to create. - * \param n_cols Max number of features within that record. - * \param page_size Sparse page size for the pages within the dmatrix. If page size is 0 - * then the entire dmatrix is resident in memory; else, multiple sparse pages - * of page size are created and backed to disk, which would have to be - * streamed in at point of use. - * \param deterministic The content inside the dmatrix is constant for this configuration, if true; - * else, the content changes every time this method is invoked - * - * \return The new dmatrix. 
- */ -std::unique_ptr CreateSparsePageDMatrixWithRC( - size_t n_rows, size_t n_cols, size_t page_size, bool deterministic, - const dmlc::TemporaryDirectory& tempdir = dmlc::TemporaryDirectory()); - std::unique_ptr CreateTrainedGBM(std::string name, Args kwargs, size_t kRows, size_t kCols, LearnerModelParam const* learner_model_param, diff --git a/tests/cpp/plugin/test_sycl_predictor.cc b/tests/cpp/plugin/test_sycl_predictor.cc index 7bd788a3b071..a7ec51594e08 100755 --- a/tests/cpp/plugin/test_sycl_predictor.cc +++ b/tests/cpp/plugin/test_sycl_predictor.cc @@ -36,9 +36,10 @@ TEST(SyclPredictor, ExternalMemory) { Context ctx; ctx.UpdateAllowUnknown(Args{{"device", "sycl"}}); - size_t constexpr kPageSize = 64, kEntriesPerCol = 3; - size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2; - std::unique_ptr dmat = CreateSparsePageDMatrix(kEntries); + bst_idx_t constexpr kRows{64}; + bst_feature_t constexpr kCols{12}; + auto dmat = + RandomDataGenerator{kRows, kCols, 0.5f}.Batches(3).GenerateSparsePageDMatrix("temp", true); TestBasic(dmat.get(), &ctx); } diff --git a/tests/cpp/predictor/test_cpu_predictor.cc b/tests/cpp/predictor/test_cpu_predictor.cc index c0d2c8e285af..ee28adb155c9 100644 --- a/tests/cpp/predictor/test_cpu_predictor.cc +++ b/tests/cpp/predictor/test_cpu_predictor.cc @@ -10,12 +10,10 @@ #include "../../../src/gbm/gbtree.h" #include "../../../src/gbm/gbtree_model.h" #include "../collective/test_worker.h" // for TestDistributedGlobal -#include "../filesystem.h" // dmlc::TemporaryDirectory #include "../helpers.h" #include "test_predictor.h" namespace xgboost { - TEST(CpuPredictor, Basic) { Context ctx; size_t constexpr kRows = 5; @@ -56,9 +54,10 @@ TEST(CpuPredictor, IterationRangeColmnSplit) { TEST(CpuPredictor, ExternalMemory) { Context ctx; - size_t constexpr kPageSize = 64, kEntriesPerCol = 3; - size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2; - std::unique_ptr dmat = CreateSparsePageDMatrix(kEntries); + bst_idx_t constexpr kRows{64}; + bst_feature_t constexpr kCols{12}; + auto dmat = + RandomDataGenerator{kRows, kCols, 0.5f}.Batches(3).GenerateSparsePageDMatrix("temp", true); TestBasic(dmat.get(), &ctx); } diff --git a/tests/cpp/predictor/test_gpu_predictor.cu b/tests/cpp/predictor/test_gpu_predictor.cu index 01de15fe8bc8..5e3021fd71e1 100644 --- a/tests/cpp/predictor/test_gpu_predictor.cu +++ b/tests/cpp/predictor/test_gpu_predictor.cu @@ -123,8 +123,8 @@ TEST(GPUPredictor, EllpackBasic) { size_t rows = bins * 16; auto p_m = RandomDataGenerator{rows, kCols, 0.0} .Bins(bins) - .Device(DeviceOrd::CUDA(0)) - .GenerateDeviceDMatrix(false); + .Device(ctx.Device()) + .GenerateQuantileDMatrix(false); ASSERT_FALSE(p_m->PageExists()); TestPredictionFromGradientIndex(&ctx, rows, kCols, p_m); TestPredictionFromGradientIndex(&ctx, bins, kCols, p_m); @@ -137,7 +137,7 @@ TEST(GPUPredictor, EllpackTraining) { auto p_ellpack = RandomDataGenerator{kRows, kCols, 0.0} .Bins(kBins) .Device(ctx.Device()) - .GenerateDeviceDMatrix(false); + .GenerateQuantileDMatrix(false); HostDeviceVector storage(kRows * kCols); auto columnar = RandomDataGenerator{kRows, kCols, 0.0}.Device(ctx.Device()).GenerateArrayInterface(&storage); diff --git a/tests/cpp/test_learner.cc b/tests/cpp/test_learner.cc index be11a2a765b7..a6f3eacecbc5 100644 --- a/tests/cpp/test_learner.cc +++ b/tests/cpp/test_learner.cc @@ -117,24 +117,15 @@ TEST(Learner, CheckGroup) { EXPECT_ANY_THROW(learner->UpdateOneIter(0, p_mat)); } -TEST(Learner, SLOW_CheckMultiBatch) { // NOLINT - // Create sufficiently large data to make 
two row pages - dmlc::TemporaryDirectory tempdir; - const std::string tmp_file = tempdir.path + "/big.libsvm"; - CreateBigTestData(tmp_file, 50000); - std::shared_ptr dmat( - xgboost::DMatrix::Load(tmp_file + "?format=libsvm" + "#" + tmp_file + ".cache")); - EXPECT_FALSE(dmat->SingleColBlock()); - size_t num_row = dmat->Info().num_row_; - std::vector labels(num_row); - for (size_t i = 0; i < num_row; ++i) { - labels[i] = i % 2; - } - dmat->SetInfo("label", Make1dInterfaceTest(labels.data(), num_row)); - std::vector> mat{dmat}; +TEST(Learner, CheckMultiBatch) { + auto p_fmat = + RandomDataGenerator{512, 128, 0.8}.Batches(4).GenerateSparsePageDMatrix("temp", true); + ASSERT_FALSE(p_fmat->SingleColBlock()); + + std::vector> mat{p_fmat}; auto learner = std::unique_ptr(Learner::Create(mat)); learner->SetParams(Args{{"objective", "binary:logistic"}}); - learner->UpdateOneIter(0, dmat); + learner->UpdateOneIter(0, p_fmat); } TEST(Learner, Configuration) { diff --git a/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu b/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu index dcb09ff32315..b1e86e2ebbc2 100644 --- a/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu +++ b/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu @@ -7,22 +7,18 @@ #include "../../../../src/tree/gpu_hist/gradient_based_sampler.cuh" #include "../../../../src/tree/param.h" #include "../../../../src/tree/param.h" // TrainParam -#include "../../filesystem.h" // dmlc::TemporaryDirectory #include "../../helpers.h" namespace xgboost::tree { -void VerifySampling(size_t page_size, - float subsample, - int sampling_method, - bool fixed_size_sampling = true, - bool check_sum = true) { +void VerifySampling(size_t page_size, float subsample, int sampling_method, + bool fixed_size_sampling = true, bool check_sum = true) { constexpr size_t kRows = 4096; constexpr size_t kCols = 1; - size_t sample_rows = kRows * subsample; + bst_idx_t sample_rows = kRows * subsample; + bst_idx_t n_batches = fixed_size_sampling ? 1 : 4; - dmlc::TemporaryDirectory tmpdir; - std::unique_ptr dmat(CreateSparsePageDMatrix( - kRows, kCols, kRows / (page_size == 0 ? kRows : page_size), tmpdir.path + "/cache")); + auto dmat = RandomDataGenerator{kRows, kCols, 0.0f}.Batches(n_batches).GenerateSparsePageDMatrix( + "temp", true); auto gpair = GenerateRandomGradients(kRows); GradientPair sum_gpair{}; for (const auto& gp : gpair.ConstHostVector()) { @@ -78,14 +74,12 @@ TEST(GradientBasedSampler, NoSamplingExternalMemory) { constexpr size_t kRows = 2048; constexpr size_t kCols = 1; constexpr float kSubsample = 1.0f; - constexpr size_t kPageSize = 1024; // Create a DMatrix with multiple batches. 
- dmlc::TemporaryDirectory tmpdir; - std::unique_ptr dmat( - CreateSparsePageDMatrix(kRows, kCols, kRows / kPageSize, tmpdir.path + "/cache")); + auto dmat = + RandomDataGenerator{kRows, kCols, 0.0f}.Batches(4).GenerateSparsePageDMatrix("temp", true); auto gpair = GenerateRandomGradients(kRows); - Context ctx{MakeCUDACtx(0)}; + auto ctx = MakeCUDACtx(0); gpair.SetDevice(ctx.Device()); auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()}; diff --git a/tests/cpp/tree/hist/test_histogram.cc b/tests/cpp/tree/hist/test_histogram.cc index 11bdbd859b1c..5ab0c599ea6a 100644 --- a/tests/cpp/tree/hist/test_histogram.cc +++ b/tests/cpp/tree/hist/test_histogram.cc @@ -406,7 +406,8 @@ namespace { void TestHistogramExternalMemory(Context const *ctx, BatchParam batch_param, bool is_approx, bool force_read_by_column) { size_t constexpr kEntries = 1 << 16; - auto m = CreateSparsePageDMatrix(kEntries, "cache"); + auto m = + RandomDataGenerator{kEntries / 8, 8, 0.0f}.Batches(4).GenerateSparsePageDMatrix("temp", true); std::vector hess(m->Info().num_row_, 1.0); if (is_approx) { diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu index 570ebe76c3da..61f7647579cf 100644 --- a/tests/cpp/tree/test_gpu_hist.cu +++ b/tests/cpp/tree/test_gpu_hist.cu @@ -17,12 +17,11 @@ #include "../../../src/common/random.h" // for GlobalRandom #include "../../../src/tree/param.h" // for TrainParam #include "../collective/test_worker.h" // for BaseMGPUTest -#include "../filesystem.h" // dmlc::TemporaryDirectory #include "../helpers.h" namespace xgboost::tree { namespace { -void UpdateTree(Context const* ctx, linalg::Matrix* gpair, DMatrix* dmat, bool is_ext, +void UpdateTree(Context const* ctx, linalg::Matrix* gpair, DMatrix* dmat, RegTree* tree, HostDeviceVector* preds, float subsample, const std::string& sampling_method, bst_bin_t max_bin) { Args args{ @@ -45,7 +44,7 @@ void UpdateTree(Context const* ctx, linalg::Matrix* gpair, DMatrix hist_maker->Update(¶m, gpair, dmat, common::Span>{position}, {tree}); auto cache = linalg::MakeTensorView(ctx, preds->DeviceSpan(), preds->Size(), 1); - if (subsample < 1.0 && is_ext) { + if (subsample < 1.0 && !dmat->SingleColBlock()) { ASSERT_FALSE(hist_maker->UpdatePredictionCache(dmat, cache)); } else { ASSERT_TRUE(hist_maker->UpdatePredictionCache(dmat, cache)); @@ -58,22 +57,23 @@ TEST(GpuHist, UniformSampling) { constexpr size_t kCols = 2; constexpr float kSubsample = 0.9999; common::GlobalRandom().seed(1994); + auto ctx = MakeCUDACtx(0); // Create an in-memory DMatrix. - std::unique_ptr dmat(CreateSparsePageDMatrixWithRC(kRows, kCols, 0, true)); + auto p_fmat = RandomDataGenerator{kRows, kCols, 0.0f}.GenerateDMatrix(true); + ASSERT_TRUE(p_fmat->SingleColBlock()); - linalg::Matrix gpair({kRows}, Context{}.MakeCUDA().Device()); + linalg::Matrix gpair({kRows}, ctx.Device()); gpair.Data()->Copy(GenerateRandomGradients(kRows)); // Build a tree using the in-memory DMatrix. RegTree tree; - HostDeviceVector preds(kRows, 0.0, DeviceOrd::CUDA(0)); - Context ctx(MakeCUDACtx(0)); - UpdateTree(&ctx, &gpair, dmat.get(), false, &tree, &preds, 1.0, "uniform", kRows); + HostDeviceVector preds(kRows, 0.0, ctx.Device()); + UpdateTree(&ctx, &gpair, p_fmat.get(), &tree, &preds, 1.0, "uniform", kRows); // Build another tree using sampling. 
RegTree tree_sampling; - HostDeviceVector preds_sampling(kRows, 0.0, DeviceOrd::CUDA(0)); - UpdateTree(&ctx, &gpair, dmat.get(), false, &tree_sampling, &preds_sampling, kSubsample, "uniform", + HostDeviceVector preds_sampling(kRows, 0.0, ctx.Device()); + UpdateTree(&ctx, &gpair, p_fmat.get(), &tree_sampling, &preds_sampling, kSubsample, "uniform", kRows); // Make sure the predictions are the same. @@ -89,23 +89,23 @@ TEST(GpuHist, GradientBasedSampling) { constexpr size_t kCols = 2; constexpr float kSubsample = 0.9999; common::GlobalRandom().seed(1994); + auto ctx = MakeCUDACtx(0); // Create an in-memory DMatrix. - std::unique_ptr dmat(CreateSparsePageDMatrixWithRC(kRows, kCols, 0, true)); + auto p_fmat = RandomDataGenerator{kRows, kCols, 0.0f}.GenerateDMatrix(true); - linalg::Matrix gpair({kRows}, MakeCUDACtx(0).Device()); + linalg::Matrix gpair({kRows}, ctx.Device()); gpair.Data()->Copy(GenerateRandomGradients(kRows)); // Build a tree using the in-memory DMatrix. RegTree tree; - HostDeviceVector preds(kRows, 0.0, DeviceOrd::CUDA(0)); - Context ctx(MakeCUDACtx(0)); - UpdateTree(&ctx, &gpair, dmat.get(), false, &tree, &preds, 1.0, "uniform", kRows); + HostDeviceVector preds(kRows, 0.0, ctx.Device()); + UpdateTree(&ctx, &gpair, p_fmat.get(), &tree, &preds, 1.0, "uniform", kRows); // Build another tree using sampling. RegTree tree_sampling; - HostDeviceVector preds_sampling(kRows, 0.0, DeviceOrd::CUDA(0)); - UpdateTree(&ctx, &gpair, dmat.get(), false, &tree_sampling, &preds_sampling, kSubsample, + HostDeviceVector preds_sampling(kRows, 0.0, ctx.Device()); + UpdateTree(&ctx, &gpair, p_fmat.get(), &tree_sampling, &preds_sampling, kSubsample, "gradient_based", kRows); // Make sure the predictions are the same. @@ -119,29 +119,29 @@ TEST(GpuHist, GradientBasedSampling) { TEST(GpuHist, ExternalMemory) { constexpr size_t kRows = 4096; constexpr size_t kCols = 2; - constexpr size_t kPageSize = 1024; - - dmlc::TemporaryDirectory tmpdir; // Create a DMatrix with multiple batches. - std::unique_ptr dmat_ext( - CreateSparsePageDMatrix(kRows, kCols, kRows / kPageSize, tmpdir.path + "/cache")); + auto p_fmat_ext = + RandomDataGenerator{kRows, kCols, 0.0f}.Batches(4).GenerateSparsePageDMatrix("temp", true); + ASSERT_FALSE(p_fmat_ext->SingleColBlock()); // Create a single batch DMatrix. - std::unique_ptr dmat(CreateSparsePageDMatrix(kRows, kCols, 1, tmpdir.path + "/cache")); + auto p_fmat = + RandomDataGenerator{kRows, kCols, 0.0f}.Batches(1).GenerateSparsePageDMatrix("temp", true); + ASSERT_TRUE(p_fmat->SingleColBlock()); - Context ctx(MakeCUDACtx(0)); + auto ctx = MakeCUDACtx(0); linalg::Matrix gpair({kRows}, ctx.Device()); gpair.Data()->Copy(GenerateRandomGradients(kRows)); // Build a tree using the in-memory DMatrix. RegTree tree; - HostDeviceVector preds(kRows, 0.0, DeviceOrd::CUDA(0)); - UpdateTree(&ctx, &gpair, dmat.get(), false, &tree, &preds, 1.0, "uniform", kRows); + HostDeviceVector preds(kRows, 0.0, ctx.Device()); + UpdateTree(&ctx, &gpair, p_fmat.get(), &tree, &preds, 1.0, "uniform", kRows); // Build another tree using multiple ELLPACK pages. RegTree tree_ext; - HostDeviceVector preds_ext(kRows, 0.0, DeviceOrd::CUDA(0)); - UpdateTree(&ctx, &gpair, dmat_ext.get(), true, &tree_ext, &preds_ext, 1.0, "uniform", kRows); + HostDeviceVector preds_ext(kRows, 0.0, ctx.Device()); + UpdateTree(&ctx, &gpair, p_fmat_ext.get(), &tree_ext, &preds_ext, 1.0, "uniform", kRows); // Make sure the predictions are the same. 
auto preds_h = preds.ConstHostVector(); @@ -157,20 +157,21 @@ TEST(GpuHist, ExternalMemoryWithSampling) { const std::string kSamplingMethod = "gradient_based"; common::GlobalRandom().seed(0); - dmlc::TemporaryDirectory tmpdir; - Context ctx(MakeCUDACtx(0)); + auto ctx = MakeCUDACtx(0); // Create a single batch DMatrix. auto p_fmat = RandomDataGenerator{kRows, kCols, 0.0f} .Device(ctx.Device()) .Batches(1) .GenerateSparsePageDMatrix("temp", true); + ASSERT_TRUE(p_fmat->SingleColBlock()); // Create a DMatrix with multiple batches. auto p_fmat_ext = RandomDataGenerator{kRows, kCols, 0.0f} .Device(ctx.Device()) .Batches(4) .GenerateSparsePageDMatrix("temp", true); + ASSERT_FALSE(p_fmat_ext->SingleColBlock()); linalg::Matrix gpair({kRows}, ctx.Device()); gpair.Data()->Copy(GenerateRandomGradients(kRows)); @@ -179,26 +180,25 @@ TEST(GpuHist, ExternalMemoryWithSampling) { auto rng = common::GlobalRandom(); RegTree tree; - HostDeviceVector preds(kRows, 0.0, DeviceOrd::CUDA(0)); - UpdateTree(&ctx, &gpair, p_fmat.get(), true, &tree, &preds, kSubsample, kSamplingMethod, kRows); + HostDeviceVector preds(kRows, 0.0, ctx.Device()); + UpdateTree(&ctx, &gpair, p_fmat.get(), &tree, &preds, kSubsample, kSamplingMethod, kRows); // Build another tree using multiple ELLPACK pages. common::GlobalRandom() = rng; RegTree tree_ext; - HostDeviceVector preds_ext(kRows, 0.0, DeviceOrd::CUDA(0)); - UpdateTree(&ctx, &gpair, p_fmat_ext.get(), true, &tree_ext, &preds_ext, kSubsample, - kSamplingMethod, kRows); + HostDeviceVector preds_ext(kRows, 0.0, ctx.Device()); + UpdateTree(&ctx, &gpair, p_fmat_ext.get(), &tree_ext, &preds_ext, kSubsample, kSamplingMethod, + kRows); - // Make sure the predictions are the same. - auto preds_h = preds.ConstHostVector(); - auto preds_ext_h = preds_ext.ConstHostVector(); - for (size_t i = 0; i < kRows; i++) { - ASSERT_NEAR(preds_h[i], preds_ext_h[i], 1e-3); - } + Json jtree{Object{}}; + Json jtree_ext{Object{}}; + tree.SaveModel(&jtree); + tree_ext.SaveModel(&jtree_ext); + ASSERT_EQ(jtree, jtree_ext); } TEST(GpuHist, ConfigIO) { - Context ctx(MakeCUDACtx(0)); + auto ctx = MakeCUDACtx(0); ObjInfo task{ObjInfo::kRegression}; std::unique_ptr updater{TreeUpdater::Create("grow_gpu_hist", &ctx, &task)}; updater->Configure(Args{}); From ec3f327c202bf869b7aa9bc586a278e0c89c438d Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sat, 17 Aug 2024 03:02:34 +0800 Subject: [PATCH 06/19] Add managed memory allocator. (#10711) --- .../xgboost4j/src/native/xgboost4j-gpu.cu | 2 +- src/common/cuda_pinned_allocator.h | 139 ++++++++++-------- src/data/ellpack_page_source.cu | 2 +- src/tree/gpu_hist/evaluate_splits.cuh | 2 +- tests/cpp/common/test_cuda_host_allocator.cu | 36 +++++ 5 files changed, 119 insertions(+), 62 deletions(-) create mode 100644 tests/cpp/common/test_cuda_host_allocator.cu diff --git a/jvm-packages/xgboost4j/src/native/xgboost4j-gpu.cu b/jvm-packages/xgboost4j/src/native/xgboost4j-gpu.cu index a705751b1583..bd428189f851 100644 --- a/jvm-packages/xgboost4j/src/native/xgboost4j-gpu.cu +++ b/jvm-packages/xgboost4j/src/native/xgboost4j-gpu.cu @@ -132,7 +132,7 @@ class DataIteratorProxy { bool cache_on_host_{true}; // TODO(Bobby): Make this optional. 
template - using Alloc = xgboost::common::cuda::pinned_allocator; + using Alloc = xgboost::common::cuda_impl::pinned_allocator; template using HostVector = std::vector>; diff --git a/src/common/cuda_pinned_allocator.h b/src/common/cuda_pinned_allocator.h index 6fe1757fd369..c53ae4517c33 100644 --- a/src/common/cuda_pinned_allocator.h +++ b/src/common/cuda_pinned_allocator.h @@ -1,20 +1,19 @@ -/*! - * Copyright 2022 by XGBoost Contributors - * \file common.h - * \brief cuda pinned allocator for usage with thrust containers +/** + * Copyright 2022-2024, XGBoost Contributors + * + * @brief cuda pinned allocator for usage with thrust containers */ #pragma once -#include -#include +#include -#include "common.h" +#include // for size_t +#include // for numeric_limits -namespace xgboost { -namespace common { -namespace cuda { +#include "common.h" +namespace xgboost::common::cuda_impl { // \p pinned_allocator is a CUDA-specific host memory allocator // that employs \c cudaMallocHost for allocation. // @@ -22,72 +21,94 @@ namespace cuda { // that Thrust used to provide. // // \see https://en.cppreference.com/w/cpp/memory/allocator + template -class pinned_allocator; +struct PinnedAllocPolicy { + using pointer = T*; // NOLINT: The type returned by address() / allocate() + using const_pointer = const T*; // NOLINT: The type returned by address() + using size_type = std::size_t; // NOLINT: The type used for the size of the allocation + using value_type = T; // NOLINT: The type of the elements in the allocator + + size_type max_size() const { // NOLINT + return std::numeric_limits::max() / sizeof(value_type); + } -template <> -class pinned_allocator { - public: - using value_type = void; // NOLINT: The type of the elements in the allocator - using pointer = void*; // NOLINT: The type returned by address() / allocate() - using const_pointer = const void*; // NOLINT: The type returned by address() - using size_type = std::size_t; // NOLINT: The type used for the size of the allocation - using difference_type = std::ptrdiff_t; // NOLINT: The type of the distance between two pointers + pointer allocate(size_type cnt, const_pointer = nullptr) { // NOLINT + if (cnt > this->max_size()) { + throw std::bad_alloc{}; + } // end if - template - struct rebind { // NOLINT - using other = pinned_allocator; // NOLINT: The rebound type - }; -}; + pointer result(nullptr); + dh::safe_cuda(cudaMallocHost(reinterpret_cast(&result), cnt * sizeof(value_type))); + return result; + } + void deallocate(pointer p, size_type) { dh::safe_cuda(cudaFreeHost(p)); } // NOLINT +}; template -class pinned_allocator { +struct ManagedAllocPolicy { + using pointer = T*; // NOLINT: The type returned by address() / allocate() + using const_pointer = const T*; // NOLINT: The type returned by address() + using size_type = std::size_t; // NOLINT: The type used for the size of the allocation + using value_type = T; // NOLINT: The type of the elements in the allocator + + size_type max_size() const { // NOLINT + return std::numeric_limits::max() / sizeof(value_type); + } + + pointer allocate(size_type cnt, const_pointer = nullptr) { // NOLINT + if (cnt > this->max_size()) { + throw std::bad_alloc{}; + } // end if + + pointer result(nullptr); + dh::safe_cuda(cudaMallocManaged(reinterpret_cast(&result), cnt * sizeof(value_type))); + return result; + } + + void deallocate(pointer p, size_type) { dh::safe_cuda(cudaFree(p)); } // NOLINT +}; + +template typename Policy> +class CudaHostAllocatorImpl : public Policy { // NOLINT public: - using value_type 
= T; // NOLINT: The type of the elements in the allocator - using pointer = T*; // NOLINT: The type returned by address() / allocate() - using const_pointer = const T*; // NOLINT: The type returned by address() - using reference = T&; // NOLINT: The parameter type for address() - using const_reference = const T&; // NOLINT: The parameter type for address() - using size_type = std::size_t; // NOLINT: The type used for the size of the allocation + using value_type = typename Policy::value_type; // NOLINT + using pointer = typename Policy::pointer; // NOLINT + using const_pointer = typename Policy::const_pointer; // NOLINT + using size_type = typename Policy::size_type; // NOLINT + + using reference = T&; // NOLINT: The parameter type for address() + using const_reference = const T&; // NOLINT: The parameter type for address() + using difference_type = std::ptrdiff_t; // NOLINT: The type of the distance between two pointers template - struct rebind { // NOLINT - using other = pinned_allocator; // NOLINT: The rebound type + struct rebind { // NOLINT + using other = CudaHostAllocatorImpl; // NOLINT: The rebound type }; - XGBOOST_DEVICE inline pinned_allocator() {}; // NOLINT: host/device markup ignored on defaulted functions - XGBOOST_DEVICE inline ~pinned_allocator() {} // NOLINT: host/device markup ignored on defaulted functions - XGBOOST_DEVICE inline pinned_allocator(pinned_allocator const&) {} // NOLINT: host/device markup ignored on defaulted functions + CudaHostAllocatorImpl() = default; + ~CudaHostAllocatorImpl() = default; + CudaHostAllocatorImpl(CudaHostAllocatorImpl const&) = default; - pinned_allocator& operator=(pinned_allocator const& that) = default; - pinned_allocator& operator=(pinned_allocator&& that) = default; + CudaHostAllocatorImpl& operator=(CudaHostAllocatorImpl const& that) = default; + CudaHostAllocatorImpl& operator=(CudaHostAllocatorImpl&& that) = default; template - XGBOOST_DEVICE inline pinned_allocator(pinned_allocator const&) {} // NOLINT - - XGBOOST_DEVICE inline pointer address(reference r) { return &r; } // NOLINT - XGBOOST_DEVICE inline const_pointer address(const_reference r) { return &r; } // NOLINT - - inline pointer allocate(size_type cnt, const_pointer = nullptr) { // NOLINT - if (cnt > this->max_size()) { throw std::bad_alloc(); } // end if + CudaHostAllocatorImpl(CudaHostAllocatorImpl const&) {} // NOLINT - pointer result(nullptr); - dh::safe_cuda(cudaMallocHost(reinterpret_cast(&result), cnt * sizeof(value_type))); - return result; - } + pointer address(reference r) { return &r; } // NOLINT + const_pointer address(const_reference r) { return &r; } // NOLINT - inline void deallocate(pointer p, size_type) { dh::safe_cuda(cudaFreeHost(p)); } // NOLINT + bool operator==(CudaHostAllocatorImpl const& x) const { return true; } - inline size_type max_size() const { return (std::numeric_limits::max)() / sizeof(T); } // NOLINT + bool operator!=(CudaHostAllocatorImpl const& x) const { return !operator==(x); } +}; - XGBOOST_DEVICE inline bool operator==(pinned_allocator const& x) const { return true; } +template +using pinned_allocator = CudaHostAllocatorImpl; // NOLINT - XGBOOST_DEVICE inline bool operator!=(pinned_allocator const& x) const { - return !operator==(x); - } -}; -} // namespace cuda -} // namespace common -} // namespace xgboost +template +using managed_allocator = CudaHostAllocatorImpl; // NOLINT +} // namespace xgboost::common::cuda_impl diff --git a/src/data/ellpack_page_source.cu b/src/data/ellpack_page_source.cu index 2927d028cf79..342ac8da779d 
100644 --- a/src/data/ellpack_page_source.cu +++ b/src/data/ellpack_page_source.cu @@ -20,7 +20,7 @@ namespace xgboost::data { struct EllpackHostCache { - thrust::host_vector> cache; + thrust::host_vector> cache; void Resize(std::size_t n, dh::CUDAStreamView stream) { stream.Sync(); // Prevent partial copy inside resize. diff --git a/src/tree/gpu_hist/evaluate_splits.cuh b/src/tree/gpu_hist/evaluate_splits.cuh index 8c387f6324bb..e82bcbf82495 100644 --- a/src/tree/gpu_hist/evaluate_splits.cuh +++ b/src/tree/gpu_hist/evaluate_splits.cuh @@ -57,7 +57,7 @@ struct CatAccessor { class GPUHistEvaluator { using CatST = common::CatBitField::value_type; // categorical storage type // use pinned memory to stage the categories, used for sort based splits. - using Alloc = xgboost::common::cuda::pinned_allocator; + using Alloc = xgboost::common::cuda_impl::pinned_allocator; private: TreeEvaluator tree_evaluator_; diff --git a/tests/cpp/common/test_cuda_host_allocator.cu b/tests/cpp/common/test_cuda_host_allocator.cu new file mode 100644 index 000000000000..c8e25564ad98 --- /dev/null +++ b/tests/cpp/common/test_cuda_host_allocator.cu @@ -0,0 +1,36 @@ +/** + * Copyright 2024, XGBoost Contributors + */ +#include +#include // for Context + +#include + +#include "../../../src/common/cuda_pinned_allocator.h" +#include "../../../src/common/device_helpers.cuh" // for DefaultStream +#include "../../../src/common/numeric.h" // for Iota + +namespace xgboost { +TEST(CudaHostMalloc, Pinned) { + std::vector> vec; + vec.resize(10); + ASSERT_EQ(vec.size(), 10); + Context ctx; + common::Iota(&ctx, vec.begin(), vec.end(), 0); + float k = 0; + for (auto v : vec) { + ASSERT_EQ(v, k); + ++k; + } +} + +TEST(CudaHostMalloc, Managed) { + std::vector> vec; + vec.resize(10); +#if defined(__linux__) + dh::safe_cuda( + cudaMemPrefetchAsync(vec.data(), vec.size() * sizeof(float), 0, dh::DefaultStream())); +#endif + dh::DefaultStream().Sync(); +} +} // namespace xgboost From fd365c147ef36fcc4ea5caa3a5625842e2515a01 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sat, 17 Aug 2024 04:21:39 +0800 Subject: [PATCH 07/19] [doc] Brief note about RMM SAM allocator. [skip ci] (#10712) --- demo/rmm_plugin/README.rst | 18 +++++++++++++++++- doc/gpu/index.rst | 5 +++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/demo/rmm_plugin/README.rst b/demo/rmm_plugin/README.rst index 28b816eb2574..809d7aebd22b 100644 --- a/demo/rmm_plugin/README.rst +++ b/demo/rmm_plugin/README.rst @@ -58,4 +58,20 @@ Since with RMM the memory pool is pre-allocated on a specific device, changing t device ordinal in XGBoost can result in memory error ``cudaErrorIllegalAddress``. Use the ``CUDA_VISIBLE_DEVICES`` environment variable instead of the ``device="cuda:1"`` parameter for selecting device. For distributed training, the distributed computing frameworks like -``dask-cuda`` are responsible for device management. \ No newline at end of file +``dask-cuda`` are responsible for device management. + +************************ +Memory Over-Subscription +************************ + +.. warning:: + + This feature is still experimental and is under active development. + +The newer NVIDIA platforms like `Grace-Hopper +`__ use `NVLink-C2C +`__, which allows the CPU and GPU to +have a coherent memory model. Users can use the `SamHeadroomMemoryResource` in the latest +RMM to utilize system memory for storing data. 
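For illustration, a rough sketch of how this can be wired up from Python is shown below. The exact
import path and constructor arguments of the headroom resource are assumptions here (they depend on
the RMM release in use), so check the RMM documentation before relying on them:

.. code-block:: python

    import rmm
    import xgboost as xgb

    # Route RMM allocations through system (SAM) memory, keeping some GPU memory
    # free as headroom for other allocations (assumed constructor signature).
    mr = rmm.mr.SamHeadroomMemoryResource(headroom=8 * 1024**3)
    rmm.mr.set_current_device_resource(mr)

    # Let XGBoost obtain device memory through the RMM allocator.
    with xgb.config_context(use_rmm=True):
        dtrain = xgb.QuantileDMatrix(X, label=y)  # X, y: training data already in memory
        booster = xgb.train({"device": "cuda"}, dtrain, num_boost_round=100)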
This can help XGBoost utilize memory from +the host for GPU computation, but it may reduce performance due to slower CPU memory speed +and page migration overhead. \ No newline at end of file diff --git a/doc/gpu/index.rst b/doc/gpu/index.rst index 468362302f09..13e8c9e14918 100644 --- a/doc/gpu/index.rst +++ b/doc/gpu/index.rst @@ -50,6 +50,11 @@ Multi-node Multi-GPU Training XGBoost supports fully distributed GPU training using `Dask `_, ``Spark`` and ``PySpark``. For getting started with Dask see our tutorial :doc:`/tutorials/dask` and worked examples :doc:`/python/dask-examples/index`, also Python documentation :ref:`dask_api` for complete reference. For usage with ``Spark`` using Scala see :doc:`/jvm/xgboost4j_spark_gpu_tutorial`. Lastly for distributed GPU training with ``PySpark``, see :doc:`/tutorials/spark_estimator`. +RMM integration +=============== + +XGBoost provides optional support for RMM integration. See :doc:`/python/rmm-examples/index` for more info. + Memory usage ============ From caabee2135af58cdb31840734d1ba54958df0a2a Mon Sep 17 00:00:00 2001 From: david-cortes Date: Sun, 18 Aug 2024 17:31:38 +0200 Subject: [PATCH 08/19] [R] remove 'reshape' argument, let shapes be handled by core cpp library (#10330) --- R-package/R/callbacks.R | 3 +- R-package/R/utils.R | 5 +- R-package/R/xgb.Booster.R | 181 ++++++++---------- R-package/R/xgb.plot.shap.R | 48 +++-- R-package/man/predict.xgb.Booster.Rd | 91 +++++---- R-package/src/init.c | 2 - R-package/src/xgboost_R.cc | 25 +-- R-package/src/xgboost_R.h | 8 - R-package/tests/testthat/test_basic.R | 50 +++-- .../tests/testthat/test_booster_slicing.R | 18 +- R-package/tests/testthat/test_dmatrix.R | 6 +- R-package/tests/testthat/test_helpers.R | 24 +-- R-package/tests/testthat/test_interactions.R | 28 +-- 13 files changed, 240 insertions(+), 249 deletions(-) diff --git a/R-package/R/callbacks.R b/R-package/R/callbacks.R index 39734ab092d3..a00678a2f4dd 100644 --- a/R-package/R/callbacks.R +++ b/R-package/R/callbacks.R @@ -853,8 +853,7 @@ xgb.cb.cv.predict <- function(save_models = FALSE, outputmargin = FALSE) { pr <- predict( fd$bst, fd$evals[[2L]], - outputmargin = env$outputmargin, - reshape = TRUE + outputmargin = env$outputmargin ) if (is.null(pred)) { if (NCOL(pr) > 1L) { diff --git a/R-package/R/utils.R b/R-package/R/utils.R index d08a411e2d18..8b87468a4e8c 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -199,8 +199,7 @@ xgb.iter.update <- function(bst, dtrain, iter, obj) { bst, dtrain, outputmargin = TRUE, - training = TRUE, - reshape = TRUE + training = TRUE ) gpair <- obj(pred, dtrain) n_samples <- dim(dtrain)[1] @@ -246,7 +245,7 @@ xgb.iter.eval <- function(bst, evals, iter, feval) { res <- sapply(seq_along(evals), function(j) { w <- evals[[j]] ## predict using all trees - preds <- predict(bst, w, outputmargin = TRUE, reshape = TRUE, iterationrange = "all") + preds <- predict(bst, w, outputmargin = TRUE, iterationrange = "all") eval_res <- feval(preds, w) out <- eval_res$value names(out) <- paste0(evnames[j], "-", eval_res$metric) diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R index cfea11ae33c6..4b2ba60c388f 100644 --- a/R-package/R/xgb.Booster.R +++ b/R-package/R/xgb.Booster.R @@ -112,9 +112,6 @@ xgb.get.handle <- function(object) { #' @param predcontrib Whether to return feature contributions to individual predictions (see Details). #' @param approxcontrib Whether to use a fast approximation for feature contributions (see Details). 
#' @param predinteraction Whether to return contributions of feature interactions to individual predictions (see Details). -#' @param reshape Whether to reshape the vector of predictions to matrix form when there are several -#' prediction outputs per case. No effect if `predleaf`, `predcontrib`, -#' or `predinteraction` is `TRUE`. #' @param training Whether the prediction result is used for training. For dart booster, #' training predicting will perform dropout. #' @param iterationrange Sequence of rounds/iterations from the model to use for prediction, specified by passing @@ -128,8 +125,24 @@ xgb.get.handle <- function(object) { #' of the iterations (rounds) otherwise. #' #' If passing "all", will use all of the rounds regardless of whether the model had early stopping or not. -#' @param strict_shape Default is `FALSE`. When set to `TRUE`, the output -#' type and shape of predictions are invariant to the model type. +#' @param strict_shape Whether to always return an array with the same dimensions for the given prediction mode +#' regardless of the model type - meaning that, for example, both a multi-class and a binary classification +#' model would generate output arrays with the same number of dimensions, with the 'class' dimension having +#' size equal to '1' for the binary model. +#' +#' If passing `FALSE` (the default), dimensions will be simplified according to the model type, so that a +#' binary classification model for example would not have a redundant dimension for 'class'. +#' +#' See documentation for the return type for the exact shape of the output arrays for each prediction mode. +#' @param avoid_transpose Whether to output the resulting predictions in the same memory layout in which they +#' are generated by the core XGBoost library, without transposing them to match the expected output shape. +#' +#' Internally, XGBoost uses row-major order for the predictions it generates, while R arrays use column-major +#' order, hence the result needs to be transposed in order to have the expected shape when represented as +#' an R array or matrix, which might be a slow operation. +#' +#' If passing `TRUE`, then the result will have dimensions in reverse order - for example, rows +#' will be the last dimensions instead of the first dimension. #' @param base_margin Base margin used for boosting from existing model. #' #' Note that, if `newdata` is an `xgb.DMatrix` object, this argument will @@ -180,28 +193,46 @@ xgb.get.handle <- function(object) { #' Note that converting a matrix to [xgb.DMatrix()] uses multiple threads too. #' #' @return -#' The return type depends on `strict_shape`. If `FALSE` (default): -#' - For regression or binary classification: A vector of length `nrows(newdata)`. -#' - For multiclass classification: A vector of length `num_class * nrows(newdata)` or -#' a `(nrows(newdata), num_class)` matrix, depending on the `reshape` value. -#' - When `predleaf = TRUE`: A matrix with one column per tree. -#' - When `predcontrib = TRUE`: When not multiclass, a matrix with -#' ` num_features + 1` columns. The last "+ 1" column corresponds to the baseline value. -#' In the multiclass case, a list of `num_class` such matrices. -#' The contribution values are on the scale of untransformed margin -#' (e.g., for binary classification, the values are log-odds deviations from the baseline). -#' - When `predinteraction = TRUE`: When not multiclass, the output is a 3d array of -#' dimension `c(nrow, num_features + 1, num_features + 1)`. 
The off-diagonal (in the last two dimensions)
-#' elements represent different feature interaction contributions. The array is symmetric WRT the last
-#' two dimensions. The "+ 1" columns corresponds to the baselines. Summing this array along the last dimension should
-#' produce practically the same result as `predcontrib = TRUE`.
-#' In the multiclass case, a list of `num_class` such arrays.
-#'
-#' When `strict_shape = TRUE`, the output is always an array:
-#' - For normal predictions, the output has dimension `(num_class, nrow(newdata))`.
-#' - For `predcontrib = TRUE`, the dimension is `(ncol(newdata) + 1, num_class, nrow(newdata))`.
-#' - For `predinteraction = TRUE`, the dimension is `(ncol(newdata) + 1, ncol(newdata) + 1, num_class, nrow(newdata))`.
-#' - For `predleaf = TRUE`, the dimension is `(n_trees_in_forest, num_class, n_iterations, nrow(newdata))`.
+#' A numeric vector or array, with corresponding dimensions depending on the prediction mode and on
+#' parameter `strict_shape` as follows:
+#'
+#' If passing `strict_shape=FALSE`:\itemize{
+#' \item For regression or binary classification: a vector of length `nrows`.
+#' \item For multi-class and multi-target objectives: a matrix of dimensions `[nrows, ngroups]`.
+#'
+#' Note that objective variant `multi:softmax` defaults towards predicting most likely class (a vector
+#' `nrows`) instead of per-class probabilities.
+#' \item For `predleaf`: a matrix with one column per tree.
+#'
+#' For multi-class / multi-target, they will be arranged so that columns in the output will have
+#' the leafs from one group followed by leafs of the other group (e.g. order will be `group1:feat1`,
+#' `group1:feat2`, ..., `group2:feat1`, `group2:feat2`, ...).
+#' \item For `predcontrib`: when not multi-class / multi-target, a matrix with dimensions
+#' `[nrows, nfeats+1]`. The last "+ 1" column corresponds to the baseline value.
+#'
+#' For multi-class and multi-target objectives, will be an array with dimensions `[nrows, ngroups, nfeats+1]`.
+#'
+#' The contribution values are on the scale of untransformed margin (e.g., for binary classification,
+#' the values are log-odds deviations from the baseline).
+#' \item For `predinteraction`: when not multi-class / multi-target, the output is a 3D array of
+#' dimensions `[nrows, nfeats+1, nfeats+1]`. The off-diagonal (in the last two dimensions)
+#' elements represent different feature interaction contributions. The array is symmetric w.r.t. the last
+#' two dimensions. The "+ 1" columns corresponds to the baselines. Summing this array along the last
+#' dimension should produce practically the same result as `predcontrib = TRUE`.
+#'
+#' For multi-class and multi-target, will be a 4D array with dimensions `[nrows, ngroups, nfeats+1, nfeats+1]`
+#' }
+#'
+#' If passing `strict_shape=TRUE`, the result is always an array:\itemize{
+#' \item For normal predictions, the dimension is `[nrows, ngroups]`.
+#' \item For `predcontrib=TRUE`, the dimension is `[nrows, ngroups, nfeats+1]`.
+#' \item For `predinteraction=TRUE`, the dimension is `[nrows, ngroups, nfeats+1, nfeats+1]`.
+#' \item For `predleaf=TRUE`, the dimension is `[nrows, niter, ngroups, num_parallel_tree]`.
+#' }
+#'
+#' If passing `avoid_transpose=TRUE`, then the dimensions in all cases will be in reverse order - for
+#' example, for `predinteraction`, they will be `[nfeats+1, nfeats+1, ngroups, nrows]`
+#' instead of `[nrows, ngroups, nfeats+1, nfeats+1]`.
 #' @seealso [xgb.train()]
 #' @references
 #' 1. Scott M.
Lundberg, Su-In Lee, "A Unified Approach to Interpreting Model Predictions", @@ -279,8 +310,6 @@ xgb.get.handle <- function(object) { #' # predict for softmax returns num_class probability numbers per case: #' pred <- predict(bst, as.matrix(iris[, -5])) #' str(pred) -#' # reshape it to a num_class-columns matrix -#' pred <- matrix(pred, ncol = num_class, byrow = TRUE) #' # convert the probabilities to softmax labels #' pred_labels <- max.col(pred) - 1 #' # the following should result in the same error as seen in the last iteration @@ -311,8 +340,11 @@ xgb.get.handle <- function(object) { #' @export predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FALSE, predleaf = FALSE, predcontrib = FALSE, approxcontrib = FALSE, predinteraction = FALSE, - reshape = FALSE, training = FALSE, iterationrange = NULL, strict_shape = FALSE, + training = FALSE, iterationrange = NULL, strict_shape = FALSE, avoid_transpose = FALSE, validate_features = FALSE, base_margin = NULL, ...) { + if (NROW(list(...))) { + warning("Passed unused prediction arguments: ", paste(names(list(...)), collapse = ", "), ".") + } if (validate_features) { newdata <- validate.features(object, newdata) } @@ -415,10 +447,9 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA return(val) } - ## We set strict_shape to TRUE then drop the dimensions conditionally args <- list( training = box(training), - strict_shape = box(TRUE), + strict_shape = as.logical(strict_shape), iteration_begin = box(as.integer(iterationrange[1])), iteration_end = box(as.integer(iterationrange[2])), type = box(as.integer(0)) @@ -445,96 +476,36 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA json_conf <- jsonlite::toJSON(args, auto_unbox = TRUE) if (is_dmatrix) { - predts <- .Call( + arr <- .Call( XGBoosterPredictFromDMatrix_R, xgb.get.handle(object), newdata, json_conf ) } else if (use_as_dense_matrix) { - predts <- .Call( + arr <- .Call( XGBoosterPredictFromDense_R, xgb.get.handle(object), newdata, missing, json_conf, base_margin ) } else if (use_as_csr_matrix) { - predts <- .Call( + arr <- .Call( XGBoosterPredictFromCSR_R, xgb.get.handle(object), csr_data, missing, json_conf, base_margin ) } else if (use_as_df) { - predts <- .Call( + arr <- .Call( XGBoosterPredictFromColumnar_R, xgb.get.handle(object), newdata, missing, json_conf, base_margin ) } - names(predts) <- c("shape", "results") - shape <- predts$shape - arr <- predts$results - - n_ret <- length(arr) - if (n_row != shape[1]) { - stop("Incorrect predict shape.") - } - - .Call(XGSetArrayDimInplace_R, arr, rev(shape)) - - cnames <- if (!is.null(colnames(newdata))) c(colnames(newdata), "(Intercept)") else NULL - n_groups <- shape[2] - ## Needed regardless of whether strict shape is being used. - if (predcontrib) { - .Call(XGSetArrayDimNamesInplace_R, arr, list(cnames, NULL, NULL)) - } else if (predinteraction) { - .Call(XGSetArrayDimNamesInplace_R, arr, list(cnames, cnames, NULL, NULL)) - } - if (strict_shape) { - return(arr) # strict shape is calculated by libxgboost uniformly. 
+ if ((predcontrib || predinteraction) && !is.null(colnames(newdata))) { + cnames <- c(colnames(newdata), "(Intercept)") + dim_names <- vector(mode = "list", length = length(dim(arr))) + dim_names[[1L]] <- cnames + if (predinteraction) dim_names[[2L]] <- cnames + .Call(XGSetArrayDimNamesInplace_R, arr, dim_names) } - if (predleaf) { - ## Predict leaf - if (n_ret == n_row) { - .Call(XGSetArrayDimInplace_R, arr, c(n_row, 1L)) - } else { - arr <- matrix(arr, nrow = n_row, byrow = TRUE) - } - } else if (predcontrib) { - ## Predict contribution - arr <- aperm(a = arr, perm = c(2, 3, 1)) # [group, row, col] - if (n_ret == n_row) { - .Call(XGSetArrayDimInplace_R, arr, c(n_row, 1L)) - .Call(XGSetArrayDimNamesInplace_R, arr, list(NULL, cnames)) - } else if (n_groups != 1) { - ## turns array into list of matrices - arr <- lapply(seq_len(n_groups), function(g) arr[g, , ]) - } else { - ## remove the first axis (group) - newdim <- dim(arr)[2:3] - newdn <- dimnames(arr)[2:3] - arr <- arr[1, , ] - .Call(XGSetArrayDimInplace_R, arr, newdim) - .Call(XGSetArrayDimNamesInplace_R, arr, newdn) - } - } else if (predinteraction) { - ## Predict interaction - arr <- aperm(a = arr, perm = c(3, 4, 1, 2)) # [group, row, col, col] - if (n_ret == n_row) { - .Call(XGSetArrayDimInplace_R, arr, c(n_row, 1L)) - .Call(XGSetArrayDimNamesInplace_R, arr, list(NULL, cnames)) - } else if (n_groups != 1) { - ## turns array into list of matrices - arr <- lapply(seq_len(n_groups), function(g) arr[g, , , ]) - } else { - ## remove the first axis (group) - arr <- arr[1, , , , drop = FALSE] - newdim <- dim(arr)[2:4] - newdn <- dimnames(arr)[2:4] - .Call(XGSetArrayDimInplace_R, arr, newdim) - .Call(XGSetArrayDimNamesInplace_R, arr, newdn) - } - } else { - ## Normal prediction - if (reshape && n_groups != 1) { - arr <- matrix(arr, ncol = n_groups, byrow = TRUE) - } else { - .Call(XGSetArrayDimInplace_R, arr, NULL) - } + if (!avoid_transpose && is.array(arr)) { + arr <- aperm(arr) } + return(arr) } diff --git a/R-package/R/xgb.plot.shap.R b/R-package/R/xgb.plot.shap.R index be3f7116034c..20e8f3f4322e 100644 --- a/R-package/R/xgb.plot.shap.R +++ b/R-package/R/xgb.plot.shap.R @@ -294,8 +294,10 @@ xgb.shap.data <- function(data, shap_contrib = NULL, features = NULL, top_n = 1, if (is.null(features) && (is.null(model) || !inherits(model, "xgb.Booster"))) stop("when features are not provided, one must provide an xgb.Booster model to rank the features") + last_dim <- function(v) dim(v)[length(dim(v))] + if (!is.null(shap_contrib) && - (!is.matrix(shap_contrib) || nrow(shap_contrib) != nrow(data) || ncol(shap_contrib) != ncol(data) + 1)) + (!is.array(shap_contrib) || nrow(shap_contrib) != nrow(data) || last_dim(shap_contrib) != ncol(data) + 1)) stop("shap_contrib is not compatible with the provided data") if (is.character(features) && is.null(colnames(data))) @@ -318,19 +320,39 @@ xgb.shap.data <- function(data, shap_contrib = NULL, features = NULL, top_n = 1, colnames(data) <- paste0("X", seq_len(ncol(data))) } - if (!is.null(shap_contrib)) { - if (is.list(shap_contrib)) { # multiclass: either choose a class or merge - shap_contrib <- if (!is.null(target_class)) shap_contrib[[target_class + 1]] else Reduce("+", lapply(shap_contrib, abs)) - } - shap_contrib <- shap_contrib[idx, ] - if (is.null(colnames(shap_contrib))) { - colnames(shap_contrib) <- paste0("X", seq_len(ncol(data))) - } - } else { - shap_contrib <- predict(model, newdata = data, predcontrib = TRUE, approxcontrib = approxcontrib) - if (is.list(shap_contrib)) { # multiclass: either 
choose a class or merge - shap_contrib <- if (!is.null(target_class)) shap_contrib[[target_class + 1]] else Reduce("+", lapply(shap_contrib, abs)) + reshape_3d_shap_contrib <- function(shap_contrib, target_class) { + # multiclass: either choose a class or merge + if (is.list(shap_contrib)) { + if (!is.null(target_class)) { + shap_contrib <- shap_contrib[[target_class + 1]] + } else { + shap_contrib <- Reduce("+", lapply(shap_contrib, abs)) + } + } else if (length(dim(shap_contrib)) > 2) { + if (!is.null(target_class)) { + orig_shape <- dim(shap_contrib) + shap_contrib <- shap_contrib[, target_class + 1, , drop = TRUE] + if (!is.matrix(shap_contrib)) { + shap_contrib <- matrix(shap_contrib, orig_shape[c(1L, 3L)]) + } + } else { + shap_contrib <- apply(abs(shap_contrib), c(1L, 3L), sum) + } } + return(shap_contrib) + } + + if (is.null(shap_contrib)) { + shap_contrib <- predict( + model, + newdata = data, + predcontrib = TRUE, + approxcontrib = approxcontrib + ) + } + shap_contrib <- reshape_3d_shap_contrib(shap_contrib, target_class) + if (is.null(colnames(shap_contrib))) { + colnames(shap_contrib) <- paste0("X", seq_len(ncol(data))) } if (is.null(features)) { diff --git a/R-package/man/predict.xgb.Booster.Rd b/R-package/man/predict.xgb.Booster.Rd index 9c2e434d0625..b7a2effee9f7 100644 --- a/R-package/man/predict.xgb.Booster.Rd +++ b/R-package/man/predict.xgb.Booster.Rd @@ -13,10 +13,10 @@ predcontrib = FALSE, approxcontrib = FALSE, predinteraction = FALSE, - reshape = FALSE, training = FALSE, iterationrange = NULL, strict_shape = FALSE, + avoid_transpose = FALSE, validate_features = FALSE, base_margin = NULL, ... @@ -66,10 +66,6 @@ logistic regression would return log-odds instead of probabilities.} \item{predinteraction}{Whether to return contributions of feature interactions to individual predictions (see Details).} -\item{reshape}{Whether to reshape the vector of predictions to matrix form when there are several -prediction outputs per case. No effect if \code{predleaf}, \code{predcontrib}, -or \code{predinteraction} is \code{TRUE}.} - \item{training}{Whether the prediction result is used for training. For dart booster, training predicting will perform dropout.} @@ -86,8 +82,27 @@ base-1 indexing, and inclusive of both ends). If passing "all", will use all of the rounds regardless of whether the model had early stopping or not. }\if{html}{\out{}}} -\item{strict_shape}{Default is \code{FALSE}. When set to \code{TRUE}, the output -type and shape of predictions are invariant to the model type.} +\item{strict_shape}{Whether to always return an array with the same dimensions for the given prediction mode +regardless of the model type - meaning that, for example, both a multi-class and a binary classification +model would generate output arrays with the same number of dimensions, with the 'class' dimension having +size equal to '1' for the binary model. + +\if{html}{\out{
}}\preformatted{ If passing `FALSE` (the default), dimensions will be simplified according to the model type, so that a + binary classification model for example would not have a redundant dimension for 'class'. + + See documentation for the return type for the exact shape of the output arrays for each prediction mode. +}\if{html}{\out{
}}} + +\item{avoid_transpose}{Whether to output the resulting predictions in the same memory layout in which they +are generated by the core XGBoost library, without transposing them to match the expected output shape. + +\if{html}{\out{
}}\preformatted{ Internally, XGBoost uses row-major order for the predictions it generates, while R arrays use column-major + order, hence the result needs to be transposed in order to have the expected shape when represented as + an R array or matrix, which might be a slow operation. + + If passing `TRUE`, then the result will have dimensions in reverse order - for example, rows + will be the last dimensions instead of the first dimension. +}\if{html}{\out{
}}} \item{validate_features}{When \code{TRUE}, validate that the Booster's and newdata's feature_names match (only applicable when both \code{object} and \code{newdata} have feature names). @@ -116,32 +131,46 @@ match (only applicable when both \code{object} and \code{newdata} have feature n \item{...}{Not used.} } \value{ -The return type depends on \code{strict_shape}. If \code{FALSE} (default): -\itemize{ -\item For regression or binary classification: A vector of length \code{nrows(newdata)}. -\item For multiclass classification: A vector of length \code{num_class * nrows(newdata)} or -a \verb{(nrows(newdata), num_class)} matrix, depending on the \code{reshape} value. -\item When \code{predleaf = TRUE}: A matrix with one column per tree. -\item When \code{predcontrib = TRUE}: When not multiclass, a matrix with -\code{ num_features + 1} columns. The last "+ 1" column corresponds to the baseline value. -In the multiclass case, a list of \code{num_class} such matrices. -The contribution values are on the scale of untransformed margin -(e.g., for binary classification, the values are log-odds deviations from the baseline). -\item When \code{predinteraction = TRUE}: When not multiclass, the output is a 3d array of -dimension \code{c(nrow, num_features + 1, num_features + 1)}. The off-diagonal (in the last two dimensions) -elements represent different feature interaction contributions. The array is symmetric WRT the last -two dimensions. The "+ 1" columns corresponds to the baselines. Summing this array along the last dimension should -produce practically the same result as \code{predcontrib = TRUE}. -In the multiclass case, a list of \code{num_class} such arrays. +A numeric vector or array, with corresponding dimensions depending on the prediction mode and on +parameter \code{strict_shape} as follows: + +If passing \code{strict_shape=FALSE}:\itemize{ +\item For regression or binary classification: a vector of length \code{nrows}. +\item For multi-class and multi-target objectives: a matrix of dimensions \verb{[nrows, ngroups]}. + +Note that objective variant \code{multi:softmax} defaults towards predicting most likely class (a vector +\code{nrows}) instead of per-class probabilities. +\item For \code{predleaf}: a matrix with one column per tree. + +For multi-class / multi-target, they will be arranged so that columns in the output will have +the leafs from one group followed by leafs of the other group (e.g. order will be \code{group1:feat1}, +\code{group1:feat2}, ..., \code{group2:feat1}, \code{group2:feat2}, ...). +\item For \code{predcontrib}: when not multi-class / multi-target, a matrix with dimensions +\verb{[nrows, nfeats+1]}. The last "+ 1" column corresponds to the baseline value. + +For multi-class and multi-target objectives, will be an array with dimensions \verb{[nrows, ngroups, nfeats+1]}. + +The contribution values are on the scale of untransformed margin (e.g., for binary classification, +the values are log-odds deviations from the baseline). +\item For \code{predinteraction}: when not multi-class / multi-target, the output is a 3D array of +dimensions \verb{[nrows, nfeats+1, nfeats+1]}. The off-diagonal (in the last two dimensions) +elements represent different feature interaction contributions. The array is symmetric w.r.t. the last +two dimensions. The "+ 1" columns corresponds to the baselines. Summing this array along the last +dimension should produce practically the same result as \code{predcontrib = TRUE}. 
+
+For multi-class and multi-target, will be a 4D array with dimensions \verb{[nrows, ngroups, nfeats+1, nfeats+1]}
 }
-When \code{strict_shape = TRUE}, the output is always an array:
-\itemize{
-\item For normal predictions, the output has dimension \verb{(num_class, nrow(newdata))}.
-\item For \code{predcontrib = TRUE}, the dimension is \verb{(ncol(newdata) + 1, num_class, nrow(newdata))}.
-\item For \code{predinteraction = TRUE}, the dimension is \verb{(ncol(newdata) + 1, ncol(newdata) + 1, num_class, nrow(newdata))}.
-\item For \code{predleaf = TRUE}, the dimension is \verb{(n_trees_in_forest, num_class, n_iterations, nrow(newdata))}.
+If passing \code{strict_shape=TRUE}, the result is always an array:\itemize{
+\item For normal predictions, the dimension is \verb{[nrows, ngroups]}.
+\item For \code{predcontrib=TRUE}, the dimension is \verb{[nrows, ngroups, nfeats+1]}.
+\item For \code{predinteraction=TRUE}, the dimension is \verb{[nrows, ngroups, nfeats+1, nfeats+1]}.
+\item For \code{predleaf=TRUE}, the dimension is \verb{[nrows, niter, ngroups, num_parallel_tree]}.
 }
+
+If passing \code{avoid_transpose=TRUE}, then the dimensions in all cases will be in reverse order - for
+example, for \code{predinteraction}, they will be \verb{[nfeats+1, nfeats+1, ngroups, nrows]}
+instead of \verb{[nrows, ngroups, nfeats+1, nfeats+1]}.
 }
 \description{
 Predict values on data based on xgboost model.
@@ -241,8 +270,6 @@ bst <- xgb.train(
 # predict for softmax returns num_class probability numbers per case:
 pred <- predict(bst, as.matrix(iris[, -5]))
 str(pred)
-# reshape it to a num_class-columns matrix
-pred <- matrix(pred, ncol = num_class, byrow = TRUE)
 # convert the probabilities to softmax labels
 pred_labels <- max.col(pred) - 1
 # the following should result in the same error as seen in the last iteration
diff --git a/R-package/src/init.c b/R-package/src/init.c
index 5db3218b4e1b..16c1d3b14189 100644
--- a/R-package/src/init.c
+++ b/R-package/src/init.c
@@ -45,7 +45,6 @@ extern SEXP XGBoosterSetAttr_R(SEXP, SEXP, SEXP);
 extern SEXP XGBoosterSetParam_R(SEXP, SEXP, SEXP);
 extern SEXP XGBoosterUpdateOneIter_R(SEXP, SEXP, SEXP);
 extern SEXP XGCheckNullPtr_R(SEXP);
-extern SEXP XGSetArrayDimInplace_R(SEXP, SEXP);
 extern SEXP XGSetArrayDimNamesInplace_R(SEXP, SEXP);
 extern SEXP XGDMatrixCreateFromCSC_R(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
 extern SEXP XGDMatrixCreateFromCSR_R(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
@@ -108,7 +107,6 @@ static const R_CallMethodDef CallEntries[] = {
 {"XGBoosterSetParam_R", (DL_FUNC) &XGBoosterSetParam_R, 3},
 {"XGBoosterUpdateOneIter_R", (DL_FUNC) &XGBoosterUpdateOneIter_R, 3},
 {"XGCheckNullPtr_R", (DL_FUNC) &XGCheckNullPtr_R, 1},
-{"XGSetArrayDimInplace_R", (DL_FUNC) &XGSetArrayDimInplace_R, 2},
 {"XGSetArrayDimNamesInplace_R", (DL_FUNC) &XGSetArrayDimNamesInplace_R, 2},
 {"XGDMatrixCreateFromCSC_R", (DL_FUNC) &XGDMatrixCreateFromCSC_R, 6},
 {"XGDMatrixCreateFromCSR_R", (DL_FUNC) &XGDMatrixCreateFromCSR_R, 6},
diff --git a/R-package/src/xgboost_R.cc b/R-package/src/xgboost_R.cc
index 989d3bc0d48e..5faae8a9fda0 100644
--- a/R-package/src/xgboost_R.cc
+++ b/R-package/src/xgboost_R.cc
@@ -330,11 +330,6 @@ XGB_DLL SEXP XGCheckNullPtr_R(SEXP handle) {
 return Rf_ScalarLogical(R_ExternalPtrAddr(handle) == nullptr);
 }
-XGB_DLL SEXP XGSetArrayDimInplace_R(SEXP arr, SEXP dims) {
-Rf_setAttrib(arr, R_DimSymbol, dims);
-return R_NilValue;
-}
-
 XGB_DLL SEXP XGSetArrayDimNamesInplace_R(SEXP arr, SEXP dim_names) {
 Rf_setAttrib(arr, R_DimNamesSymbol, dim_names);
 return R_NilValue;
 }
@@
-1301,12 +1296,9 @@ enum class PredictionInputType {DMatrix, DenseMatrix, CSRMatrix, DataFrame}; SEXP XGBoosterPredictGeneric(SEXP handle, SEXP input_data, SEXP json_config, PredictionInputType input_type, SEXP missing, SEXP base_margin) { - SEXP r_out_shape; - SEXP r_out_result; - SEXP r_out = Rf_protect(Rf_allocVector(VECSXP, 2)); - SEXP json_config_ = Rf_protect(Rf_asChar(json_config)); - + SEXP r_out_result = R_NilValue; R_API_BEGIN(); + SEXP json_config_ = Rf_protect(Rf_asChar(json_config)); char const *c_json_config = CHAR(json_config_); bst_ulong out_dim; @@ -1386,23 +1378,24 @@ SEXP XGBoosterPredictGeneric(SEXP handle, SEXP input_data, SEXP json_config, } CHECK_CALL(res_code); - r_out_shape = Rf_protect(Rf_allocVector(INTSXP, out_dim)); + SEXP r_out_shape = Rf_protect(Rf_allocVector(INTSXP, out_dim)); size_t len = 1; int *r_out_shape_ = INTEGER(r_out_shape); for (size_t i = 0; i < out_dim; ++i) { - r_out_shape_[i] = out_shape[i]; + r_out_shape_[out_dim - i - 1] = out_shape[i]; len *= out_shape[i]; } r_out_result = Rf_protect(Rf_allocVector(REALSXP, len)); std::copy(out_result, out_result + len, REAL(r_out_result)); - SET_VECTOR_ELT(r_out, 0, r_out_shape); - SET_VECTOR_ELT(r_out, 1, r_out_result); + if (out_dim > 1) { + Rf_setAttrib(r_out_result, R_DimSymbol, r_out_shape); + } R_API_END(); - Rf_unprotect(4); + Rf_unprotect(3); - return r_out; + return r_out_result; } } // namespace diff --git a/R-package/src/xgboost_R.h b/R-package/src/xgboost_R.h index 7f6554cb6961..08f16bac109c 100644 --- a/R-package/src/xgboost_R.h +++ b/R-package/src/xgboost_R.h @@ -26,14 +26,6 @@ */ XGB_DLL SEXP XGCheckNullPtr_R(SEXP handle); -/*! - * \brief set the dimensions of an array in-place - * \param arr - * \param dims dimensions to set to the array - * \return NULL value - */ -XGB_DLL SEXP XGSetArrayDimInplace_R(SEXP arr, SEXP dims); - /*! 
* \brief set the names of the dimensions of an array in-place * \param arr diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index bbb8fb323478..f0ebd7a1c9b5 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -162,20 +162,20 @@ test_that("train and predict softprob", { pred <- predict(bst, as.matrix(iris[, -5])) expect_length(pred, nrow(iris) * 3) # row sums add up to total probability of 1: - expect_equal(rowSums(matrix(pred, ncol = 3, byrow = TRUE)), rep(1, nrow(iris)), tolerance = 1e-7) + expect_equal(rowSums(pred), rep(1, nrow(iris)), tolerance = 1e-7) # manually calculate error at the last iteration: - mpred <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE) - expect_equal(as.numeric(t(mpred)), pred) + mpred <- predict(bst, as.matrix(iris[, -5])) + expect_equal(mpred, pred) pred_labels <- max.col(mpred) - 1 err <- sum(pred_labels != lb) / length(lb) expect_equal(attributes(bst)$evaluation_log[5, train_merror], err, tolerance = 5e-6) # manually calculate error at the 1st iteration: - mpred <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE, iterationrange = c(1, 1)) + mpred <- predict(bst, as.matrix(iris[, -5]), iterationrange = c(1, 1)) pred_labels <- max.col(mpred) - 1 err <- sum(pred_labels != lb) / length(lb) expect_equal(attributes(bst)$evaluation_log[1, train_merror], err, tolerance = 5e-6) - mpred1 <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE, iterationrange = c(1, 1)) + mpred1 <- predict(bst, as.matrix(iris[, -5]), iterationrange = c(1, 1)) expect_equal(mpred, mpred1) d <- cbind( @@ -190,7 +190,7 @@ test_that("train and predict softprob", { data = dtrain, nrounds = 4, num_class = 10, objective = "multi:softprob" ) - predt <- predict(booster, as.matrix(d), reshape = TRUE, strict_shape = FALSE) + predt <- predict(booster, as.matrix(d), strict_shape = FALSE) expect_equal(ncol(predt), 10) expect_equal(rowSums(predt), rep(1, 100), tolerance = 1e-7) }) @@ -254,13 +254,13 @@ test_that("train and predict RF with softprob", { ) expect_equal(xgb.get.num.boosted.rounds(bst), 15) # predict for all iterations: - pred <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE) + pred <- predict(bst, as.matrix(iris[, -5])) expect_equal(dim(pred), c(nrow(iris), 3)) pred_labels <- max.col(pred) - 1 err <- sum(pred_labels != lb) / length(lb) expect_equal(attributes(bst)$evaluation_log[nrounds, train_merror], err, tolerance = 5e-6) # predict for 7 iterations and adjust for 4 parallel trees per iteration - pred <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE, iterationrange = c(1, 7)) + pred <- predict(bst, as.matrix(iris[, -5]), iterationrange = c(1, 7)) err <- sum((max.col(pred) - 1) != lb) / length(lb) expect_equal(attributes(bst)$evaluation_log[7, train_merror], err, tolerance = 5e-6) }) @@ -485,15 +485,25 @@ test_that("strict_shape works", { n_rows <- nrow(X) n_cols <- ncol(X) - expect_equal(dim(predt), c(n_groups, n_rows)) - expect_equal(dim(margin), c(n_groups, n_rows)) - expect_equal(dim(contri), c(n_cols + 1, n_groups, n_rows)) - expect_equal(dim(interact), c(n_cols + 1, n_cols + 1, n_groups, n_rows)) - expect_equal(dim(leaf), c(1, n_groups, n_rounds, n_rows)) + expect_equal(dim(predt), c(n_rows, n_groups)) + expect_equal(dim(margin), c(n_rows, n_groups)) + expect_equal(dim(contri), c(n_rows, n_groups, n_cols + 1)) + expect_equal(dim(interact), c(n_rows, n_groups, n_cols + 1, n_cols + 1)) + expect_equal(dim(leaf), c(n_rows, n_rounds, n_groups, 1)) if (n_groups != 1) { for (g in 
seq_len(n_groups)) { - expect_lt(max(abs(colSums(contri[, g, ]) - margin[g, ])), 1e-5) + expect_lt(max(abs(rowSums(contri[, g, ]) - margin[, g])), 1e-5) + } + + leaf_no_strict <- predict(bst, X, strict_shape = FALSE, predleaf = TRUE) + for (g in seq_len(n_groups)) { + g_mask <- rep(FALSE, n_groups) + g_mask[g] <- TRUE + expect_equal( + leaf[, , g, 1L], + leaf_no_strict[, g_mask] + ) } } } @@ -562,7 +572,7 @@ test_that("Quantile regression accepts multiple quantiles", { ), nrounds = 15 ) - pred <- predict(model, x, reshape = TRUE) + pred <- predict(model, x) expect_equal(dim(pred)[1], nrow(x)) expect_equal(dim(pred)[2], 3) @@ -590,7 +600,7 @@ test_that("Can use multi-output labels with built-in objectives", { data = dm, nrounds = 5 ) - pred <- predict(model, x, reshape = TRUE) + pred <- predict(model, x) expect_equal(pred[, 1], -pred[, 2]) expect_true(cor(y, pred[, 1]) > 0.9) expect_true(cor(y, pred[, 2]) < -0.9) @@ -619,7 +629,7 @@ test_that("Can use multi-output labels with custom objectives", { data = dm, nrounds = 5 ) - pred <- predict(model, x, reshape = TRUE) + pred <- predict(model, x) expect_equal(pred[, 1], -pred[, 2]) expect_true(cor(y, pred[, 1]) > 0.9) expect_true(cor(y, pred[, 2]) < -0.9) @@ -666,8 +676,8 @@ test_that("Can predict on data.frame objects", { nrounds = 5 ) - pred_mat <- predict(model, xgb.DMatrix(x_mat), nthread = n_threads) - pred_df <- predict(model, x_df, nthread = n_threads) + pred_mat <- predict(model, xgb.DMatrix(x_mat)) + pred_df <- predict(model, x_df) expect_equal(pred_mat, pred_df) }) @@ -737,7 +747,7 @@ test_that("Coefficients from gblinear have the expected shape and names", { expect_equal(nrow(coefs), ncol(x) + 1) expect_equal(ncol(coefs), 3) expect_equal(row.names(coefs), c("(Intercept)", colnames(x))) - pred_auto <- predict(model, x, outputmargin = TRUE, reshape = TRUE) + pred_auto <- predict(model, x, outputmargin = TRUE) pred_manual <- unname(mm %*% coefs) expect_equal(pred_manual, pred_auto, tolerance = 1e-7) }) diff --git a/R-package/tests/testthat/test_booster_slicing.R b/R-package/tests/testthat/test_booster_slicing.R index 711ccd8b6ae9..f80968e06476 100644 --- a/R-package/tests/testthat/test_booster_slicing.R +++ b/R-package/tests/testthat/test_booster_slicing.R @@ -9,7 +9,7 @@ model <- xgb.train( data = dm, nrounds = 20 ) -pred <- predict(model, dm, predleaf = TRUE, reshape = TRUE) +pred <- predict(model, dm, predleaf = TRUE) test_that("Slicing full model", { new_model <- xgb.slice.Booster(model, 1, 0) @@ -24,32 +24,32 @@ test_that("Slicing full model", { test_that("Slicing sequence from start", { new_model <- xgb.slice.Booster(model, 1, 10) - new_pred <- predict(new_model, dm, predleaf = TRUE, reshape = TRUE) + new_pred <- predict(new_model, dm, predleaf = TRUE) expect_equal(new_pred, pred[, seq(1, 10)]) new_model <- model[1:10] - new_pred <- predict(new_model, dm, predleaf = TRUE, reshape = TRUE) + new_pred <- predict(new_model, dm, predleaf = TRUE) expect_equal(new_pred, pred[, seq(1, 10)]) }) test_that("Slicing sequence from middle", { new_model <- xgb.slice.Booster(model, 5, 10) - new_pred <- predict(new_model, dm, predleaf = TRUE, reshape = TRUE) + new_pred <- predict(new_model, dm, predleaf = TRUE) expect_equal(new_pred, pred[, seq(5, 10)]) new_model <- model[5:10] - new_pred <- predict(new_model, dm, predleaf = TRUE, reshape = TRUE) + new_pred <- predict(new_model, dm, predleaf = TRUE) expect_equal(new_pred, pred[, seq(5, 10)]) }) test_that("Slicing with non-unit step", { for (s in 2:5) { new_model <- xgb.slice.Booster(model, 1, 17, 
s) - new_pred <- predict(new_model, dm, predleaf = TRUE, reshape = TRUE) + new_pred <- predict(new_model, dm, predleaf = TRUE) expect_equal(new_pred, pred[, seq(1, 17, s)]) new_model <- model[seq(1, 17, s)] - new_pred <- predict(new_model, dm, predleaf = TRUE, reshape = TRUE) + new_pred <- predict(new_model, dm, predleaf = TRUE) expect_equal(new_pred, pred[, seq(1, 17, s)]) } }) @@ -57,11 +57,11 @@ test_that("Slicing with non-unit step", { test_that("Slicing with non-unit step from middle", { for (s in 2:5) { new_model <- xgb.slice.Booster(model, 4, 17, s) - new_pred <- predict(new_model, dm, predleaf = TRUE, reshape = TRUE) + new_pred <- predict(new_model, dm, predleaf = TRUE) expect_equal(new_pred, pred[, seq(4, 17, s)]) new_model <- model[seq(4, 17, s)] - new_pred <- predict(new_model, dm, predleaf = TRUE, reshape = TRUE) + new_pred <- predict(new_model, dm, predleaf = TRUE) expect_equal(new_pred, pred[, seq(4, 17, s)]) } }) diff --git a/R-package/tests/testthat/test_dmatrix.R b/R-package/tests/testthat/test_dmatrix.R index 548afece378c..cca7b88da5a9 100644 --- a/R-package/tests/testthat/test_dmatrix.R +++ b/R-package/tests/testthat/test_dmatrix.R @@ -400,12 +400,10 @@ test_that("xgb.DMatrix: can take multi-dimensional 'base_margin'", { ), nround = 1 ) - pred_only_x <- predict(model, x, nthread = n_threads, reshape = TRUE) + pred_only_x <- predict(model, x) pred_w_base <- predict( model, - xgb.DMatrix(data = x, base_margin = b, nthread = n_threads), - nthread = n_threads, - reshape = TRUE + xgb.DMatrix(data = x, base_margin = b) ) expect_equal(pred_only_x, pred_w_base - b, tolerance = 1e-5) }) diff --git a/R-package/tests/testthat/test_helpers.R b/R-package/tests/testthat/test_helpers.R index 16831cf38c26..7724d6bc5da6 100644 --- a/R-package/tests/testthat/test_helpers.R +++ b/R-package/tests/testthat/test_helpers.R @@ -132,31 +132,31 @@ test_that("predict feature contributions works", { tolerance = float_tolerance) # gbtree multiclass - pred <- predict(mbst.Tree, as.matrix(iris[, -5]), outputmargin = TRUE, reshape = TRUE) + pred <- predict(mbst.Tree, as.matrix(iris[, -5]), outputmargin = TRUE) pred_contr <- predict(mbst.Tree, as.matrix(iris[, -5]), predcontrib = TRUE) - expect_is(pred_contr, "list") - expect_length(pred_contr, 3) - for (g in seq_along(pred_contr)) { - expect_equal(colnames(pred_contr[[g]]), c(colnames(iris[, -5]), "(Intercept)")) - expect_lt(max(abs(rowSums(pred_contr[[g]]) - pred[, g])), 1e-5) + expect_is(pred_contr, "array") + expect_length(dim(pred_contr), 3) + for (g in seq_len(dim(pred_contr)[2])) { + expect_equal(colnames(pred_contr[, g, ]), c(colnames(iris[, -5]), "(Intercept)")) + expect_lt(max(abs(rowSums(pred_contr[, g, ]) - pred[, g])), 1e-5) } # gblinear multiclass (set base_score = 0, which is base margin in multiclass) - pred <- predict(mbst.GLM, as.matrix(iris[, -5]), outputmargin = TRUE, reshape = TRUE) + pred <- predict(mbst.GLM, as.matrix(iris[, -5]), outputmargin = TRUE) pred_contr <- predict(mbst.GLM, as.matrix(iris[, -5]), predcontrib = TRUE) - expect_length(pred_contr, 3) + expect_length(dim(pred_contr), 3) coefs_all <- matrix( data = as.numeric(xgb.dump(mbst.GLM)[-c(1, 2, 6)]), ncol = 3, byrow = TRUE ) - for (g in seq_along(pred_contr)) { - expect_equal(colnames(pred_contr[[g]]), c(colnames(iris[, -5]), "(Intercept)")) - expect_lt(max(abs(rowSums(pred_contr[[g]]) - pred[, g])), float_tolerance) + for (g in seq_along(dim(pred_contr)[2])) { + expect_equal(colnames(pred_contr[, g, ]), c(colnames(iris[, -5]), "(Intercept)")) + 
expect_lt(max(abs(rowSums(pred_contr[, g, ]) - pred[, g])), float_tolerance) # manual calculation of linear terms coefs <- c(coefs_all[-1, g], coefs_all[1, g]) # intercept needs to be the last pred_contr_manual <- sweep(as.matrix(cbind(iris[, -5], 1)), 2, coefs, FUN = "*") - expect_equal(as.numeric(pred_contr[[g]]), as.numeric(pred_contr_manual), + expect_equal(as.numeric(pred_contr[, g, ]), as.numeric(pred_contr_manual), tolerance = float_tolerance) } }) diff --git a/R-package/tests/testthat/test_interactions.R b/R-package/tests/testthat/test_interactions.R index 60cf9d80039a..1380225c79f7 100644 --- a/R-package/tests/testthat/test_interactions.R +++ b/R-package/tests/testthat/test_interactions.R @@ -127,41 +127,23 @@ test_that("multiclass feature interactions work", { eta = 0.1, max_depth = 4, objective = 'multi:softprob', num_class = 3, nthread = n_threads ) b <- xgb.train(param, dm, 40) - pred <- t( - array( - data = predict(b, dm, outputmargin = TRUE), - dim = c(3, 150) - ) - ) + pred <- predict(b, dm, outputmargin = TRUE) # SHAP contributions: cont <- predict(b, dm, predcontrib = TRUE) - expect_length(cont, 3) - # rewrap them as a 3d array - cont <- array( - data = unlist(cont), - dim = c(150, 5, 3) - ) + expect_length(dim(cont), 3) # make sure for each row they add up to marginal predictions - expect_lt(max(abs(apply(cont, c(1, 3), sum) - pred)), 0.001) + expect_lt(max(abs(apply(cont, c(1, 2), sum) - pred)), 0.001) # SHAP interaction contributions: intr <- predict(b, dm, predinteraction = TRUE) - expect_length(intr, 3) - # rewrap them as a 4d array - intr <- aperm( - a = array( - data = unlist(intr), - dim = c(150, 5, 5, 3) - ), - perm = c(4, 1, 2, 3) # [grp, row, col, col] - ) + expect_length(dim(intr), 4) # check the symmetry expect_lt(max(abs(aperm(intr, c(1, 2, 4, 3)) - intr)), 0.00001) # sums WRT columns must be close to feature contributions - expect_lt(max(abs(apply(intr, c(1, 2, 3), sum) - aperm(cont, c(3, 1, 2)))), 0.00001) + expect_lt(max(abs(apply(intr, c(1, 2, 3), sum) - cont)), 0.00001) }) From 5db0803eb2fe874c431786153729c57a54463990 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 19 Aug 2024 03:50:37 -0500 Subject: [PATCH 09/19] ignore UBJSON files in gitignore (#10718) --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index a53ce7fd09d9..4a780c305d1e 100644 --- a/.gitignore +++ b/.gitignore @@ -84,6 +84,7 @@ target *.gcov *.gcda *.gcno +*.ubj build_tests /tests/cpp/xgboost_test From b949a4bf7b4ea528a47052d534f8134ff77e9bbc Mon Sep 17 00:00:00 2001 From: Michael Mayer Date: Tue, 20 Aug 2024 07:33:13 +0200 Subject: [PATCH 10/19] [R] Work on Roxygen documentation (#10674) --- R-package/R/callbacks.R | 597 ++++++++++-------- R-package/R/utils.R | 87 +-- R-package/R/xgb.Booster.R | 298 +++++---- R-package/R/xgb.DMatrix.R | 14 +- R-package/R/xgb.create.features.R | 34 +- R-package/R/xgb.dump.R | 56 +- R-package/R/xgb.ggplot.R | 5 +- R-package/R/xgb.importance.R | 26 +- R-package/R/xgb.load.R | 28 +- R-package/R/xgb.load.raw.R | 6 +- R-package/R/xgb.model.dt.tree.R | 21 +- R-package/R/xgb.plot.deepness.R | 3 +- R-package/R/xgb.plot.importance.R | 35 +- R-package/R/xgb.plot.multi.trees.R | 8 +- R-package/R/xgb.plot.shap.R | 30 +- R-package/R/xgb.plot.tree.R | 56 +- R-package/R/xgb.save.R | 56 +- R-package/R/xgb.save.raw.R | 37 +- R-package/R/xgboost.R | 1 + .../a-compatibility-note-for-saveRDS-save.Rd | 79 ++- R-package/man/coef.xgb.Booster.Rd | 12 +- R-package/man/predict.xgb.Booster.Rd | 120 ++-- 
R-package/man/print.xgb.Booster.Rd | 1 - R-package/man/variable.names.xgb.Booster.Rd | 4 +- R-package/man/xgb.Callback.Rd | 63 +- R-package/man/xgb.DMatrix.Rd | 8 +- R-package/man/xgb.DataBatch.Rd | 8 +- R-package/man/xgb.attr.Rd | 16 +- R-package/man/xgb.cb.cv.predict.Rd | 8 +- R-package/man/xgb.cb.early.stop.Rd | 6 +- R-package/man/xgb.cb.evaluation.log.Rd | 4 +- R-package/man/xgb.cb.gblinear.history.Rd | 113 +++- R-package/man/xgb.cb.print.evaluation.Rd | 6 +- R-package/man/xgb.cb.reset.parameters.Rd | 8 +- R-package/man/xgb.cb.save.model.Rd | 15 +- R-package/man/xgb.config.Rd | 8 +- R-package/man/xgb.copy.Booster.Rd | 6 +- R-package/man/xgb.create.features.Rd | 24 +- R-package/man/xgb.dump.Rd | 46 +- R-package/man/xgb.gblinear.history.Rd | 22 +- R-package/man/xgb.get.num.boosted.rounds.Rd | 4 +- R-package/man/xgb.is.same.Booster.Rd | 15 +- R-package/man/xgb.load.Rd | 25 +- R-package/man/xgb.load.raw.Rd | 6 +- R-package/man/xgb.model.dt.tree.Rd | 9 +- R-package/man/xgb.parameters.Rd | 9 +- R-package/man/xgb.plot.deepness.Rd | 3 +- R-package/man/xgb.plot.importance.Rd | 4 +- R-package/man/xgb.plot.multi.trees.Rd | 6 +- R-package/man/xgb.plot.shap.Rd | 8 +- R-package/man/xgb.plot.shap.summary.Rd | 7 +- R-package/man/xgb.plot.tree.Rd | 12 +- R-package/man/xgb.save.Rd | 42 +- R-package/man/xgb.save.raw.Rd | 35 +- R-package/man/xgb.slice.Booster.Rd | 7 +- 55 files changed, 1174 insertions(+), 993 deletions(-) diff --git a/R-package/R/callbacks.R b/R-package/R/callbacks.R index a00678a2f4dd..e26eb64b5268 100644 --- a/R-package/R/callbacks.R +++ b/R-package/R/callbacks.R @@ -1,172 +1,166 @@ .reserved_cb_names <- c("names", "class", "call", "params", "niter", "nfeatures", "folds") -#' @title XGBoost Callback Constructor -#' @description Constructor for defining the structure of callback functions that can be executed +#' XGBoost Callback Constructor +#' +#' Constructor for defining the structure of callback functions that can be executed #' at different stages of model training (before / after training, before / after each boosting #' iteration). +#' +#' @details +#' Arguments that will be passed to the supplied functions are as follows: +#' - env The same environment that is passed under argument `env`. +#' +#' It may be modified by the functions in order to e.g. keep tracking of what happens +#' across iterations or similar. +#' +#' This environment is only used by the functions supplied to the callback, and will +#' not be kept after the model fitting function terminates (see parameter `f_after_training`). +#' +#' - model The booster object when using [xgb.train()], or the folds when using [xgb.cv()]. +#' +#' For [xgb.cv()], folds are a list with a structure as follows: +#' - `dtrain`: The training data for the fold (as an `xgb.DMatrix` object). +#' - `bst`: Rhe `xgb.Booster` object for the fold. +#' - `evals`: A list containing two DMatrices, with names `train` and `test` +#' (`test` is the held-out data for the fold). +#' - `index`: The indices of the hold-out data for that fold (base-1 indexing), +#' from which the `test` entry in `evals` was obtained. +#' +#' This object should **not** be in-place modified in ways that conflict with the +#' training (e.g. resetting the parameters for a training update in a way that resets +#' the number of rounds to zero in order to overwrite rounds). +#' +#' Note that any R attributes that are assigned to the booster during the callback functions, +#' will not be kept thereafter as the booster object variable is not re-assigned during +#' training. 
It is however possible to set C-level attributes of the booster through +#' [xgb.attr()] or [xgb.attributes()], which should remain available for the rest +#' of the iterations and after the training is done. +#' +#' For keeping variables across iterations, it's recommended to use `env` instead. +#' - data The data to which the model is being fit, as an `xgb.DMatrix` object. +#' +#' Note that, for [xgb.cv()], this will be the full data, while data for the specific +#' folds can be found in the `model` object. +#' - evals The evaluation data, as passed under argument `evals` to [xgb.train()]. +#' +#' For [xgb.cv()], this will always be `NULL`. +#' - begin_iteration Index of the first boosting iteration that will be executed (base-1 indexing). +#' +#' This will typically be '1', but when using training continuation, depending on the +#' parameters for updates, boosting rounds will be continued from where the previous +#' model ended, in which case this will be larger than 1. +#' +#' - end_iteration Index of the last boostign iteration that will be executed +#' (base-1 indexing, inclusive of this end). +#' +#' It should match with argument `nrounds` passed to [xgb.train()] or [xgb.cv()]. +#' +#' Note that boosting might be interrupted before reaching this last iteration, for +#' example by using the early stopping callback \link{xgb.cb.early.stop}. +#' - iteration Index of the iteration number that is being executed (first iteration +#' will be the same as parameter `begin_iteration`, then next one will add +1, and so on). +#' +#' - iter_feval Evaluation metrics for `evals` that were supplied, either +#' determined by the objective, or by parameter `feval`. +#' +#' For [xgb.train()], this will be a named vector with one entry per element in +#' `evals`, where the names are determined as 'evals name' + '-' + 'metric name' - for +#' example, if `evals` contains an entry named "tr" and the metric is "rmse", +#' this will be a one-element vector with name "tr-rmse". +#' +#' For [xgb.cv()], this will be a 2d matrix with dimensions `[length(evals), nfolds]`, +#' where the row names will follow the same naming logic as the one-dimensional vector +#' that is passed in [xgb.train()]. +#' +#' Note that, internally, the built-in callbacks such as [xgb.cb.print.evaluation] summarize +#' this table by calculating the row-wise means and standard deviations. +#' +#' - final_feval The evaluation results after the last boosting round is executed +#' (same format as `iter_feval`, and will be the exact same input as passed under +#' `iter_feval` to the last round that is executed during model fitting). +#' +#' - prev_cb_res Result from a previous run of a callback sharing the same name +#' (as given by parameter `cb_name`) when conducting training continuation, if there +#' was any in the booster R attributes. +#' +#' Sometimes, one might want to append the new results to the previous one, and this will +#' be done automatically by the built-in callbacks such as [xgb.cb.evaluation.log], +#' which will append the new rows to the previous table. +#' +#' If no such previous callback result is available (which it never will when fitting +#' a model from start instead of updating an existing model), this will be `NULL`. +#' +#' For [xgb.cv()], which doesn't support training continuation, this will always be `NULL`. 
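As a rough sketch of how the arguments described above fit together (assuming the supplied functions
follow this same argument order, and with `params` and `dtrain` standing in for a parameter list and
a training DMatrix), a callback that keeps a per-iteration record of the first evaluation metric
could look like this:

    eval_history_cb <- xgb.Callback(
      cb_name = "eval_history",
      f_after_iter = function(env, model, data, evals, iteration, iter_feval) {
        # 'env' persists across iterations, so it can accumulate results
        env$history <- c(env$history, iter_feval[1L])
        FALSE  # returning TRUE here would stop boosting early
      },
      f_after_training = function(env, model, data, evals, iteration, final_feval, prev_cb_res) {
        # a non-NULL result is attached to the resulting booster under the callback's name
        env$history
      }
    )

    bst <- xgb.train(params, dtrain, nrounds = 10, evals = list(train = dtrain),
                     callbacks = list(eval_history_cb))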
+#' +#' The following names (`cb_name` values) are reserved for internal callbacks: +#' - print_evaluation +#' - evaluation_log +#' - reset_parameters +#' - early_stop +#' - save_model +#' - cv_predict +#' - gblinear_history +#' +#' The following names are reserved for other non-callback attributes: +#' - names +#' - class +#' - call +#' - params +#' - niter +#' - nfeatures +#' - folds +#' +#' When using the built-in early stopping callback ([xgb.cb.early.stop]), said callback +#' will always be executed before the others, as it sets some booster C-level attributes +#' that other callbacks might also use. Otherwise, the order of execution will match with +#' the order in which the callbacks are passed to the model fitting function. +#' #' @param cb_name Name for the callback. #' -#' If the callback produces some non-NULL result (from executing the function passed under -#' `f_after_training`), that result will be added as an R attribute to the resulting booster -#' (or as a named element in the result of CV), with the attribute name specified here. +#' If the callback produces some non-NULL result (from executing the function passed under +#' `f_after_training`), that result will be added as an R attribute to the resulting booster +#' (or as a named element in the result of CV), with the attribute name specified here. #' -#' Names of callbacks must be unique - i.e. there cannot be two callbacks with the same name. +#' Names of callbacks must be unique - i.e. there cannot be two callbacks with the same name. #' @param env An environment object that will be passed to the different functions in the callback. -#' Note that this environment will not be shared with other callbacks. +#' Note that this environment will not be shared with other callbacks. #' @param f_before_training A function that will be executed before the training has started. #' -#' If passing `NULL` for this or for the other function inputs, then no function will be executed. +#' If passing `NULL` for this or for the other function inputs, then no function will be executed. #' -#' If passing a function, it will be called with parameters supplied as non-named arguments -#' matching the function signatures that are shown in the default value for each function argument. +#' If passing a function, it will be called with parameters supplied as non-named arguments +#' matching the function signatures that are shown in the default value for each function argument. #' @param f_before_iter A function that will be executed before each boosting round. #' -#' This function can signal whether the training should be finalized or not, by outputting -#' a value that evaluates to `TRUE` - i.e. if the output from the function provided here at -#' a given round is `TRUE`, then training will be stopped before the current iteration happens. +#' This function can signal whether the training should be finalized or not, by outputting +#' a value that evaluates to `TRUE` - i.e. if the output from the function provided here at +#' a given round is `TRUE`, then training will be stopped before the current iteration happens. #' -#' Return values of `NULL` will be interpreted as `FALSE`. +#' Return values of `NULL` will be interpreted as `FALSE`. #' @param f_after_iter A function that will be executed after each boosting round. #' -#' This function can signal whether the training should be finalized or not, by outputting -#' a value that evaluates to `TRUE` - i.e. 
if the output from the function provided here at -#' a given round is `TRUE`, then training will be stopped at that round. +#' This function can signal whether the training should be finalized or not, by outputting +#' a value that evaluates to `TRUE` - i.e. if the output from the function provided here at +#' a given round is `TRUE`, then training will be stopped at that round. #' -#' Return values of `NULL` will be interpreted as `FALSE`. +#' Return values of `NULL` will be interpreted as `FALSE`. #' @param f_after_training A function that will be executed after training is finished. #' -#' This function can optionally output something non-NULL, which will become part of the R -#' attributes of the booster (assuming one passes `keep_extra_attributes=TRUE` to \link{xgb.train}) -#' under the name supplied for parameter `cb_name` imn the case of \link{xgb.train}; or a part -#' of the named elements in the result of \link{xgb.cv}. -#' @return An `xgb.Callback` object, which can be passed to \link{xgb.train} or \link{xgb.cv}. -#' @details Arguments that will be passed to the supplied functions are as follows:\itemize{ -#' -#' \item env The same environment that is passed under argument `env`. -#' -#' It may be modified by the functions in order to e.g. keep tracking of what happens -#' across iterations or similar. -#' -#' This environment is only used by the functions supplied to the callback, and will -#' not be kept after the model fitting function terminates (see parameter `f_after_training`). -#' -#' \item model The booster object when using \link{xgb.train}, or the folds when using -#' \link{xgb.cv}. -#' -#' For \link{xgb.cv}, folds are a list with a structure as follows:\itemize{ -#' \item `dtrain`: The training data for the fold (as an `xgb.DMatrix` object). -#' \item `bst`: Rhe `xgb.Booster` object for the fold. -#' \item `evals`: A list containing two DMatrices, with names `train` and `test` -#' (`test` is the held-out data for the fold). -#' \item `index`: The indices of the hold-out data for that fold (base-1 indexing), -#' from which the `test` entry in `evals` was obtained. -#' } -#' -#' This object should \bold{not} be in-place modified in ways that conflict with the -#' training (e.g. resetting the parameters for a training update in a way that resets -#' the number of rounds to zero in order to overwrite rounds). -#' -#' Note that any R attributes that are assigned to the booster during the callback functions, -#' will not be kept thereafter as the booster object variable is not re-assigned during -#' training. It is however possible to set C-level attributes of the booster through -#' \link{xgb.attr} or \link{xgb.attributes}, which should remain available for the rest -#' of the iterations and after the training is done. -#' -#' For keeping variables across iterations, it's recommended to use `env` instead. -#' \item data The data to which the model is being fit, as an `xgb.DMatrix` object. -#' -#' Note that, for \link{xgb.cv}, this will be the full data, while data for the specific -#' folds can be found in the `model` object. -#' -#' \item evals The evaluation data, as passed under argument `evals` to -#' \link{xgb.train}. -#' -#' For \link{xgb.cv}, this will always be `NULL`. -#' -#' \item begin_iteration Index of the first boosting iteration that will be executed -#' (base-1 indexing). 
-#' -#' This will typically be '1', but when using training continuation, depending on the -#' parameters for updates, boosting rounds will be continued from where the previous -#' model ended, in which case this will be larger than 1. -#' -#' \item end_iteration Index of the last boostign iteration that will be executed -#' (base-1 indexing, inclusive of this end). -#' -#' It should match with argument `nrounds` passed to \link{xgb.train} or \link{xgb.cv}. -#' -#' Note that boosting might be interrupted before reaching this last iteration, for -#' example by using the early stopping callback \link{xgb.cb.early.stop}. -#' -#' \item iteration Index of the iteration number that is being executed (first iteration -#' will be the same as parameter `begin_iteration`, then next one will add +1, and so on). -#' -#' \item iter_feval Evaluation metrics for `evals` that were supplied, either -#' determined by the objective, or by parameter `feval`. -#' -#' For \link{xgb.train}, this will be a named vector with one entry per element in -#' `evals`, where the names are determined as 'evals name' + '-' + 'metric name' - for -#' example, if `evals` contains an entry named "tr" and the metric is "rmse", -#' this will be a one-element vector with name "tr-rmse". -#' -#' For \link{xgb.cv}, this will be a 2d matrix with dimensions `[length(evals), nfolds]`, -#' where the row names will follow the same naming logic as the one-dimensional vector -#' that is passed in \link{xgb.train}. -#' -#' Note that, internally, the built-in callbacks such as \link{xgb.cb.print.evaluation} summarize -#' this table by calculating the row-wise means and standard deviations. -#' -#' \item final_feval The evaluation results after the last boosting round is executed -#' (same format as `iter_feval`, and will be the exact same input as passed under -#' `iter_feval` to the last round that is executed during model fitting). -#' -#' \item prev_cb_res Result from a previous run of a callback sharing the same name -#' (as given by parameter `cb_name`) when conducting training continuation, if there -#' was any in the booster R attributes. -#' -#' Some times, one might want to append the new results to the previous one, and this will -#' be done automatically by the built-in callbacks such as \link{xgb.cb.evaluation.log}, -#' which will append the new rows to the previous table. -#' -#' If no such previous callback result is available (which it never will when fitting -#' a model from start instead of updating an existing model), this will be `NULL`. -#' -#' For \link{xgb.cv}, which doesn't support training continuation, this will always be `NULL`. -#' } -#' -#' The following names (`cb_name` values) are reserved for internal callbacks:\itemize{ -#' \item print_evaluation -#' \item evaluation_log -#' \item reset_parameters -#' \item early_stop -#' \item save_model -#' \item cv_predict -#' \item gblinear_history -#' } -#' -#' The following names are reserved for other non-callback attributes:\itemize{ -#' \item names -#' \item class -#' \item call -#' \item params -#' \item niter -#' \item nfeatures -#' \item folds -#' } -#' -#' When using the built-in early stopping callback (\link{xgb.cb.early.stop}), said callback -#' will always be executed before the others, as it sets some booster C-level attributes -#' that other callbacks might also use. Otherwise, the order of execution will match with -#' the order in which the callbacks are passed to the model fitting function. 
-#' @seealso Built-in callbacks:\itemize{ -#' \item \link{xgb.cb.print.evaluation} -#' \item \link{xgb.cb.evaluation.log} -#' \item \link{xgb.cb.reset.parameters} -#' \item \link{xgb.cb.early.stop} -#' \item \link{xgb.cb.save.model} -#' \item \link{xgb.cb.cv.predict} -#' \item \link{xgb.cb.gblinear.history} -#' } +#' This function can optionally output something non-NULL, which will become part of the R +#' attributes of the booster (assuming one passes `keep_extra_attributes=TRUE` to [xgb.train()]) +#' under the name supplied for parameter `cb_name` imn the case of [xgb.train()]; or a part +#' of the named elements in the result of [xgb.cv()]. +#' @return An `xgb.Callback` object, which can be passed to [xgb.train()] or [xgb.cv()]. +#' +#' @seealso Built-in callbacks: +#' - [xgb.cb.print.evaluation] +#' - [xgb.cb.evaluation.log] +#' - [xgb.cb.reset.parameters] +#' - [xgb.cb.early.stop] +#' - [xgb.cb.save.model] +#' - [xgb.cb.cv.predict] +#' - [xgb.cb.gblinear.history] +# #' @examples #' # Example constructing a custom callback that calculates #' # squared error on the training data (no separate test set), @@ -203,8 +197,10 @@ #' ) #' #' data(mtcars) +#' #' y <- mtcars$mpg #' x <- as.matrix(mtcars[, -1]) +#' #' dm <- xgb.DMatrix(x, label = y, nthread = 1) #' model <- xgb.train( #' data = dm, @@ -407,16 +403,18 @@ xgb.Callback <- function( return(paste0(iter, res)) } -#' @title Callback for printing the result of evaluation -#' @param period results would be printed every number of periods -#' @param showsd whether standard deviations should be printed (when available) -#' @return An `xgb.Callback` object, which can be passed to \link{xgb.train} or \link{xgb.cv}. +#' Callback for printing the result of evaluation +#' #' @description -#' The callback function prints the result of evaluation at every \code{period} iterations. +#' The callback function prints the result of evaluation at every `period` iterations. #' The initial and the last iteration's evaluations are always printed. #' -#' Does not leave any attribute in the booster (see \link{xgb.cb.evaluation.log} for that). -#' @seealso \link{xgb.Callback} +#' Does not leave any attribute in the booster (see [xgb.cb.evaluation.log] for that). +#' +#' @param period Results would be printed every number of periods. +#' @param showsd Whether standard deviations should be printed (when available). +#' @return An `xgb.Callback` object, which can be passed to [xgb.train()] or [xgb.cv()]. +#' @seealso [xgb.Callback] #' @export xgb.cb.print.evaluation <- function(period = 1, showsd = TRUE) { if (length(period) != 1 || period != floor(period) || period < 1) { @@ -450,14 +448,16 @@ xgb.cb.print.evaluation <- function(period = 1, showsd = TRUE) { ) } -#' @title Callback for logging the evaluation history -#' @return An `xgb.Callback` object, which can be passed to \link{xgb.train} or \link{xgb.cv}. +#' Callback for logging the evaluation history +#' #' @details This callback creates a table with per-iteration evaluation metrics (see parameters -#' `evals` and `feval` in \link{xgb.train}). -#' @details +#' `evals` and `feval` in [xgb.train()]). +#' #' Note: in the column names of the final data.table, the dash '-' character is replaced with #' the underscore '_' in order to make the column names more like regular R identifiers. -#' @seealso \link{xgb.cb.print.evaluation} +#' +#' @return An `xgb.Callback` object, which can be passed to [xgb.train()] or [xgb.cv()]. 
+#' @seealso [xgb.cb.print.evaluation] #' @export xgb.cb.evaluation.log <- function() { xgb.Callback( @@ -517,20 +517,22 @@ xgb.cb.evaluation.log <- function() { ) } -#' @title Callback for resetting the booster's parameters at each iteration. -#' @param new_params a list where each element corresponds to a parameter that needs to be reset. -#' Each element's value must be either a vector of values of length \code{nrounds} -#' to be set at each iteration, -#' or a function of two parameters \code{learning_rates(iteration, nrounds)} -#' which returns a new parameter value by using the current iteration number -#' and the total number of boosting rounds. -#' @return An `xgb.Callback` object, which can be passed to \link{xgb.train} or \link{xgb.cv}. +#' Callback for resetting booster parameters at each iteration +#' #' @details #' Note that when training is resumed from some previous model, and a function is used to -#' reset a parameter value, the \code{nrounds} argument in this function would be the +#' reset a parameter value, the `nrounds` argument in this function would be the #' the number of boosting rounds in the current training. #' #' Does not leave any attribute in the booster. +#' +#' @param new_params List of parameters needed to be reset. +#' Each element's value must be either a vector of values of length `nrounds` +#' to be set at each iteration, +#' or a function of two parameters `learning_rates(iteration, nrounds)` +#' which returns a new parameter value by using the current iteration number +#' and the total number of boosting rounds. +#' @return An `xgb.Callback` object, which can be passed to [xgb.train()] or [xgb.cv()]. #' @export xgb.cb.reset.parameters <- function(new_params) { stopifnot(is.list(new_params)) @@ -583,39 +585,39 @@ xgb.cb.reset.parameters <- function(new_params) { ) } -#' @title Callback to activate early stopping -#' @param stopping_rounds The number of rounds with no improvement in -#' the evaluation metric in order to stop the training. -#' @param maximize Whether to maximize the evaluation metric. -#' @param metric_name The name of an evaluation column to use as a criteria for early -#' stopping. If not set, the last column would be used. -#' Let's say the test data in \code{evals} was labelled as \code{dtest}, -#' and one wants to use the AUC in test data for early stopping regardless of where -#' it is in the \code{evals}, then one of the following would need to be set: -#' \code{metric_name='dtest-auc'} or \code{metric_name='dtest_auc'}. -#' All dash '-' characters in metric names are considered equivalent to '_'. -#' @param verbose Whether to print the early stopping information. -#' @param keep_all_iter Whether to keep all of the boosting rounds that were produced -#' in the resulting object. If passing `FALSE`, will only keep the boosting rounds -#' up to the detected best iteration, discarding the ones that come after. -#' @return An `xgb.Callback` object, which can be passed to \link{xgb.train} or \link{xgb.cv}. +#' Callback to activate early stopping +#' #' @description #' This callback function determines the condition for early stopping. 
#' #' The following attributes are assigned to the booster's object: -#' \itemize{ -#' \item \code{best_score} the evaluation score at the best iteration -#' \item \code{best_iteration} at which boosting iteration the best score has occurred +#' - `best_score` the evaluation score at the best iteration +#' - `best_iteration` at which boosting iteration the best score has occurred #' (0-based index for interoperability of binary models) -#' } #' #' The same values are also stored as R attributes as a result of the callback, plus an additional #' attribute `stopped_by_max_rounds` which indicates whether an early stopping by the `stopping_rounds` #' condition occurred. Note that the `best_iteration` that is stored under R attributes will follow #' base-1 indexing, so it will be larger by '1' than the C-level 'best_iteration' that is accessed -#' through \link{xgb.attr} or \link{xgb.attributes}. +#' through [xgb.attr()] or [xgb.attributes()]. #' #' At least one dataset is required in `evals` for early stopping to work. +#' +#' @param stopping_rounds The number of rounds with no improvement in +#' the evaluation metric in order to stop the training. +#' @param maximize Whether to maximize the evaluation metric. +#' @param metric_name The name of an evaluation column to use as a criteria for early +#' stopping. If not set, the last column would be used. +#' Let's say the test data in `evals` was labelled as `dtest`, +#' and one wants to use the AUC in test data for early stopping regardless of where +#' it is in the `evals`, then one of the following would need to be set: +#' `metric_name = 'dtest-auc'` or `metric_name = 'dtest_auc'`. +#' All dash '-' characters in metric names are considered equivalent to '_'. +#' @param verbose Whether to print the early stopping information. +#' @param keep_all_iter Whether to keep all of the boosting rounds that were produced +#' in the resulting object. If passing `FALSE`, will only keep the boosting rounds +#' up to the detected best iteration, discarding the ones that come after. +#' @return An `xgb.Callback` object, which can be passed to [xgb.train()] or [xgb.cv()]. #' @export xgb.cb.early.stop <- function( stopping_rounds, @@ -771,21 +773,22 @@ xgb.cb.early.stop <- function( xgb.save(model, save_name) } -#' @title Callback for saving a model file. -#' @param save_period Save the model to disk after every -#' \code{save_period} iterations; 0 means save the model at the end. -#' @param save_name The name or path for the saved model file. -#' It can contain a \code{\link[base]{sprintf}} formatting specifier -#' to include the integer iteration number in the file name. -#' E.g., with \code{save_name} = 'xgboost_%04d.model', -#' the file saved at iteration 50 would be named "xgboost_0050.model". -#' @return An `xgb.Callback` object, which can be passed to \link{xgb.train}, -#' but \bold{not} to \link{xgb.cv}. +#' Callback for saving a model file +#' #' @description #' This callback function allows to save an xgb-model file, either periodically -#' after each \code{save_period}'s or at the end. +#' after each `save_period`'s or at the end. #' #' Does not leave any attribute in the booster. +#' +#' @param save_period Save the model to disk after every `save_period` iterations; +#' 0 means save the model at the end. +#' @param save_name The name or path for the saved model file. +#' It can contain a [sprintf()] formatting specifier to include the integer +#' iteration number in the file name. 
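As a rough usage sketch for the callbacks documented above, the snippet below combines a decaying learning rate with early stopping on a held-out set; `dtrain` and `dtest` are hypothetical `xgb.DMatrix` objects and the parameter values are arbitrary.

model <- xgb.train(
  params = list(objective = "reg:squarederror", nthread = 1),
  data = dtrain,
  nrounds = 100,
  evals = list(test = dtest),
  callbacks = list(
    # eta decays with the iteration number; the function receives (iteration, nrounds).
    xgb.cb.reset.parameters(list(eta = function(iteration, nrounds) 0.3 * 0.99^iteration)),
    # Stop when "test-rmse" has not improved for 5 consecutive rounds.
    xgb.cb.early.stop(stopping_rounds = 5, metric_name = "test-rmse")
  )
)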
E.g., with `save_name = 'xgboost_%04d.model'`, +#' the file saved at iteration 50 would be named "xgboost_0050.model". +#' @return An `xgb.Callback` object, which can be passed to [xgb.train()], +#' but **not** to [xgb.cv()]. #' @export xgb.cb.save.model <- function(save_period = 0, save_name = "xgboost.ubj") { if (save_period < 0) { @@ -817,24 +820,26 @@ xgb.cb.save.model <- function(save_period = 0, save_name = "xgboost.ubj") { ) } -#' @title Callback for returning cross-validation based predictions. -#' @param save_models A flag for whether to save the folds' models. -#' @param outputmargin Whether to save margin predictions (same effect as passing this -#' parameter to \link{predict.xgb.Booster}). -#' @return An `xgb.Callback` object, which can be passed to \link{xgb.cv}, -#' but \bold{not} to \link{xgb.train}. -#' @description +#' Callback for returning cross-validation based predictions +#' #' This callback function saves predictions for all of the test folds, #' and also allows to save the folds' models. +#' #' @details -#' Predictions are saved inside of the \code{pred} element, which is either a vector or a matrix, +#' Predictions are saved inside of the `pred` element, which is either a vector or a matrix, #' depending on the number of prediction outputs per data row. The order of predictions corresponds -#' to the order of rows in the original dataset. Note that when a custom \code{folds} list is -#' provided in \code{xgb.cv}, the predictions would only be returned properly when this list is a +#' to the order of rows in the original dataset. Note that when a custom `folds` list is +#' provided in [xgb.cv()], the predictions would only be returned properly when this list is a #' non-overlapping list of k sets of indices, as in a standard k-fold CV. The predictions would not be #' meaningful when user-provided folds have overlapping indices as in, e.g., random sampling splits. -#' When some of the indices in the training dataset are not included into user-provided \code{folds}, -#' their prediction value would be \code{NA}. +#' When some of the indices in the training dataset are not included into user-provided `folds`, +#' their prediction value would be `NA`. +#' +#' @param save_models A flag for whether to save the folds' models. +#' @param outputmargin Whether to save margin predictions (same effect as passing this +#' parameter to [predict.xgb.Booster]). +#' @return An `xgb.Callback` object, which can be passed to [xgb.cv()], +#' but **not** to [xgb.train()]. #' @export xgb.cb.cv.predict <- function(save_models = FALSE, outputmargin = FALSE) { xgb.Callback( @@ -903,19 +908,15 @@ xgb.cb.cv.predict <- function(save_models = FALSE, outputmargin = FALSE) { return(coefs) } -#' @title Callback for collecting coefficients history of a gblinear booster -#' @param sparse when set to `FALSE`/`TRUE`, a dense/sparse matrix is used to store the result. -#' Sparse format is useful when one expects only a subset of coefficients to be non-zero, -#' when using the "thrifty" feature selector with fairly small number of top features -#' selected per iteration. -#' @return An `xgb.Callback` object, which can be passed to \link{xgb.train} or \link{xgb.cv}. +#' Callback for collecting coefficients history of a gblinear booster +#' #' @details #' To keep things fast and simple, gblinear booster does not internally store the history of linear #' model coefficients at each boosting iteration. 
This callback provides a workaround for storing #' the coefficients' path, by extracting them after each training iteration. #' #' This callback will construct a matrix where rows are boosting iterations and columns are -#' feature coefficients (same order as when calling \link{coef.xgb.Booster}, with the intercept +#' feature coefficients (same order as when calling [coef.xgb.Booster], with the intercept #' corresponding to the first column). #' #' When there is more than one coefficient per feature (e.g. multi-class classification), @@ -928,13 +929,18 @@ xgb.cb.cv.predict <- function(save_models = FALSE, outputmargin = FALSE) { #' one coefficient per feature) the names will be composed as 'column name' + ':' + 'class index' #' (so e.g. column 'c1' for class '0' will be named 'c1:0'). #' -#' With \code{xgb.train}, the output is either a dense or a sparse matrix. -#' With with \code{xgb.cv}, it is a list (one element per each fold) of such -#' matrices. +#' With [xgb.train()], the output is either a dense or a sparse matrix. +#' With with [xgb.cv()], it is a list (one element per each fold) of such matrices. #' -#' Function \link{xgb.gblinear.history} function provides an easy way to retrieve the +#' Function [xgb.gblinear.history] provides an easy way to retrieve the #' outputs from this callback. -#' @seealso \link{xgb.gblinear.history}, \link{coef.xgb.Booster}. +#' +#' @param sparse When set to `FALSE`/`TRUE`, a dense/sparse matrix is used to store the result. +#' Sparse format is useful when one expects only a subset of coefficients to be non-zero, +#' when using the "thrifty" feature selector with fairly small number of top features +#' selected per iteration. +#' @return An `xgb.Callback` object, which can be passed to [xgb.train()] or [xgb.cv()]. +#' @seealso [xgb.gblinear.history], [coef.xgb.Booster]. #' @examples #' #### Binary classification: #' @@ -944,57 +950,109 @@ xgb.cb.cv.predict <- function(save_models = FALSE, outputmargin = FALSE) { #' #' # In the iris dataset, it is hard to linearly separate Versicolor class from the rest #' # without considering the 2nd order interactions: -#' x <- model.matrix(Species ~ .^2, iris)[,-1] +#' x <- model.matrix(Species ~ .^2, iris)[, -1] #' colnames(x) -#' dtrain <- xgb.DMatrix(scale(x), label = 1*(iris$Species == "versicolor"), nthread = nthread) -#' param <- list(booster = "gblinear", objective = "reg:logistic", eval_metric = "auc", -#' lambda = 0.0003, alpha = 0.0003, nthread = nthread) +#' dtrain <- xgb.DMatrix( +#' scale(x), +#' label = 1 * (iris$Species == "versicolor"), +#' nthread = nthread +#' ) +#' param <- list( +#' booster = "gblinear", +#' objective = "reg:logistic", +#' eval_metric = "auc", +#' lambda = 0.0003, +#' alpha = 0.0003, +#' nthread = nthread +#' ) +#' #' # For 'shotgun', which is a default linear updater, using high eta values may result in #' # unstable behaviour in some datasets. With this simple dataset, however, the high learning #' # rate does not break the convergence, but allows us to illustrate the typical pattern of #' # "stochastic explosion" behaviour of this lock-free algorithm at early boosting iterations. 
-#' bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 200, eta = 1., -#' callbacks = list(xgb.cb.gblinear.history())) +#' bst <- xgb.train( +#' param, +#' dtrain, +#' list(tr = dtrain), +#' nrounds = 200, +#' eta = 1., +#' callbacks = list(xgb.cb.gblinear.history()) +#' ) +#' #' # Extract the coefficients' path and plot them vs boosting iteration number: #' coef_path <- xgb.gblinear.history(bst) -#' matplot(coef_path, type = 'l') +#' matplot(coef_path, type = "l") #' #' # With the deterministic coordinate descent updater, it is safer to use higher learning rates. #' # Will try the classical componentwise boosting which selects a single best feature per round: -#' bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 200, eta = 0.8, -#' updater = 'coord_descent', feature_selector = 'thrifty', top_k = 1, -#' callbacks = list(xgb.cb.gblinear.history())) -#' matplot(xgb.gblinear.history(bst), type = 'l') +#' bst <- xgb.train( +#' param, +#' dtrain, +#' list(tr = dtrain), +#' nrounds = 200, +#' eta = 0.8, +#' updater = "coord_descent", +#' feature_selector = "thrifty", +#' top_k = 1, +#' callbacks = list(xgb.cb.gblinear.history()) +#' ) +#' matplot(xgb.gblinear.history(bst), type = "l") #' # Componentwise boosting is known to have similar effect to Lasso regularization. #' # Try experimenting with various values of top_k, eta, nrounds, #' # as well as different feature_selectors. #' #' # For xgb.cv: -#' bst <- xgb.cv(param, dtrain, nfold = 5, nrounds = 100, eta = 0.8, -#' callbacks = list(xgb.cb.gblinear.history())) +#' bst <- xgb.cv( +#' param, +#' dtrain, +#' nfold = 5, +#' nrounds = 100, +#' eta = 0.8, +#' callbacks = list(xgb.cb.gblinear.history()) +#' ) #' # coefficients in the CV fold #3 -#' matplot(xgb.gblinear.history(bst)[[3]], type = 'l') +#' matplot(xgb.gblinear.history(bst)[[3]], type = "l") #' #' #' #### Multiclass classification: -#' # #' dtrain <- xgb.DMatrix(scale(x), label = as.numeric(iris$Species) - 1, nthread = nthread) -#' param <- list(booster = "gblinear", objective = "multi:softprob", num_class = 3, -#' lambda = 0.0003, alpha = 0.0003, nthread = nthread) +#' +#' param <- list( +#' booster = "gblinear", +#' objective = "multi:softprob", +#' num_class = 3, +#' lambda = 0.0003, +#' alpha = 0.0003, +#' nthread = nthread +#' ) +#' #' # For the default linear updater 'shotgun' it sometimes is helpful #' # to use smaller eta to reduce instability -#' bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 50, eta = 0.5, -#' callbacks = list(xgb.cb.gblinear.history())) +#' bst <- xgb.train( +#' param, +#' dtrain, +#' list(tr = dtrain), +#' nrounds = 50, +#' eta = 0.5, +#' callbacks = list(xgb.cb.gblinear.history()) +#' ) +#' #' # Will plot the coefficient paths separately for each class: -#' matplot(xgb.gblinear.history(bst, class_index = 0), type = 'l') -#' matplot(xgb.gblinear.history(bst, class_index = 1), type = 'l') -#' matplot(xgb.gblinear.history(bst, class_index = 2), type = 'l') +#' matplot(xgb.gblinear.history(bst, class_index = 0), type = "l") +#' matplot(xgb.gblinear.history(bst, class_index = 1), type = "l") +#' matplot(xgb.gblinear.history(bst, class_index = 2), type = "l") #' #' # CV: -#' bst <- xgb.cv(param, dtrain, nfold = 5, nrounds = 70, eta = 0.5, -#' callbacks = list(xgb.cb.gblinear.history(FALSE))) +#' bst <- xgb.cv( +#' param, +#' dtrain, +#' nfold = 5, +#' nrounds = 70, +#' eta = 0.5, +#' callbacks = list(xgb.cb.gblinear.history(FALSE)) +#' ) #' # 1st fold of 1st class -#' matplot(xgb.gblinear.history(bst, class_index = 0)[[1]], type = 
'l') +#' matplot(xgb.gblinear.history(bst, class_index = 0)[[1]], type = "l") #' #' @export xgb.cb.gblinear.history <- function(sparse = FALSE) { @@ -1097,28 +1155,31 @@ xgb.cb.gblinear.history <- function(sparse = FALSE) { ) } -#' @title Extract gblinear coefficients history. -#' @description A helper function to extract the matrix of linear coefficients' history -#' from a gblinear model created while using the \link{xgb.cb.gblinear.history} -#' callback (which must be added manually as by default it's not used). -#' @details Note that this is an R-specific function that relies on R attributes that -#' are not saved when using xgboost's own serialization functions like \link{xgb.load} -#' or \link{xgb.load.raw}. +#' Extract gblinear coefficients history +#' +#' A helper function to extract the matrix of linear coefficients' history +#' from a gblinear model created while using the [xgb.cb.gblinear.history] +#' callback (which must be added manually as by default it is not used). +#' +#' @details +#' Note that this is an R-specific function that relies on R attributes that +#' are not saved when using XGBoost's own serialization functions like [xgb.load()] +#' or [xgb.load.raw()]. #' #' In order for a serialized model to be accepted by this function, one must use R -#' serializers such as \link{saveRDS}. -#' @param model either an \code{xgb.Booster} or a result of \code{xgb.cv()}, trained -#' using the \link{xgb.cb.gblinear.history} callback, but \bold{not} a booster -#' loaded from \link{xgb.load} or \link{xgb.load.raw}. +#' serializers such as [saveRDS()]. +#' @param model Either an `xgb.Booster` or a result of [xgb.cv()], trained +#' using the [xgb.cb.gblinear.history] callback, but **not** a booster +#' loaded from [xgb.load()] or [xgb.load.raw()]. #' @param class_index zero-based class index to extract the coefficients for only that -#' specific class in a multinomial multiclass model. When it is NULL, all the -#' coefficients are returned. Has no effect in non-multiclass models. +#' specific class in a multinomial multiclass model. When it is `NULL`, all the +#' coefficients are returned. Has no effect in non-multiclass models. #' #' @return -#' For an \link{xgb.train} result, a matrix (either dense or sparse) with the columns +#' For an [xgb.train()] result, a matrix (either dense or sparse) with the columns #' corresponding to iteration's coefficients and the rows corresponding to boosting iterations. #' -#' For an \link{xgb.cv} result, a list of such matrices is returned with the elements +#' For an [xgb.cv()] result, a list of such matrices is returned with the elements #' corresponding to CV folds. #' #' When there is more than one coefficient per feature (e.g. multi-class classification) @@ -1126,7 +1187,7 @@ xgb.cb.gblinear.history <- function(sparse = FALSE) { #' the result will be reshaped into a vector where coefficients are arranged first by features and #' then by class (e.g. first 1 through N coefficients will be for the first class, then #' coefficients N+1 through 2N for the second class, and so on). -#' @seealso \link{xgb.cb.gblinear.history}, \link{coef.xgb.Booster}. +#' @seealso [xgb.cb.gblinear.history], [coef.xgb.Booster]. 
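To illustrate the multi-class layout described in the return value above, one row of the history (a single boosting round) can be folded back into a per-class matrix roughly as follows; `bst` and the class count are hypothetical.

hist_mat <- xgb.gblinear.history(bst)   # rows are boosting iterations
n_class <- 3                            # e.g. a 3-class "multi:softprob" model
# The first block of entries belongs to the first class, the next block to the second, and so on,
# so filling a matrix column by column recovers one column per class.
coef_round1 <- matrix(hist_mat[1, ], ncol = n_class)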
#' @export xgb.gblinear.history <- function(model, class_index = NULL) { diff --git a/R-package/R/utils.R b/R-package/R/utils.R index 8b87468a4e8c..46b05c43aa18 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -410,7 +410,7 @@ xgb.createFolds <- function(y, k) { #' At this time, some of the parameter names were changed in order to make the code style more uniform. #' The deprecated parameters would be removed in the next release. #' -#' To see all the current deprecated and new parameters, check the \code{xgboost:::depr_par_lut} table. +#' To see all the current deprecated and new parameters, check the `xgboost:::depr_par_lut` table. #' #' A deprecation warning is shown when any of the deprecated parameters is used in a call. #' An additional warning is shown when there was a partial match to a deprecated parameter @@ -419,70 +419,79 @@ xgb.createFolds <- function(y, k) { #' @name xgboost-deprecated NULL -#' @title Model Serialization and Compatibility -#' @description +#' Model Serialization and Compatibility #' +#' @description #' When it comes to serializing XGBoost models, it's possible to use R serializers such as -#' \link{save} or \link{saveRDS} to serialize an XGBoost R model, but XGBoost also provides +#' [save()] or [saveRDS()] to serialize an XGBoost R model, but XGBoost also provides #' its own serializers with better compatibility guarantees, which allow loading #' said models in other language bindings of XGBoost. #' -#' Note that an `xgb.Booster` object, outside of its core components, might also keep:\itemize{ -#' \item Additional model configuration (accessible through \link{xgb.config}), -#' which includes model fitting parameters like `max_depth` and runtime parameters like `nthread`. -#' These are not necessarily useful for prediction/importance/plotting. -#' \item Additional R-specific attributes - e.g. results of callbacks, such as evaluation logs, -#' which are kept as a `data.table` object, accessible through `attributes(model)$evaluation_log` -#' if present. -#' } +#' Note that an `xgb.Booster` object, outside of its core components, might also keep: +#' - Additional model configuration (accessible through [xgb.config()]), which includes +#' model fitting parameters like `max_depth` and runtime parameters like `nthread`. +#' These are not necessarily useful for prediction/importance/plotting. +#' - Additional R specific attributes - e.g. results of callbacks, such as evaluation logs, +#' which are kept as a `data.table` object, accessible through +#' `attributes(model)$evaluation_log` if present. #' #' The first one (configurations) does not have the same compatibility guarantees as -#' the model itself, including attributes that are set and accessed through \link{xgb.attributes} - that is, such configuration -#' might be lost after loading the booster in a different XGBoost version, regardless of the -#' serializer that was used. These are saved when using \link{saveRDS}, but will be discarded -#' if loaded into an incompatible XGBoost version. They are not saved when using XGBoost's -#' serializers from its public interface including \link{xgb.save} and \link{xgb.save.raw}. +#' the model itself, including attributes that are set and accessed through +#' [xgb.attributes()] - that is, such configuration might be lost after loading the +#' booster in a different XGBoost version, regardless of the serializer that was used. +#' These are saved when using [saveRDS()], but will be discarded if loaded into an +#' incompatible XGBoost version. 
They are not saved when using XGBoost's +#' serializers from its public interface including [xgb.save()] and [xgb.save.raw()]. #' -#' The second ones (R attributes) are not part of the standard XGBoost model structure, and thus are -#' not saved when using XGBoost's own serializers. These attributes are only used for informational -#' purposes, such as keeping track of evaluation metrics as the model was fit, or saving the R -#' call that produced the model, but are otherwise not used for prediction / importance / plotting / etc. +#' The second ones (R attributes) are not part of the standard XGBoost model structure, +#' and thus are not saved when using XGBoost's own serializers. These attributes are +#' only used for informational purposes, such as keeping track of evaluation metrics as +#' the model was fit, or saving the R call that produced the model, but are otherwise +#' not used for prediction / importance / plotting / etc. #' These R attributes are only preserved when using R's serializers. #' -#' Note that XGBoost models in R starting from version `2.1.0` and onwards, and XGBoost models -#' before version `2.1.0`; have a very different R object structure and are incompatible with -#' each other. Hence, models that were saved with R serializers live `saveRDS` or `save` before -#' version `2.1.0` will not work with latter `xgboost` versions and vice versa. Be aware that -#' the structure of R model objects could in theory change again in the future, so XGBoost's serializers +#' Note that XGBoost models in R starting from version `2.1.0` and onwards, and +#' XGBoost models before version `2.1.0`; have a very different R object structure and +#' are incompatible with each other. Hence, models that were saved with R serializers +#' like [saveRDS()] or [save()] before version `2.1.0` will not work with latter +#' `xgboost` versions and vice versa. Be aware that the structure of R model objects +#' could in theory change again in the future, so XGBoost's serializers #' should be preferred for long-term storage. #' -#' Furthermore, note that using the package `qs` for serialization will require version 0.26 or -#' higher of said package, and will have the same compatibility restrictions as R serializers. +#' Furthermore, note that using the package `qs` for serialization will require +#' version 0.26 or higher of said package, and will have the same compatibility +#' restrictions as R serializers. #' #' @details -#' Use \code{\link{xgb.save}} to save the XGBoost model as a stand-alone file. You may opt into +#' Use [xgb.save()] to save the XGBoost model as a stand-alone file. You may opt into #' the JSON format by specifying the JSON extension. To read the model back, use -#' \code{\link{xgb.load}}. +#' [xgb.load()]. #' -#' Use \code{\link{xgb.save.raw}} to save the XGBoost model as a sequence (vector) of raw bytes +#' Use [xgb.save.raw()] to save the XGBoost model as a sequence (vector) of raw bytes #' in a future-proof manner. Future releases of XGBoost will be able to read the raw bytes and -#' re-construct the corresponding model. To read the model back, use \code{\link{xgb.load.raw}}. -#' The \code{\link{xgb.save.raw}} function is useful if you'd like to persist the XGBoost model +#' re-construct the corresponding model. To read the model back, use [xgb.load.raw()]. +#' The [xgb.save.raw()] function is useful if you would like to persist the XGBoost model #' as part of another R object. 
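As a quick illustration of the three persistence routes discussed above (the `bst` booster and file paths are hypothetical):

xgb.save(bst, "model.ubj")        # stand-alone file, readable from other XGBoost bindings
raw_bytes <- xgb.save.raw(bst)    # raw vector, convenient to embed inside other R objects
saveRDS(bst, "model.rds")         # R serialization, keeps R attributes such as evaluation logs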
#' -#' Use \link{saveRDS} if you require the R-specific attributes that a booster might have, such +#' Use [saveRDS()] if you require the R-specific attributes that a booster might have, such #' as evaluation logs, but note that future compatibility of such objects is outside XGBoost's #' control as it relies on R's serialization format (see e.g. the details section in -#' \link{serialize} and \link{save} from base R). +#' [serialize] and [save()] from base R). #' #' For more details and explanation about model persistence and archival, consult the page #' \url{https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html}. #' #' @examples -#' data(agaricus.train, package='xgboost') -#' bst <- xgb.train(data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label), -#' max_depth = 2, eta = 1, nthread = 2, nrounds = 2, -#' objective = "binary:logistic") +#' data(agaricus.train, package = "xgboost") +#' +#' bst <- xgb.train( +#' data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label), +#' max_depth = 2, +#' eta = 1, +#' nthread = 2, +#' nrounds = 2, +#' objective = "binary:logistic" +#' ) #' #' # Save as a stand-alone file; load it with xgb.load() #' fname <- file.path(tempdir(), "xgb_model.ubj") diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R index 4b2ba60c388f..0e6313d88e71 100644 --- a/R-package/R/xgb.Booster.R +++ b/R-package/R/xgb.Booster.R @@ -1,4 +1,4 @@ -# Construct an internal xgboost Booster and get its current number of rounds. +# Construct an internal XGBoost Booster and get its current number of rounds. # internal utility function # Note: the number of rounds in the C booster gets reset to zero when changing # key booster parameters like 'process_type=update', but in some cases, when @@ -64,7 +64,7 @@ xgb.get.handle <- function(object) { if (inherits(object, "xgb.Booster")) { handle <- object$ptr if (is.null(handle) || !inherits(handle, "externalptr")) { - stop("'xgb.Booster' object is corrupted or is from an incompatible xgboost version.") + stop("'xgb.Booster' object is corrupted or is from an incompatible XGBoost version.") } } else { stop("argument must be an 'xgb.Booster' object.") @@ -77,97 +77,96 @@ xgb.get.handle <- function(object) { #' Predict method for XGBoost model #' -#' Predict values on data based on xgboost model. +#' Predict values on data based on XGBoost model. #' #' @param object Object of class `xgb.Booster`. #' @param newdata Takes `data.frame`, `matrix`, `dgCMatrix`, `dgRMatrix`, `dsparseVector`, -#' local data file, or `xgb.DMatrix`. -#' -#' For single-row predictions on sparse data, it's recommended to use CSR format. If passing -#' a sparse vector, it will take it as a row vector. -#' -#' Note that, for repeated predictions on the same data, one might want to create a DMatrix to -#' pass here instead of passing R types like matrices or data frames, as predictions will be -#' faster on DMatrix. -#' -#' If `newdata` is a `data.frame`, be aware that:\itemize{ -#' \item Columns will be converted to numeric if they aren't already, which could potentially make -#' the operation slower than in an equivalent `matrix` object. -#' \item The order of the columns must match with that of the data from which the model was fitted -#' (i.e. columns will not be referenced by their names, just by their order in the data). -#' \item If the model was fitted to data with categorical columns, these columns must be of -#' `factor` type here, and must use the same encoding (i.e. have the same levels). 
-#' \item If `newdata` contains any `factor` columns, they will be converted to base-0 -#' encoding (same as during DMatrix creation) - hence, one should not pass a `factor` -#' under a column which during training had a different type. -#' } -#' @param missing Float value that represents missing values in data (e.g., 0 or some other extreme value). -#' -#' This parameter is not used when `newdata` is an `xgb.DMatrix` - in such cases, should pass -#' this as an argument to the DMatrix constructor instead. -#' @param outputmargin Whether the prediction should be returned in the form of original untransformed -#' sum of predictions from boosting iterations' results. E.g., setting `outputmargin=TRUE` for -#' logistic regression would return log-odds instead of probabilities. +#' local data file, or `xgb.DMatrix`. +#' +#' For single-row predictions on sparse data, it is recommended to use CSR format. If passing +#' a sparse vector, it will take it as a row vector. +#' +#' Note that, for repeated predictions on the same data, one might want to create a DMatrix to +#' pass here instead of passing R types like matrices or data frames, as predictions will be +#' faster on DMatrix. +#' +#' If `newdata` is a `data.frame`, be aware that: +#' - Columns will be converted to numeric if they aren't already, which could potentially make +#' the operation slower than in an equivalent `matrix` object. +#' - The order of the columns must match with that of the data from which the model was fitted +#' (i.e. columns will not be referenced by their names, just by their order in the data). +#' - If the model was fitted to data with categorical columns, these columns must be of +#' `factor` type here, and must use the same encoding (i.e. have the same levels). +#' - If `newdata` contains any `factor` columns, they will be converted to base-0 +#' encoding (same as during DMatrix creation) - hence, one should not pass a `factor` +#' under a column which during training had a different type. +#' @param missing Float value that represents missing values in data +#' (e.g., 0 or some other extreme value). +#' +#' This parameter is not used when `newdata` is an `xgb.DMatrix` - in such cases, +#' should pass this as an argument to the DMatrix constructor instead. +#' @param outputmargin Whether the prediction should be returned in the form of +#' original untransformed sum of predictions from boosting iterations' results. +#' E.g., setting `outputmargin = TRUE` for logistic regression would return log-odds +#' instead of probabilities. #' @param predleaf Whether to predict per-tree leaf indices. #' @param predcontrib Whether to return feature contributions to individual predictions (see Details). #' @param approxcontrib Whether to use a fast approximation for feature contributions (see Details). #' @param predinteraction Whether to return contributions of feature interactions to individual predictions (see Details). #' @param training Whether the prediction result is used for training. For dart booster, -#' training predicting will perform dropout. +#' training predicting will perform dropout. #' @param iterationrange Sequence of rounds/iterations from the model to use for prediction, specified by passing -#' a two-dimensional vector with the start and end numbers in the sequence (same format as R's `seq` - i.e. -#' base-1 indexing, and inclusive of both ends). +#' a two-dimensional vector with the start and end numbers in the sequence (same format as R's `seq` - i.e. +#' base-1 indexing, and inclusive of both ends). 
#' -#' For example, passing `c(1,20)` will predict using the first twenty iterations, while passing `c(1,1)` will -#' predict using only the first one. +#' For example, passing `c(1,20)` will predict using the first twenty iterations, while passing `c(1,1)` will +#' predict using only the first one. #' -#' If passing `NULL`, will either stop at the best iteration if the model used early stopping, or use all -#' of the iterations (rounds) otherwise. +#' If passing `NULL`, will either stop at the best iteration if the model used early stopping, or use all +#' of the iterations (rounds) otherwise. #' -#' If passing "all", will use all of the rounds regardless of whether the model had early stopping or not. +#' If passing "all", will use all of the rounds regardless of whether the model had early stopping or not. #' @param strict_shape Whether to always return an array with the same dimensions for the given prediction mode -#' regardless of the model type - meaning that, for example, both a multi-class and a binary classification -#' model would generate output arrays with the same number of dimensions, with the 'class' dimension having -#' size equal to '1' for the binary model. +#' regardless of the model type - meaning that, for example, both a multi-class and a binary classification +#' model would generate output arrays with the same number of dimensions, with the 'class' dimension having +#' size equal to '1' for the binary model. #' -#' If passing `FALSE` (the default), dimensions will be simplified according to the model type, so that a -#' binary classification model for example would not have a redundant dimension for 'class'. +#' If passing `FALSE` (the default), dimensions will be simplified according to the model type, so that a +#' binary classification model for example would not have a redundant dimension for 'class'. #' -#' See documentation for the return type for the exact shape of the output arrays for each prediction mode. +#' See documentation for the return type for the exact shape of the output arrays for each prediction mode. #' @param avoid_transpose Whether to output the resulting predictions in the same memory layout in which they -#' are generated by the core XGBoost library, without transposing them to match the expected output shape. +#' are generated by the core XGBoost library, without transposing them to match the expected output shape. #' -#' Internally, XGBoost uses row-major order for the predictions it generates, while R arrays use column-major -#' order, hence the result needs to be transposed in order to have the expected shape when represented as -#' an R array or matrix, which might be a slow operation. +#' Internally, XGBoost uses row-major order for the predictions it generates, while R arrays use column-major +#' order, hence the result needs to be transposed in order to have the expected shape when represented as +#' an R array or matrix, which might be a slow operation. #' -#' If passing `TRUE`, then the result will have dimensions in reverse order - for example, rows -#' will be the last dimensions instead of the first dimension. +#' If passing `TRUE`, then the result will have dimensions in reverse order - for example, rows +#' will be the last dimensions instead of the first dimension. #' @param base_margin Base margin used for boosting from existing model. #' -#' Note that, if `newdata` is an `xgb.DMatrix` object, this argument will -#' be ignored as it needs to be added to the DMatrix instead (e.g. 
by passing it as
-#' an argument in its constructor, or by calling \link{setinfo.xgb.DMatrix}).
+#' Note that, if `newdata` is an `xgb.DMatrix` object, this argument will
+#' be ignored as it needs to be added to the DMatrix instead (e.g. by passing it as
+#' an argument in its constructor, or by calling [setinfo.xgb.DMatrix()]).
+#' @param validate_features When `TRUE`, validate that the Booster's and newdata's
+#' feature_names match (only applicable when both `object` and `newdata` have feature names).
#'
-#' @param validate_features When `TRUE`, validate that the Booster's and newdata's feature_names
-#' match (only applicable when both `object` and `newdata` have feature names).
+#' If the column names differ and `newdata` is not an `xgb.DMatrix`, will try to reorder
+#' the columns in `newdata` to match with the booster's.
#'
-#' If the column names differ and `newdata` is not an `xgb.DMatrix`, will try to reorder
-#' the columns in `newdata` to match with the booster's.
+#' If the booster has feature types and `newdata` is either an `xgb.DMatrix` or
+#' `data.frame`, will additionally verify that categorical columns are of the
+#' correct type in `newdata`, throwing an error if they do not match.
#'
-#' If the booster has feature types and `newdata` is either an `xgb.DMatrix` or `data.frame`,
-#' will additionally verify that categorical columns are of the correct type in `newdata`,
-#' throwing an error if they do not match.
+#' If passing `FALSE`, it is assumed that the feature names and types are the same,
+#' and come in the same order as in the training data.
#'
-#' If passing `FALSE`, it is assumed that the feature names and types are the same,
-#' and come in the same order as in the training data.
-#'
-#' Note that this check might add some sizable latency to the predictions, so it's
-#' recommended to disable it for performance-sensitive applications.
+#' Note that this check might add some sizable latency to the predictions, so it's
+#' recommended to disable it for performance-sensitive applications.
#' @param ... Not used.
#'
#' @details
-#'
#' Note that `iterationrange` would currently do nothing for predictions from "gblinear",
#' since "gblinear" doesn't keep its boosting history.
#'
@@ -589,41 +588,40 @@ validate.features <- function(bst, newdata) {
}

-#' @title Accessors for serializable attributes of a model
+#' Accessors for serializable attributes of a model
#'
-#' @description These methods allow to manipulate the key-value attribute strings of an xgboost model.
-#'
-#' @param object Object of class `xgb.Booster`. \bold{Will be modified in-place} when assigning to it.
-#' @param name A non-empty character string specifying which attribute is to be accessed.
-#' @param value For `xgb.attr<-`, a value of an attribute; for `xgb.attributes<-`,
-#' it is a list (or an object coercible to a list) with the names of attributes to set
-#' and the elements corresponding to attribute values.
-#' Non-character values are converted to character.
-#' When an attribute value is not a scalar, only the first index is used.
-#' Use `NULL` to remove an attribute.
+#' These methods allow to manipulate the key-value attribute strings of an XGBoost model.
#'
#' @details
-#' The primary purpose of xgboost model attributes is to store some meta data about the model.
+#' The primary purpose of XGBoost model attributes is to store some meta data about the model.
#' Note that they are a separate concept from the object attributes in R.
-#' Specifically, they refer to key-value strings that can be attached to an xgboost model, +#' Specifically, they refer to key-value strings that can be attached to an XGBoost model, #' stored together with the model's binary representation, and accessed later #' (from R or any other interface). #' In contrast, any R attribute assigned to an R object of `xgb.Booster` class -#' would not be saved by [xgb.save()] because an xgboost model is an external memory object +#' would not be saved by [xgb.save()] because an XGBoost model is an external memory object #' and its serialization is handled externally. -#' Also, setting an attribute that has the same name as one of xgboost's parameters wouldn't +#' Also, setting an attribute that has the same name as one of XGBoost's parameters wouldn't #' change the value of that parameter for a model. #' Use [xgb.parameters<-()] to set or change model parameters. #' -#' The `xgb.attributes<-` setter either updates the existing or adds one or several attributes, +#' The [xgb.attributes<-()] setter either updates the existing or adds one or several attributes, #' but it doesn't delete the other existing attributes. #' #' Important: since this modifies the booster's C object, semantics for assignment here #' will differ from R's, as any object reference to the same booster will be modified #' too, while assignment of R attributes through `attributes(model)$ <- ` -#' will follow the usual copy-on-write R semantics (see \link{xgb.copy.Booster} for an +#' will follow the usual copy-on-write R semantics (see [xgb.copy.Booster()] for an #' example of these behaviors). #' +#' @param object Object of class `xgb.Booster`. **Will be modified in-place** when assigning to it. +#' @param name A non-empty character string specifying which attribute is to be accessed. +#' @param value For `xgb.attr<-`, a value of an attribute; for `xgb.attributes<-`, +#' it is a list (or an object coercible to a list) with the names of attributes to set +#' and the elements corresponding to attribute values. +#' Non-character values are converted to character. +#' When an attribute value is not a scalar, only the first index is used. +#' Use `NULL` to remove an attribute. #' @return #' - `xgb.attr()` returns either a string value of an attribute #' or `NULL` if an attribute wasn't stored in a model. @@ -720,15 +718,18 @@ xgb.attributes <- function(object) { return(object) } -#' @title Accessors for model parameters as JSON string -#' @details Note that assignment is performed in-place on the booster C object, which unlike assignment +#' Accessors for model parameters as JSON string +#' +#' @details +#' Note that assignment is performed in-place on the booster C object, which unlike assignment #' of R attributes, doesn't follow typical copy-on-write semantics for assignment - i.e. all references #' to the same booster will also get updated. #' -#' See \link{xgb.copy.Booster} for an example of this behavior. -#' @param object Object of class `xgb.Booster`. \bold{Will be modified in-place} when assigning to it. -#' @param value An R list. -#' @return `xgb.config` will return the parameters as an R list. +#' See [xgb.copy.Booster()] for an example of this behavior. +#' +#' @param object Object of class `xgb.Booster`.**Will be modified in-place** when assigning to it. +#' @param value A list. +#' @return Parameters as a list. 
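A brief sketch of the in-place semantics described above for C-level attributes; the attribute names and the `bst` booster are hypothetical.

xgb.attr(bst, "note") <- "fitted on the 2024 snapshot"       # stored inside the C booster
bst2 <- bst                                                   # both names reference the same booster
xgb.attr(bst2, "note")                                        # returns the value set through `bst`
xgb.attributes(bst) <- list(owner = "team-a", note = NULL)    # set several at once; NULL removes one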
#' @examples #' data(agaricus.train, package = "xgboost") #' @@ -767,23 +768,27 @@ xgb.config <- function(object) { return(object) } -#' @title Accessors for model parameters -#' @description Only the setter for xgboost parameters is currently implemented. -#' @details Just like \link{xgb.attr}, this function will make in-place modifications +#' Accessors for model parameters +#' +#' Only the setter for XGBoost parameters is currently implemented. +#' +#' @details +#' Just like [xgb.attr()], this function will make in-place modifications #' on the booster object which do not follow typical R assignment semantics - that is, #' all references to the same booster will also be updated, unlike assingment of R #' attributes which follow copy-on-write semantics. #' -#' See \link{xgb.copy.Booster} for an example of this behavior. +#' See [xgb.copy.Booster()] for an example of this behavior. #' #' Be aware that setting parameters of a fitted booster related to training continuation / updates #' will reset its number of rounds indicator to zero. -#' @param object Object of class `xgb.Booster`. \bold{Will be modified in-place}. +#' @param object Object of class `xgb.Booster`. **Will be modified in-place**. #' @param value A list (or an object coercible to a list) with the names of parameters to set #' and the elements corresponding to parameter values. #' @return The same booster `object`, which gets modified in-place. #' @examples #' data(agaricus.train, package = "xgboost") +#' #' train <- agaricus.train #' #' bst <- xgb.train( @@ -859,11 +864,12 @@ setinfo.xgb.Booster <- function(object, name, info) { return(TRUE) } -#' @title Get number of boosting in a fitted booster +#' Get number of boosting in a fitted booster +#' #' @param model,x A fitted `xgb.Booster` model. -#' @return The number of rounds saved in the model, as an integer. +#' @return The number of rounds saved in the model as an integer. #' @details Note that setting booster parameters related to training -#' continuation / updates through \link{xgb.parameters<-} will reset the +#' continuation / updates through [xgb.parameters<-()] will reset the #' number of rounds to zero. #' @export #' @rdname xgb.get.num.boosted.rounds @@ -877,16 +883,19 @@ length.xgb.Booster <- function(x) { return(xgb.get.num.boosted.rounds(x)) } -#' @title Slice Booster by Rounds -#' @description Creates a new booster including only a selected range of rounds / iterations +#' Slice Booster by Rounds +#' +#' Creates a new booster including only a selected range of rounds / iterations #' from an existing booster, as given by the sequence `seq(start, end, step)`. -#' @details Note that any R attributes that the booster might have, will not be copied into +#' +#' @details +#' Note that any R attributes that the booster might have, will not be copied into #' the resulting object. +#' #' @param model,x A fitted `xgb.Booster` object, which is to be sliced by taking only a subset #' of its rounds / iterations. -#' @param start Start of the slice (base-1 and inclusive, like R's \link{seq}). -#' @param end End of the slice (base-1 and inclusive, like R's \link{seq}). -#' +#' @param start Start of the slice (base-1 and inclusive, like R's [seq()]). +#' @param end End of the slice (base-1 and inclusive, like R's [seq()]). #' Passing a value of zero here is equivalent to passing the full number of rounds in the #' booster object. #' @param step Step size of the slice. 
Passing '1' will take every round in the sequence defined by @@ -894,8 +903,10 @@ length.xgb.Booster <- function(x) { #' @return A sliced booster object containing only the requested rounds. #' @examples #' data(mtcars) +#' #' y <- mtcars$mpg #' x <- as.matrix(mtcars[, -1]) +#' #' dm <- xgb.DMatrix(x, label = y, nthread = 1) #' model <- xgb.train(data = dm, params = list(nthread = 1), nrounds = 5) #' model_slice <- xgb.slice.Booster(model, 1, 3) @@ -948,10 +959,12 @@ xgb.slice.Booster <- function(model, start, end = xgb.get.num.boosted.rounds(mod return(xgb.slice.Booster(x, i[1L], i[length(i)], steps[1L])) } -#' @title Get Features Names from Booster -#' @description Returns the feature / variable / column names from a fitted -#' booster object, which are set automatically during the call to \link{xgb.train} -#' from the DMatrix names, or which can be set manually through \link{setinfo}. +#' Get Features Names from Booster +#' +#' @description +#' Returns the feature / variable / column names from a fitted +#' booster object, which are set automatically during the call to [xgb.train()] +#' from the DMatrix names, or which can be set manually through [setinfo()]. #' #' If the object doesn't have feature names, will return `NULL`. #' @@ -1002,23 +1015,25 @@ xgb.best_iteration <- function(bst) { return(out) } -#' @title Extract coefficients from linear booster -#' @description Extracts the coefficients from a 'gblinear' booster object, -#' as produced by \code{xgb.train} when using parameter `booster="gblinear"`. +#' Extract coefficients from linear booster +#' +#' @description +#' Extracts the coefficients from a 'gblinear' booster object, +#' as produced by [xgb.train()] when using parameter `booster="gblinear"`. #' #' Note: this function will error out if passing a booster model #' which is not of "gblinear" type. +#' #' @param object A fitted booster of 'gblinear' type. #' @param ... Not used. -#' @return The extracted coefficients:\itemize{ -#' \item If there's only one coefficient per column in the data, will be returned as a -#' vector, potentially containing the feature names if available, with the intercept -#' as first column. -#' \item If there's more than one coefficient per column in the data (e.g. when using -#' `objective="multi:softmax"`), will be returned as a matrix with dimensions equal -#' to `[num_features, num_cols]`, with the intercepts as first row. Note that the column -#' (classes in multi-class classification) dimension will not be named. -#' } +#' @return The extracted coefficients: +#' - If there is only one coefficient per column in the data, will be returned as a +#' vector, potentially containing the feature names if available, with the intercept +#' as first column. +#' - If there is more than one coefficient per column in the data (e.g. when using +#' `objective="multi:softmax"`), will be returned as a matrix with dimensions equal +#' to `[num_features, num_cols]`, with the intercepts as first row. Note that the column +#' (classes in multi-class classification) dimension will not be named. 
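A brief sketch of the single-output return shape described above (assuming `model` is a fitted "gblinear" booster; illustrative only):

    co <- coef(model)   # numeric vector: intercept first, then one coefficient per feature
    length(co)          # number of features + 1
    head(co)            # named when feature names are available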
#' #' The intercept returned here will include the 'base_score' parameter (unlike the 'bias' #' or the last coefficient in the model dump, which doesn't have 'base_score' added to it), @@ -1027,12 +1042,15 @@ xgb.best_iteration <- function(bst) { #' #' Be aware that the coefficients are obtained by first converting them to strings and #' back, so there will always be some very small lose of precision compared to the actual -#' coefficients as used by \link{predict.xgb.Booster}. +#' coefficients as used by [predict.xgb.Booster]. #' @examples #' library(xgboost) +#' #' data(mtcars) +#' #' y <- mtcars[, 1] #' x <- as.matrix(mtcars[, -1]) +#' #' dm <- xgb.DMatrix(data = x, label = y, nthread = 1) #' params <- list(booster = "gblinear", nthread = 1) #' model <- xgb.train(data = dm, params = params, nrounds = 2) @@ -1088,19 +1106,25 @@ coef.xgb.Booster <- function(object, ...) { return(out) } -#' @title Deep-copies a Booster Object -#' @description Creates a deep copy of an 'xgb.Booster' object, such that the +#' Deep-copies a Booster Object +#' +#' Creates a deep copy of an 'xgb.Booster' object, such that the #' C object pointer contained will be a different object, and hence functions -#' like \link{xgb.attr} will not affect the object from which it was copied. +#' like [xgb.attr()] will not affect the object from which it was copied. +#' #' @param model An 'xgb.Booster' object. #' @return A deep copy of `model` - it will be identical in every way, but C-level -#' functions called on that copy will not affect the `model` variable. +#' functions called on that copy will not affect the `model` variable. #' @examples #' library(xgboost) +#' #' data(mtcars) +#' #' y <- mtcars$mpg #' x <- mtcars[, -1] +#' #' dm <- xgb.DMatrix(x, label = y, nthread = 1) +#' #' model <- xgb.train( #' data = dm, #' params = list(nthread = 1), @@ -1135,29 +1159,35 @@ xgb.copy.Booster <- function(model) { return(.Call(XGDuplicate_R, model)) } -#' @title Check if two boosters share the same C object -#' @description Checks whether two booster objects refer to the same underlying C object. -#' @details As booster objects (as returned by e.g. \link{xgb.train}) contain an R 'externalptr' +#' Check if two boosters share the same C object +#' +#' Checks whether two booster objects refer to the same underlying C object. +#' +#' @details +#' As booster objects (as returned by e.g. [xgb.train()]) contain an R 'externalptr' #' object, they don't follow typical copy-on-write semantics of other R objects - that is, if #' one assigns a booster to a different variable and modifies that new variable through in-place -#' methods like \link{xgb.attr<-}, the modification will be applied to both the old and the new +#' methods like [xgb.attr<-()], the modification will be applied to both the old and the new #' variable, unlike typical R assignments which would only modify the latter. #' #' This function allows checking whether two booster objects share the same 'externalptr', #' regardless of the R attributes that they might have. #' #' In order to duplicate a booster in such a way that the copy wouldn't share the same -#' 'externalptr', one can use function \link{xgb.copy.Booster}. +#' 'externalptr', one can use function [xgb.copy.Booster()]. #' @param obj1 Booster model to compare with `obj2`. #' @param obj2 Booster model to compare with `obj1`. -#' @return Either `TRUE` or `FALSE` according to whether the two boosters share -#' the underlying C object. 
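A small sketch of the aliasing behaviour described here (assuming `bst` is a fitted booster):

    alias  <- bst                       # plain assignment: still the same C object
    copied <- xgb.copy.Booster(bst)     # deep copy with its own C object

    xgb.is.same.Booster(bst, alias)     # TRUE
    xgb.is.same.Booster(bst, copied)    # FALSE

    xgb.attr(alias, "note") <- "abc"    # also visible through `bst`
    xgb.attr(copied, "note")            # NULL - the deep copy is unaffected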
-#' @seealso \link{xgb.copy.Booster} +#' @return Either `TRUE` or `FALSE` according to whether the two boosters share the +#' underlying C object. +#' @seealso [xgb.copy.Booster()] #' @examples #' library(xgboost) +#' #' data(mtcars) +#' #' y <- mtcars$mpg #' x <- as.matrix(mtcars[, -1]) +#' #' model <- xgb.train( #' params = list(nthread = 1), #' data = xgb.DMatrix(x, label = y, nthread = 1), @@ -1210,10 +1240,10 @@ xgb.is.same.Booster <- function(obj1, obj2) { #' attr(bst, "myattr") <- "memo" #' #' print(bst) -#' +#' @method print xgb.Booster #' @export print.xgb.Booster <- function(x, ...) { - # this lets it error out when the object comes from an earlier R xgboost version + # this lets it error out when the object comes from an earlier R XGBoost version handle <- xgb.get.handle(x) cat('##### xgb.Booster\n') diff --git a/R-package/R/xgb.DMatrix.R b/R-package/R/xgb.DMatrix.R index d87d1cbf71c2..965aec49d79a 100644 --- a/R-package/R/xgb.DMatrix.R +++ b/R-package/R/xgb.DMatrix.R @@ -60,17 +60,17 @@ #' so it doesn't make sense to assign weights to individual data points. #' @param base_margin Base margin used for boosting from existing model. #' -#' In the case of multi-output models, one can also pass multi-dimensional base_margin. +#' In the case of multi-output models, one can also pass multi-dimensional base_margin. #' @param missing A float value to represents missing values in data (not used when creating DMatrix -#' from text files). -#' It is useful to change when a zero, infinite, or some other extreme value represents missing -#' values in data. +#' from text files). +#' It is useful to change when a zero, infinite, or some other extreme value represents missing +#' values in data. #' @param silent whether to suppress printing an informational message after loading from a file. #' @param feature_names Set names for features. Overrides column names in data -#' frame and matrix. +#' frame and matrix. #' -#' Note: columns are not referenced by name when calling `predict`, so the column order there -#' must be the same as in the DMatrix construction, regardless of the column names. +#' Note: columns are not referenced by name when calling `predict`, so the column order there +#' must be the same as in the DMatrix construction, regardless of the column names. #' @param feature_types Set types for features. #' #' If `data` is a `data.frame` and passing `feature_types` is not supplied, feature types will be deduced diff --git a/R-package/R/xgb.create.features.R b/R-package/R/xgb.create.features.R index 27f8a0975ae7..f9d892caa1e5 100644 --- a/R-package/R/xgb.create.features.R +++ b/R-package/R/xgb.create.features.R @@ -1,20 +1,15 @@ #' Create new features from a previously learned model #' -#' May improve the learning by adding new features to the training data based on the decision trees from a previously learned model. -#' -#' @param model decision tree boosting model learned on the original data -#' @param data original data (usually provided as a \code{dgCMatrix} matrix) -#' @param ... currently not used -#' -#' @return \code{dgCMatrix} matrix including both the original data and the new features. +#' May improve the learning by adding new features to the training data based on the +#' decision trees from a previously learned model. 
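Relating to the DMatrix arguments documented above (`missing`, `feature_names`, `base_margin`), a minimal construction sketch; the data and values are illustrative only:

    library(xgboost)
    data(mtcars)
    x <- as.matrix(mtcars[, -1])

    dm <- xgb.DMatrix(
      x,
      label = mtcars$mpg,
      missing = NA,                  # which value marks missing entries
      feature_names = colnames(x),   # overrides the matrix column names
      nthread = 1
    )
    # base_margin can also be attached after construction:
    setinfo(dm, "base_margin", rep(0.5, nrow(x)))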
#' #' @details #' This is the function inspired from the paragraph 3.1 of the paper: #' -#' \strong{Practical Lessons from Predicting Clicks on Ads at Facebook} +#' **Practical Lessons from Predicting Clicks on Ads at Facebook** #' -#' \emph{(Xinran He, Junfeng Pan, Ou Jin, Tianbing Xu, Bo Liu, Tao Xu, Yan, xin Shi, Antoine Atallah, Ralf Herbrich, Stuart Bowers, -#' Joaquin Quinonero Candela)} +#' *(Xinran He, Junfeng Pan, Ou Jin, Tianbing Xu, Bo Liu, Tao Xu, Yan, xin Shi, Antoine Atallah, Ralf Herbrich, Stuart Bowers, +#' Joaquin Quinonero Candela)* #' #' International Workshop on Data Mining for Online Advertising (ADKDD) - August 24, 2014 #' @@ -33,11 +28,11 @@ #' where the first subtree has 3 leafs and the second 2 leafs. If an #' instance ends up in leaf 2 in the first subtree and leaf 1 in #' second subtree, the overall input to the linear classifier will -#' be the binary vector \code{[0, 1, 0, 1, 0]}, where the first 3 entries +#' be the binary vector `[0, 1, 0, 1, 0]`, where the first 3 entries #' correspond to the leaves of the first subtree and last 2 to #' those of the second subtree. #' -#' [...] +#' ... #' #' We can understand boosted decision tree #' based transformation as a supervised feature encoding that @@ -45,16 +40,23 @@ #' vector. A traversal from root node to a leaf node represents #' a rule on certain features." #' +#' @param model Decision tree boosting model learned on the original data. +#' @param data Original data (usually provided as a `dgCMatrix` matrix). +#' @param ... Currently not used. +#' +#' @return A `dgCMatrix` matrix including both the original data and the new features. +#' #' @examples -#' data(agaricus.train, package='xgboost') -#' data(agaricus.test, package='xgboost') +#' data(agaricus.train, package = "xgboost") +#' data(agaricus.test, package = "xgboost") +#' #' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2)) #' dtest <- with(agaricus.test, xgb.DMatrix(data, label = label, nthread = 2)) #' -#' param <- list(max_depth=2, eta=1, objective='binary:logistic') +#' param <- list(max_depth = 2, eta = 1, objective = 'binary:logistic') #' nrounds = 4 #' -#' bst = xgb.train(params = param, data = dtrain, nrounds = nrounds, nthread = 2) +#' bst <- xgb.train(params = param, data = dtrain, nrounds = nrounds, nthread = 2) #' #' # Model accuracy without new features #' accuracy.before <- sum((predict(bst, agaricus.test$data) >= 0.5) == agaricus.test$label) / diff --git a/R-package/R/xgb.dump.R b/R-package/R/xgb.dump.R index ef7202a1a5db..76271ec515c9 100644 --- a/R-package/R/xgb.dump.R +++ b/R-package/R/xgb.dump.R @@ -1,36 +1,44 @@ -#' Dump an xgboost model in text format. +#' Dump an XGBoost model in text format. #' -#' Dump an xgboost model in text format. +#' Dump an XGBoost model in text format. #' -#' @param model the model object. -#' @param fname the name of the text file where to save the model text dump. -#' If not provided or set to \code{NULL}, the model is returned as a \code{character} vector. -#' @param fmap feature map file representing feature types. -#' See demo/ for walkthrough example in R, and -#' \url{https://github.com/dmlc/xgboost/blob/master/demo/data/featmap.txt} -#' for example Format. -#' @param with_stats whether to dump some additional statistics about the splits. -#' When this option is on, the model dump contains two additional values: -#' gain is the approximate loss function gain we get in each split; -#' cover is the sum of second order gradient in each node. 
-#' @param dump_format either 'text', 'json', or 'dot' (graphviz) format could be specified. +#' @param model The model object. +#' @param fname The name of the text file where to save the model text dump. +#' If not provided or set to `NULL`, the model is returned as a character vector. +#' @param fmap Feature map file representing feature types. See demo/ for a walkthrough +#' example in R, and \url{https://github.com/dmlc/xgboost/blob/master/demo/data/featmap.txt} +#' to see an example of the value. +#' @param with_stats Whether to dump some additional statistics about the splits. +#' When this option is on, the model dump contains two additional values: +#' gain is the approximate loss function gain we get in each split; +#' cover is the sum of second order gradient in each node. +#' @param dump_format Either 'text', 'json', or 'dot' (graphviz) format could be specified. #' -#' Format 'dot' for a single tree can be passed directly to packages that consume this format -#' for graph visualization, such as function [DiagrammeR::grViz()] -#' @param ... currently not used +#' Format 'dot' for a single tree can be passed directly to packages that consume this format +#' for graph visualization, such as function `DiagrammeR::grViz()` +#' @param ... Currently not used #' #' @return -#' If fname is not provided or set to \code{NULL} the function will return the model -#' as a \code{character} vector. Otherwise it will return \code{TRUE}. +#' If fname is not provided or set to `NULL` the function will return the model +#' as a character vector. Otherwise it will return `TRUE`. #' #' @examples #' \dontshow{RhpcBLASctl::omp_set_num_threads(1)} -#' data(agaricus.train, package='xgboost') -#' data(agaricus.test, package='xgboost') +#' data(agaricus.train, package = "xgboost") +#' data(agaricus.test, package = "xgboost") +#' #' train <- agaricus.train #' test <- agaricus.test -#' bst <- xgb.train(data = xgb.DMatrix(train$data, label = train$label), max_depth = 2, -#' eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic") +#' +#' bst <- xgb.train( +#' data = xgb.DMatrix(train$data, label = train$label), +#' max_depth = 2, +#' eta = 1, +#' nthread = 2, +#' nrounds = 2, +#' objective = "binary:logistic" +#' ) +#' #' # save the model in file 'xgb.model.dump' #' dump_path = file.path(tempdir(), 'model.dump') #' xgb.dump(bst, dump_path, with_stats = TRUE) @@ -39,7 +47,7 @@ #' print(xgb.dump(bst, with_stats = TRUE)) #' #' # print in JSON format: -#' cat(xgb.dump(bst, with_stats = TRUE, dump_format='json')) +#' cat(xgb.dump(bst, with_stats = TRUE, dump_format = "json")) #' #' # plot first tree leveraging the 'dot' format #' if (requireNamespace('DiagrammeR', quietly = TRUE)) { diff --git a/R-package/R/xgb.ggplot.R b/R-package/R/xgb.ggplot.R index 1fe30ba2b4b6..3e2e6e8e9603 100644 --- a/R-package/R/xgb.ggplot.R +++ b/R-package/R/xgb.ggplot.R @@ -1,6 +1,5 @@ # ggplot backend for the xgboost plotting facilities - #' @rdname xgb.plot.importance #' @export xgb.ggplot.importance <- function(importance_matrix = NULL, top_n = NULL, measure = NULL, @@ -135,8 +134,7 @@ xgb.ggplot.shap.summary <- function(data, shap_contrib = NULL, features = NULL, #' @param data_list The result of `xgb.shap.data()`. #' @param normalize Whether to standardize feature values to mean 0 and #' standard deviation 1. This is useful for comparing multiple features on the same -#' plot. Default is \code{FALSE}. -#' +#' plot. Default is `FALSE`. 
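The standardization mentioned here is ordinary centring and scaling; a minimal sketch of such a step (the helper name is illustrative, not part of the package API):

    standardize <- function(x) {
      (x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE)   # mean 0, standard deviation 1
    }
    standardize(c(1, 2, 4, 7, NA))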
#' @return A `data.table` containing the observation ID, the feature name, the #' feature value (normalized if specified), and the SHAP contribution value. #' @noRd @@ -167,7 +165,6 @@ prepare.ggplot.shap.data <- function(data_list, normalize = FALSE) { #' Useful to compare multiple features on the same plot. #' #' @param x Numeric vector. -#' #' @return Numeric vector with mean 0 and standard deviation 1. #' @noRd #' @keywords internal diff --git a/R-package/R/xgb.importance.R b/R-package/R/xgb.importance.R index bbf816a0d6cc..548421d2c83c 100644 --- a/R-package/R/xgb.importance.R +++ b/R-package/R/xgb.importance.R @@ -2,27 +2,25 @@ #' #' Creates a `data.table` of feature importances. #' -#' @param feature_names Character vector used to overwrite the feature names -#' of the model. The default is `NULL` (use original feature names). -#' @param model Object of class `xgb.Booster`. -#' @param trees An integer vector of tree indices that should be included -#' into the importance calculation (only for the "gbtree" booster). -#' The default (`NULL`) parses all trees. -#' It could be useful, e.g., in multiclass classification to get feature importances -#' for each class separately. *Important*: the tree index in XGBoost models -#' is zero-based (e.g., use `trees = 0:4` for the first five trees). -#' @param data Deprecated. -#' @param label Deprecated. -#' @param target Deprecated. -#' #' @details -#' #' This function works for both linear and tree models. #' #' For linear models, the importance is the absolute magnitude of linear coefficients. #' To obtain a meaningful ranking by importance for linear models, the features need to #' be on the same scale (which is also recommended when using L1 or L2 regularization). #' +#' @param feature_names Character vector used to overwrite the feature names +#' of the model. The default is `NULL` (use original feature names). +#' @param model Object of class `xgb.Booster`. +#' @param trees An integer vector of tree indices that should be included +#' into the importance calculation (only for the "gbtree" booster). +#' The default (`NULL`) parses all trees. +#' It could be useful, e.g., in multiclass classification to get feature importances +#' for each class separately. *Important*: the tree index in XGBoost models +#' is zero-based (e.g., use `trees = 0:4` for the first five trees). +#' @param data Deprecated. +#' @param label Deprecated. +#' @param target Deprecated. #' @return A `data.table` with the following columns: #' #' For a tree model: diff --git a/R-package/R/xgb.load.R b/R-package/R/xgb.load.R index d5b192bcb6fa..29ab2dadaf72 100644 --- a/R-package/R/xgb.load.R +++ b/R-package/R/xgb.load.R @@ -1,28 +1,27 @@ -#' Load xgboost model from binary file +#' Load XGBoost model from binary file #' -#' Load xgboost model from the binary model file. +#' Load XGBoost model from binary model file. #' -#' @param modelfile the name of the binary input file. +#' @param modelfile The name of the binary input file. #' #' @details -#' The input file is expected to contain a model saved in an xgboost model format -#' using either \code{\link{xgb.save}} or \code{\link{xgb.cb.save.model}} in R, or using some -#' appropriate methods from other xgboost interfaces. E.g., a model trained in Python and -#' saved from there in xgboost format, could be loaded from R. +#' The input file is expected to contain a model saved in an XGBoost model format +#' using either [xgb.save()] in R, or using some +#' appropriate methods from other XGBoost interfaces. 
E.g., a model trained in Python and +#' saved from there in XGBoost format, could be loaded from R. #' -#' Note: a model saved as an R-object, has to be loaded using corresponding R-methods, -#' not \code{xgb.load}. +#' Note: a model saved as an R object has to be loaded using corresponding R-methods, +#' not by [xgb.load()]. #' #' @return -#' An object of \code{xgb.Booster} class. +#' An object of `xgb.Booster` class. #' -#' @seealso -#' \code{\link{xgb.save}} +#' @seealso [xgb.save()] #' #' @examples #' \dontshow{RhpcBLASctl::omp_set_num_threads(1)} -#' data(agaricus.train, package='xgboost') -#' data(agaricus.test, package='xgboost') +#' data(agaricus.train, package = "xgboost") +#' data(agaricus.test, package = "xgboost") #' #' ## Keep the number of threads to 1 for examples #' nthread <- 1 @@ -30,6 +29,7 @@ #' #' train <- agaricus.train #' test <- agaricus.test +#' #' bst <- xgb.train( #' data = xgb.DMatrix(train$data, label = train$label), #' max_depth = 2, diff --git a/R-package/R/xgb.load.raw.R b/R-package/R/xgb.load.raw.R index 73ac50dc6662..87607deab7f8 100644 --- a/R-package/R/xgb.load.raw.R +++ b/R-package/R/xgb.load.raw.R @@ -1,8 +1,8 @@ -#' Load serialised xgboost model from R's raw vector +#' Load serialised XGBoost model from R's raw vector #' -#' User can generate raw memory buffer by calling xgb.save.raw +#' User can generate raw memory buffer by calling [xgb.save.raw()]. #' -#' @param buffer the buffer returned by xgb.save.raw +#' @param buffer The buffer returned by [xgb.save.raw()]. #' @export xgb.load.raw <- function(buffer) { cachelist <- list() diff --git a/R-package/R/xgb.model.dt.tree.R b/R-package/R/xgb.model.dt.tree.R index eb64b1eabb4f..36e7af212a51 100644 --- a/R-package/R/xgb.model.dt.tree.R +++ b/R-package/R/xgb.model.dt.tree.R @@ -2,18 +2,17 @@ #' #' Parse a boosted tree model text dump into a `data.table` structure. #' -#' @param model Object of class `xgb.Booster`. If it contains feature names (they can be set through -#' \link{setinfo}), they will be used in the output from this function. +#' @param model Object of class `xgb.Booster`. If it contains feature names (they can +#' be set through [setinfo()]), they will be used in the output from this function. #' @param text Character vector previously generated by the function [xgb.dump()] -#' (called with parameter `with_stats = TRUE`). `text` takes precedence over `model`. -#' @param trees An integer vector of tree indices that should be used. -#' The default (`NULL`) uses all trees. -#' Useful, e.g., in multiclass classification to get only -#' the trees of one class. *Important*: the tree index in XGBoost models -#' is zero-based (e.g., use `trees = 0:4` for the first five trees). +#' (called with parameter `with_stats = TRUE`). `text` takes precedence over `model`. +#' @param trees An integer vector of tree indices that should be used. The default +#' (`NULL`) uses all trees. Useful, e.g., in multiclass classification to get only +#' the trees of one class. *Important*: the tree index in XGBoost models +#' is zero-based (e.g., use `trees = 0:4` for the first five trees). #' @param use_int_id A logical flag indicating whether nodes in columns "Yes", "No", and -#' "Missing" should be represented as integers (when `TRUE`) or as "Tree-Node" -#' character strings (when `FALSE`, default). +#' "Missing" should be represented as integers (when `TRUE`) or as "Tree-Node" +#' character strings (when `FALSE`, default). #' @param ... Currently not used. 
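A usage sketch of this parser (assuming `bst` is a fitted tree booster):

    dt <- xgb.model.dt.tree(model = bst)         # one row per node, across all trees
    head(dt[, c("Tree", "Node", "Feature")])
    xgb.model.dt.tree(model = bst, trees = 0)    # restrict to the first tree (zero-based)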
#' #' @return @@ -195,7 +194,7 @@ xgb.model.dt.tree <- function(model = NULL, text = NULL, td[order(Tree, Node)] } -# Avoid error messages during CRAN check. +# Avoid notes during CRAN check. # The reason is that these variables are never declared # They are mainly column names inferred by Data.table... globalVariables(c("Tree", "Node", "ID", "Feature", "t", "isLeaf", ".SD", ".SDcols")) diff --git a/R-package/R/xgb.plot.deepness.R b/R-package/R/xgb.plot.deepness.R index 956ee9c83fd0..c8aa92f22f6b 100644 --- a/R-package/R/xgb.plot.deepness.R +++ b/R-package/R/xgb.plot.deepness.R @@ -4,7 +4,8 @@ #' - `xgb.plot.deepness()` uses base R graphics, while #' - `xgb.ggplot.deepness()` uses "ggplot2". #' -#' @param model Either an `xgb.Booster` model, or the "data.table" returned by [xgb.model.dt.tree()]. +#' @param model Either an `xgb.Booster` model, or the "data.table" returned +#' by [xgb.model.dt.tree()]. #' @param which Which distribution to plot (see details). #' @param plot Should the plot be shown? Default is `TRUE`. #' @param ... Other parameters passed to [graphics::barplot()] or [graphics::plot()]. diff --git a/R-package/R/xgb.plot.importance.R b/R-package/R/xgb.plot.importance.R index 199595cb8ddf..11be29a7cb68 100644 --- a/R-package/R/xgb.plot.importance.R +++ b/R-package/R/xgb.plot.importance.R @@ -4,25 +4,9 @@ #' - `xgb.plot.importance()` uses base R graphics, while #' - `xgb.ggplot.importance()` uses "ggplot". #' -#' @param importance_matrix A `data.table` as returned by [xgb.importance()]. -#' @param top_n Maximal number of top features to include into the plot. -#' @param measure The name of importance measure to plot. -#' When `NULL`, 'Gain' would be used for trees and 'Weight' would be used for gblinear. -#' @param rel_to_first Whether importance values should be represented as relative to -#' the highest ranked feature, see Details. -#' @param left_margin Adjust the left margin size to fit feature names. -#' When `NULL`, the existing `par("mar")` is used. -#' @param cex Passed as `cex.names` parameter to [graphics::barplot()]. -#' @param plot Should the barplot be shown? Default is `TRUE`. -#' @param n_clusters A numeric vector containing the min and the max range -#' of the possible number of clusters of bars. -#' @param ... Other parameters passed to [graphics::barplot()] -#' (except `horiz`, `border`, `cex.names`, `names.arg`, and `las`). -#' Only used in `xgb.plot.importance()`. -#' #' @details -#' The graph represents each feature as a horizontal bar of length proportional to the importance of a feature. -#' Features are sorted by decreasing importance. +#' The graph represents each feature as a horizontal bar of length proportional to the +#' importance of a feature. Features are sorted by decreasing importance. #' It works for both "gblinear" and "gbtree" models. #' #' When `rel_to_first = FALSE`, the values would be plotted as in `importance_matrix`. @@ -35,6 +19,21 @@ #' The "ggplot" backend performs 1-D clustering of the importance values, #' with bar colors corresponding to different clusters having similar importance values. #' +#' @param importance_matrix A `data.table` as returned by [xgb.importance()]. +#' @param top_n Maximal number of top features to include into the plot. +#' @param measure The name of importance measure to plot. +#' When `NULL`, 'Gain' would be used for trees and 'Weight' would be used for gblinear. +#' @param rel_to_first Whether importance values should be represented as relative to +#' the highest ranked feature, see Details. 
+#' @param left_margin Adjust the left margin size to fit feature names. +#' When `NULL`, the existing `par("mar")` is used. +#' @param cex Passed as `cex.names` parameter to [graphics::barplot()]. +#' @param plot Should the barplot be shown? Default is `TRUE`. +#' @param n_clusters A numeric vector containing the min and the max range +#' of the possible number of clusters of bars. +#' @param ... Other parameters passed to [graphics::barplot()] +#' (except `horiz`, `border`, `cex.names`, `names.arg`, and `las`). +#' Only used in `xgb.plot.importance()`. #' @return #' The return value depends on the function: #' - `xgb.plot.importance()`: Invisibly, a "data.table" with `n_top` features sorted diff --git a/R-package/R/xgb.plot.multi.trees.R b/R-package/R/xgb.plot.multi.trees.R index 19a114071509..f3f6bddffa8e 100644 --- a/R-package/R/xgb.plot.multi.trees.R +++ b/R-package/R/xgb.plot.multi.trees.R @@ -2,12 +2,7 @@ #' #' Visualization of the ensemble of trees as a single collective unit. #' -#' @inheritParams xgb.plot.tree -#' @param features_keep Number of features to keep in each position of the multi trees, -#' by default 5. -#' #' @details -#' #' This function tries to capture the complexity of a gradient boosted tree model #' in a cohesive way by compressing an ensemble of trees into a single tree-graph representation. #' The goal is to improve the interpretability of a model generally seen as black box. @@ -25,6 +20,9 @@ #' This function is inspired by this blog post: #' #' +#' @inheritParams xgb.plot.tree +#' @param features_keep Number of features to keep in each position of the multi trees, +#' by default 5. #' @inherit xgb.plot.tree return #' #' @examples diff --git a/R-package/R/xgb.plot.shap.R b/R-package/R/xgb.plot.shap.R index 20e8f3f4322e..79c2ed328a7a 100644 --- a/R-package/R/xgb.plot.shap.R +++ b/R-package/R/xgb.plot.shap.R @@ -4,42 +4,41 @@ #' #' @param data The data to explain as a `matrix` or `dgCMatrix`. #' @param shap_contrib Matrix of SHAP contributions of `data`. -#' The default (`NULL`) computes it from `model` and `data`. -#' @param features Vector of column indices or feature names to plot. -#' When `NULL` (default), the `top_n` most important features are selected -#' by [xgb.importance()]. +#' The default (`NULL`) computes it from `model` and `data`. +#' @param features Vector of column indices or feature names to plot. When `NULL` +#' (default), the `top_n` most important features are selected by [xgb.importance()]. #' @param top_n How many of the most important features (<= 100) should be selected? -#' By default 1 for SHAP dependence and 10 for SHAP summary). -#' Only used when `features = NULL`. +#' By default 1 for SHAP dependence and 10 for SHAP summary. +#' Only used when `features = NULL`. #' @param model An `xgb.Booster` model. Only required when `shap_contrib = NULL` or -#' `features = NULL`. +#' `features = NULL`. #' @param trees Passed to [xgb.importance()] when `features = NULL`. #' @param target_class Only relevant for multiclass models. The default (`NULL`) -#' averages the SHAP values over all classes. Pass a (0-based) class index -#' to show only SHAP values of that class. +#' averages the SHAP values over all classes. Pass a (0-based) class index +#' to show only SHAP values of that class. #' @param approxcontrib Passed to `predict()` when `shap_contrib = NULL`. #' @param subsample Fraction of data points randomly picked for plotting. -#' The default (`NULL`) will use up to 100k data points. 
+#' The default (`NULL`) will use up to 100k data points. #' @param n_col Number of columns in a grid of plots. #' @param col Color of the scatterplot markers. #' @param pch Scatterplot marker. #' @param discrete_n_uniq Maximal number of unique feature values to consider the -#' feature as discrete. +#' feature as discrete. #' @param discrete_jitter Jitter amount added to the values of discrete features. #' @param ylab The y-axis label in 1D plots. #' @param plot_NA Should contributions of cases with missing values be plotted? -#' Default is `TRUE`. +#' Default is `TRUE`. #' @param col_NA Color of marker for missing value contributions. #' @param pch_NA Marker type for `NA` values. #' @param pos_NA Relative position of the x-location where `NA` values are shown: -#' `min(x) + (max(x) - min(x)) * pos_NA`. +#' `min(x) + (max(x) - min(x)) * pos_NA`. #' @param plot_loess Should loess-smoothed curves be plotted? (Default is `TRUE`). -#' The smoothing is only done for features with more than 5 distinct values. +#' The smoothing is only done for features with more than 5 distinct values. #' @param col_loess Color of loess curves. #' @param span_loess The `span` parameter of [stats::loess()]. #' @param which Whether to do univariate or bivariate plotting. Currently, only "1d" is implemented. #' @param plot Should the plot be drawn? (Default is `TRUE`). -#' If `FALSE`, only a list of matrices is returned. +#' If `FALSE`, only a list of matrices is returned. #' @param ... Other parameters passed to [graphics::plot()]. #' #' @details @@ -120,6 +119,7 @@ #' ) #' trees0 <- seq(from = 0, by = nclass, length.out = nrounds) #' col <- rgb(0, 0, 1, 0.5) +#' #' xgb.plot.shap( #' x, #' model = mbst, diff --git a/R-package/R/xgb.plot.tree.R b/R-package/R/xgb.plot.tree.R index 502de3f52d61..5ae3f0227d1d 100644 --- a/R-package/R/xgb.plot.tree.R +++ b/R-package/R/xgb.plot.tree.R @@ -2,36 +2,7 @@ #' #' Read a tree model text dump and plot the model. #' -#' @param model Object of class `xgb.Booster`. If it contains feature names (they can be set through -#' \link{setinfo}), they will be used in the output from this function. -#' @param trees An integer vector of tree indices that should be used. -#' The default (`NULL`) uses all trees. -#' Useful, e.g., in multiclass classification to get only -#' the trees of one class. *Important*: the tree index in XGBoost models -#' is zero-based (e.g., use `trees = 0:2` for the first three trees). -#' @param plot_width,plot_height Width and height of the graph in pixels. -#' The values are passed to [DiagrammeR::render_graph()]. -#' @param render Should the graph be rendered or not? The default is `TRUE`. -#' @param show_node_id a logical flag for whether to show node id's in the graph. -#' @param style Style to use for the plot. Options are:\itemize{ -#' \item `"xgboost"`: will use the plot style defined in the core XGBoost library, -#' which is shared between different interfaces through the 'dot' format. This -#' style was not available before version 2.1.0 in R. It always plots the trees -#' vertically (from top to bottom). -#' \item `"R"`: will use the style defined from XGBoost's R interface, which predates -#' the introducition of the standardized style from the core library. It might plot -#' the trees horizontally (from left to right). -#' } -#' -#' Note that `style="xgboost"` is only supported when all of the following conditions are met:\itemize{ -#' \item Only a single tree is being plotted. -#' \item Node IDs are not added to the graph. 
-#' \item The graph is being returned as `htmlwidget` (`render=TRUE`). -#' } -#' @param ... currently not used. -#' #' @details -#' #' When using `style="xgboost"`, the content of each node is visualized as follows: #' - For non-terminal nodes, it will display the split condition (number or name if #' available, and the condition that would decide to which node to go next). @@ -56,6 +27,31 @@ #' #' This function uses [GraphViz](https://www.graphviz.org/) as DiagrammeR backend. #' +#' @param model Object of class `xgb.Booster`. If it contains feature names (they can be set through +#' \link{setinfo}), they will be used in the output from this function. +#' @param trees An integer vector of tree indices that should be used. +#' The default (`NULL`) uses all trees. +#' Useful, e.g., in multiclass classification to get only +#' the trees of one class. *Important*: the tree index in XGBoost models +#' is zero-based (e.g., use `trees = 0:2` for the first three trees). +#' @param plot_width,plot_height Width and height of the graph in pixels. +#' The values are passed to `DiagrammeR::render_graph()`. +#' @param render Should the graph be rendered or not? The default is `TRUE`. +#' @param show_node_id a logical flag for whether to show node id's in the graph. +#' @param style Style to use for the plot: +#' - `"xgboost"`: will use the plot style defined in the core XGBoost library, +#' which is shared between different interfaces through the 'dot' format. This +#' style was not available before version 2.1.0 in R. It always plots the trees +#' vertically (from top to bottom). +#' - `"R"`: will use the style defined from XGBoost's R interface, which predates +#' the introducition of the standardized style from the core library. It might plot +#' the trees horizontally (from left to right). +#' +#' Note that `style="xgboost"` is only supported when all of the following conditions are met: +#' - Only a single tree is being plotted. +#' - Node IDs are not added to the graph. +#' - The graph is being returned as `htmlwidget` (`render=TRUE`). +#' @param ... Currently not used. #' @return #' The value depends on the `render` parameter: #' - If `render = TRUE` (default): Rendered graph object which is an htmlwidget of @@ -63,7 +59,7 @@ #' running from the command line. #' - If `render = FALSE`: Graph object which is of DiagrammeR's class `dgr_graph`. #' This could be useful if one wants to modify some of the graph attributes -#' before rendering the graph with [DiagrammeR::render_graph()]. +#' before rendering the graph with `DiagrammeR::render_graph()`. #' #' @examples #' data(agaricus.train, package = "xgboost") diff --git a/R-package/R/xgb.save.R b/R-package/R/xgb.save.R index 91c545ff76fd..aac8cf40d328 100644 --- a/R-package/R/xgb.save.R +++ b/R-package/R/xgb.save.R @@ -1,43 +1,39 @@ -#' Save xgboost model to binary file +#' Save XGBoost model to binary file #' -#' Save xgboost model to a file in binary or JSON format. +#' Save XGBoost model to a file in binary or JSON format. #' #' @param model Model object of \code{xgb.Booster} class. -#' @param fname Name of the file to write. -#' -#' Note that the extension of this file name determined the serialization format to use:\itemize{ -#' \item Extension ".ubj" will use the universal binary JSON format (recommended). -#' This format uses binary types for e.g. floating point numbers, thereby preventing any loss -#' of precision when converting to a human-readable JSON text or similar. 
-#' \item Extension ".json" will use plain JSON, which is a human-readable format.
-#' \item Extension ".deprecated" will use a \bold{deprecated} binary format. This format will
-#' not be able to save attributes introduced after v1 of XGBoost, such as the "best_iteration"
-#' attribute that boosters might keep, nor feature names or user-specifiec attributes.
-#' \item If the format is not specified by passing one of the file extensions above, will
-#' default to UBJ.
-#' }
+#' @param fname Name of the file to write. Its extension determines the serialization format:
+#' - ".ubj": Use the universal binary JSON format (recommended).
+#' This format uses binary types for e.g. floating point numbers, thereby preventing any loss
+#' of precision when converting to a human-readable JSON text or similar.
+#' - ".json": Use plain JSON, which is a human-readable format.
+#' - ".deprecated": Use **deprecated** binary format. This format will
+#' not be able to save attributes introduced after v1 of XGBoost, such as the "best_iteration"
+#' attribute that boosters might keep, nor feature names or user-specified attributes.
+#' - If the format is not specified by passing one of the file extensions above, will
+#' default to UBJ.
 #'
 #' @details
-#' This methods allows to save a model in an xgboost-internal binary or text format which is universal
-#' among the various xgboost interfaces. In R, the saved model file could be read-in later
-#' using either the \code{\link{xgb.load}} function or the \code{xgb_model} parameter
-#' of \code{\link{xgb.train}}.
 #'
-#' Note: a model can also be saved as an R-object (e.g., by using \code{\link[base]{readRDS}}
-#' or \code{\link[base]{save}}). However, it would then only be compatible with R, and
-#' corresponding R-methods would need to be used to load it. Moreover, persisting the model with
-#' \code{\link[base]{readRDS}} or \code{\link[base]{save}}) might cause compatibility problems in
-#' future versions of XGBoost. Consult \code{\link{a-compatibility-note-for-saveRDS-save}} to learn
-#' how to persist models in a future-proof way, i.e. to make the model accessible in future
+#' This method allows saving a model in an XGBoost-internal binary or text format which is universal
+#' among the various XGBoost interfaces. In R, the saved model file could be read later
+#' using either the [xgb.load()] function or the `xgb_model` parameter of [xgb.train()].
+#'
+#' Note: a model can also be saved as an R object (e.g., by using [readRDS()]
+#' or [save()]). However, it would then only be compatible with R, and
+#' corresponding R methods would need to be used to load it. Moreover, persisting the model with
+#' [readRDS()] or [save()] might cause compatibility problems in
+#' future versions of XGBoost. Consult [a-compatibility-note-for-saveRDS-save] to learn
+#' how to persist models in a future-proof way, i.e., to make the model accessible in future
 #' releases of XGBoost.
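A short sketch of how the file extension selects the serialization format (assuming `bst` is a fitted booster; the file paths are illustrative):

    f_ubj  <- file.path(tempdir(), "model.ubj")    # universal binary JSON (recommended)
    f_json <- file.path(tempdir(), "model.json")   # plain, human-readable JSON
    xgb.save(bst, f_ubj)
    xgb.save(bst, f_json)
    bst2 <- xgb.load(f_ubj)                        # read either file back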
#' -#' @seealso -#' \code{\link{xgb.load}} +#' @seealso [xgb.load()] #' #' @examples #' \dontshow{RhpcBLASctl::omp_set_num_threads(1)} -#' data(agaricus.train, package='xgboost') -#' data(agaricus.test, package='xgboost') +#' data(agaricus.train, package = "xgboost") +#' data(agaricus.test, package = "xgboost") #' #' ## Keep the number of threads to 1 for examples #' nthread <- 1 @@ -45,6 +41,7 @@ #' #' train <- agaricus.train #' test <- agaricus.test +#' #' bst <- xgb.train( #' data = xgb.DMatrix(train$data, label = train$label), #' max_depth = 2, @@ -53,6 +50,7 @@ #' nrounds = 2, #' objective = "binary:logistic" #' ) +#' #' fname <- file.path(tempdir(), "xgb.ubj") #' xgb.save(bst, fname) #' bst <- xgb.load(fname) diff --git a/R-package/R/xgb.save.raw.R b/R-package/R/xgb.save.raw.R index c04f06d9c941..197c0980d9ff 100644 --- a/R-package/R/xgb.save.raw.R +++ b/R-package/R/xgb.save.raw.R @@ -1,29 +1,34 @@ -#' Save xgboost model to R's raw vector, -#' user can call xgb.load.raw to load the model back from raw vector +#' Save XGBoost model to R's raw vector #' -#' Save xgboost model from xgboost or xgb.train +#' Save XGBoost model from [xgboost()] or [xgb.train()]. +#' Call [xgb.load.raw()] to load the model back from raw vector. #' -#' @param model the model object. -#' @param raw_format The format for encoding the booster. Available options are -#' \itemize{ -#' \item \code{json}: Encode the booster into JSON text document. -#' \item \code{ubj}: Encode the booster into Universal Binary JSON. -#' \item \code{deprecated}: Encode the booster into old customized binary format. -#' } +#' @param model The model object. +#' @param raw_format The format for encoding the booster: +#' - "json": Encode the booster into JSON text document. +#' - "ubj": Encode the booster into Universal Binary JSON. +#' - "deprecated": Encode the booster into old customized binary format. #' #' @examples #' \dontshow{RhpcBLASctl::omp_set_num_threads(1)} -#' data(agaricus.train, package='xgboost') -#' data(agaricus.test, package='xgboost') +#' data(agaricus.train, package = "xgboost") +#' data(agaricus.test, package = "xgboost") #' -#' ## Keep the number of threads to 2 for examples -#' nthread <- 2 +#' ## Keep the number of threads to 1 for examples +#' nthread <- 1 #' data.table::setDTthreads(nthread) #' #' train <- agaricus.train #' test <- agaricus.test -#' bst <- xgb.train(data = xgb.DMatrix(train$data, label = train$label), max_depth = 2, -#' eta = 1, nthread = nthread, nrounds = 2,objective = "binary:logistic") +#' +#' bst <- xgb.train( +#' data = xgb.DMatrix(train$data, label = train$label), +#' max_depth = 2, +#' eta = 1, +#' nthread = nthread, +#' nrounds = 2, +#' objective = "binary:logistic" +#' ) #' #' raw <- xgb.save.raw(bst) #' bst <- xgb.load.raw(raw) diff --git a/R-package/R/xgboost.R b/R-package/R/xgboost.R index 9ea66731bf81..20a252b9dfd5 100644 --- a/R-package/R/xgboost.R +++ b/R-package/R/xgboost.R @@ -944,6 +944,7 @@ xgboost <- function( return(model) } +#' @method print xgboost #' @export print.xgboost <- function(x, ...) 
{ cat("XGBoost model object\n") diff --git a/R-package/man/a-compatibility-note-for-saveRDS-save.Rd b/R-package/man/a-compatibility-note-for-saveRDS-save.Rd index 860f4f0c1580..6d4446f78f84 100644 --- a/R-package/man/a-compatibility-note-for-saveRDS-save.Rd +++ b/R-package/man/a-compatibility-note-for-saveRDS-save.Rd @@ -5,66 +5,77 @@ \title{Model Serialization and Compatibility} \description{ When it comes to serializing XGBoost models, it's possible to use R serializers such as -\link{save} or \link{saveRDS} to serialize an XGBoost R model, but XGBoost also provides +\code{\link[=save]{save()}} or \code{\link[=saveRDS]{saveRDS()}} to serialize an XGBoost R model, but XGBoost also provides its own serializers with better compatibility guarantees, which allow loading said models in other language bindings of XGBoost. -Note that an \code{xgb.Booster} object, outside of its core components, might also keep:\itemize{ -\item Additional model configuration (accessible through \link{xgb.config}), -which includes model fitting parameters like \code{max_depth} and runtime parameters like \code{nthread}. +Note that an \code{xgb.Booster} object, outside of its core components, might also keep: +\itemize{ +\item Additional model configuration (accessible through \code{\link[=xgb.config]{xgb.config()}}), which includes +model fitting parameters like \code{max_depth} and runtime parameters like \code{nthread}. These are not necessarily useful for prediction/importance/plotting. -\item Additional R-specific attributes - e.g. results of callbacks, such as evaluation logs, -which are kept as a \code{data.table} object, accessible through \code{attributes(model)$evaluation_log} -if present. +\item Additional R specific attributes - e.g. results of callbacks, such as evaluation logs, +which are kept as a \code{data.table} object, accessible through +\code{attributes(model)$evaluation_log} if present. } The first one (configurations) does not have the same compatibility guarantees as -the model itself, including attributes that are set and accessed through \link{xgb.attributes} - that is, such configuration -might be lost after loading the booster in a different XGBoost version, regardless of the -serializer that was used. These are saved when using \link{saveRDS}, but will be discarded -if loaded into an incompatible XGBoost version. They are not saved when using XGBoost's -serializers from its public interface including \link{xgb.save} and \link{xgb.save.raw}. +the model itself, including attributes that are set and accessed through +\code{\link[=xgb.attributes]{xgb.attributes()}} - that is, such configuration might be lost after loading the +booster in a different XGBoost version, regardless of the serializer that was used. +These are saved when using \code{\link[=saveRDS]{saveRDS()}}, but will be discarded if loaded into an +incompatible XGBoost version. They are not saved when using XGBoost's +serializers from its public interface including \code{\link[=xgb.save]{xgb.save()}} and \code{\link[=xgb.save.raw]{xgb.save.raw()}}. -The second ones (R attributes) are not part of the standard XGBoost model structure, and thus are -not saved when using XGBoost's own serializers. These attributes are only used for informational -purposes, such as keeping track of evaluation metrics as the model was fit, or saving the R -call that produced the model, but are otherwise not used for prediction / importance / plotting / etc. 
+The second ones (R attributes) are not part of the standard XGBoost model structure,
+and thus are not saved when using XGBoost's own serializers. These attributes are
+only used for informational purposes, such as keeping track of evaluation metrics as
+the model was fit, or saving the R call that produced the model, but are otherwise
+not used for prediction / importance / plotting / etc.
 These R attributes are only preserved when using R's serializers.
 
-Note that XGBoost models in R starting from version \verb{2.1.0} and onwards, and XGBoost models
-before version \verb{2.1.0}; have a very different R object structure and are incompatible with
-each other. Hence, models that were saved with R serializers live \code{saveRDS} or \code{save} before
-version \verb{2.1.0} will not work with latter \code{xgboost} versions and vice versa. Be aware that
-the structure of R model objects could in theory change again in the future, so XGBoost's serializers
+Note that XGBoost models in R starting from version \verb{2.1.0} and onwards, and
+XGBoost models before version \verb{2.1.0}, have a very different R object structure and
+are incompatible with each other. Hence, models that were saved with R serializers
+like \code{\link[=saveRDS]{saveRDS()}} or \code{\link[=save]{save()}} before version \verb{2.1.0} will not work with later
+\code{xgboost} versions and vice versa. Be aware that the structure of R model objects
+could in theory change again in the future, so XGBoost's serializers
 should be preferred for long-term storage.
 
-Furthermore, note that using the package \code{qs} for serialization will require version 0.26 or
-higher of said package, and will have the same compatibility restrictions as R serializers.
+Furthermore, note that using the package \code{qs} for serialization will require
+version 0.26 or higher of said package, and will have the same compatibility
+restrictions as R serializers.
 }
 \details{
-Use \code{\link{xgb.save}} to save the XGBoost model as a stand-alone file. You may opt into
+Use \code{\link[=xgb.save]{xgb.save()}} to save the XGBoost model as a stand-alone file. You may opt into
 the JSON format by specifying the JSON extension. To read the model back, use
-\code{\link{xgb.load}}.
+\code{\link[=xgb.load]{xgb.load()}}.
 
-Use \code{\link{xgb.save.raw}} to save the XGBoost model as a sequence (vector) of raw bytes
+Use \code{\link[=xgb.save.raw]{xgb.save.raw()}} to save the XGBoost model as a sequence (vector) of raw bytes
 in a future-proof manner. Future releases of XGBoost will be able to read the raw bytes and
-re-construct the corresponding model. To read the model back, use \code{\link{xgb.load.raw}}.
-The \code{\link{xgb.save.raw}} function is useful if you'd like to persist the XGBoost model
+re-construct the corresponding model. To read the model back, use \code{\link[=xgb.load.raw]{xgb.load.raw()}}.
+The \code{\link[=xgb.save.raw]{xgb.save.raw()}} function is useful if you would like to persist the XGBoost model
 as part of another R object.
 
-Use \link{saveRDS} if you require the R-specific attributes that a booster might have, such
+Use \code{\link[=saveRDS]{saveRDS()}} if you require the R-specific attributes that a booster might have, such
 as evaluation logs, but note that future compatibility of such objects is outside XGBoost's
 control as it relies on R's serialization format (see e.g. the details section in
-\link{serialize} and \link{save} from base R).
+\link{serialize} and \code{\link[=save]{save()}} from base R).
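A compact sketch contrasting the two persistence routes described above (assuming `bst` is a fitted booster; the file name is illustrative):

    # XGBoost serializer: portable across XGBoost versions and language bindings
    raw_bytes <- xgb.save.raw(bst, raw_format = "ubj")
    bst_back  <- xgb.load.raw(raw_bytes)

    # R serializer: also keeps R-only attributes (e.g. evaluation logs), but the
    # resulting file is tied to R and to this package's R object structure
    rds_path <- file.path(tempdir(), "model.rds")
    saveRDS(bst, rds_path)
    bst_rds <- readRDS(rds_path)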
For more details and explanation about model persistence and archival, consult the page \url{https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html}. } \examples{ -data(agaricus.train, package='xgboost') -bst <- xgb.train(data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label), - max_depth = 2, eta = 1, nthread = 2, nrounds = 2, - objective = "binary:logistic") +data(agaricus.train, package = "xgboost") + +bst <- xgb.train( + data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label), + max_depth = 2, + eta = 1, + nthread = 2, + nrounds = 2, + objective = "binary:logistic" +) # Save as a stand-alone file; load it with xgb.load() fname <- file.path(tempdir(), "xgb_model.ubj") diff --git a/R-package/man/coef.xgb.Booster.Rd b/R-package/man/coef.xgb.Booster.Rd index 7318077bbb0f..295c766e6413 100644 --- a/R-package/man/coef.xgb.Booster.Rd +++ b/R-package/man/coef.xgb.Booster.Rd @@ -12,11 +12,12 @@ \item{...}{Not used.} } \value{ -The extracted coefficients:\itemize{ -\item If there's only one coefficient per column in the data, will be returned as a +The extracted coefficients: +\itemize{ +\item If there is only one coefficient per column in the data, will be returned as a vector, potentially containing the feature names if available, with the intercept as first column. -\item If there's more than one coefficient per column in the data (e.g. when using +\item If there is more than one coefficient per column in the data (e.g. when using \code{objective="multi:softmax"}), will be returned as a matrix with dimensions equal to \verb{[num_features, num_cols]}, with the intercepts as first row. Note that the column (classes in multi-class classification) dimension will not be named. @@ -33,16 +34,19 @@ coefficients as used by \link{predict.xgb.Booster}. } \description{ Extracts the coefficients from a 'gblinear' booster object, -as produced by \code{xgb.train} when using parameter \code{booster="gblinear"}. +as produced by \code{\link[=xgb.train]{xgb.train()}} when using parameter \code{booster="gblinear"}. Note: this function will error out if passing a booster model which is not of "gblinear" type. } \examples{ library(xgboost) + data(mtcars) + y <- mtcars[, 1] x <- as.matrix(mtcars[, -1]) + dm <- xgb.DMatrix(data = x, label = y, nthread = 1) params <- list(booster = "gblinear", nthread = 1) model <- xgb.train(data = dm, params = params, nrounds = 2) diff --git a/R-package/man/predict.xgb.Booster.Rd b/R-package/man/predict.xgb.Booster.Rd index b7a2effee9f7..79b9a4bcee3f 100644 --- a/R-package/man/predict.xgb.Booster.Rd +++ b/R-package/man/predict.xgb.Booster.Rd @@ -28,35 +28,36 @@ \item{newdata}{Takes \code{data.frame}, \code{matrix}, \code{dgCMatrix}, \code{dgRMatrix}, \code{dsparseVector}, local data file, or \code{xgb.DMatrix}. -\if{html}{\out{
}}\preformatted{ For single-row predictions on sparse data, it's recommended to use CSR format. If passing - a sparse vector, it will take it as a row vector. - - Note that, for repeated predictions on the same data, one might want to create a DMatrix to - pass here instead of passing R types like matrices or data frames, as predictions will be - faster on DMatrix. - - If `newdata` is a `data.frame`, be aware that:\\itemize\{ - \\item Columns will be converted to numeric if they aren't already, which could potentially make - the operation slower than in an equivalent `matrix` object. - \\item The order of the columns must match with that of the data from which the model was fitted - (i.e. columns will not be referenced by their names, just by their order in the data). - \\item If the model was fitted to data with categorical columns, these columns must be of - `factor` type here, and must use the same encoding (i.e. have the same levels). - \\item If `newdata` contains any `factor` columns, they will be converted to base-0 - encoding (same as during DMatrix creation) - hence, one should not pass a `factor` - under a column which during training had a different type. - \} -}\if{html}{\out{
}}} - -\item{missing}{Float value that represents missing values in data (e.g., 0 or some other extreme value). - -\if{html}{\out{
}}\preformatted{ This parameter is not used when `newdata` is an `xgb.DMatrix` - in such cases, should pass - this as an argument to the DMatrix constructor instead. -}\if{html}{\out{
}}} - -\item{outputmargin}{Whether the prediction should be returned in the form of original untransformed -sum of predictions from boosting iterations' results. E.g., setting \code{outputmargin=TRUE} for -logistic regression would return log-odds instead of probabilities.} +For single-row predictions on sparse data, it is recommended to use CSR format. If passing +a sparse vector, it will take it as a row vector. + +Note that, for repeated predictions on the same data, one might want to create a DMatrix to +pass here instead of passing R types like matrices or data frames, as predictions will be +faster on DMatrix. + +If \code{newdata} is a \code{data.frame}, be aware that: +\itemize{ +\item Columns will be converted to numeric if they aren't already, which could potentially make +the operation slower than in an equivalent \code{matrix} object. +\item The order of the columns must match with that of the data from which the model was fitted +(i.e. columns will not be referenced by their names, just by their order in the data). +\item If the model was fitted to data with categorical columns, these columns must be of +\code{factor} type here, and must use the same encoding (i.e. have the same levels). +\item If \code{newdata} contains any \code{factor} columns, they will be converted to base-0 +encoding (same as during DMatrix creation) - hence, one should not pass a \code{factor} +under a column which during training had a different type. +}} + +\item{missing}{Float value that represents missing values in data +(e.g., 0 or some other extreme value). + +This parameter is not used when \code{newdata} is an \code{xgb.DMatrix} - in such cases, +should pass this as an argument to the DMatrix constructor instead.} + +\item{outputmargin}{Whether the prediction should be returned in the form of +original untransformed sum of predictions from boosting iterations' results. +E.g., setting \code{outputmargin = TRUE} for logistic regression would return log-odds +instead of probabilities.} \item{predleaf}{Whether to predict per-tree leaf indices.} @@ -73,60 +74,55 @@ training predicting will perform dropout.} a two-dimensional vector with the start and end numbers in the sequence (same format as R's \code{seq} - i.e. base-1 indexing, and inclusive of both ends). -\if{html}{\out{
}}\preformatted{ For example, passing `c(1,20)` will predict using the first twenty iterations, while passing `c(1,1)` will - predict using only the first one. +For example, passing \code{c(1,20)} will predict using the first twenty iterations, while passing \code{c(1,1)} will +predict using only the first one. - If passing `NULL`, will either stop at the best iteration if the model used early stopping, or use all - of the iterations (rounds) otherwise. +If passing \code{NULL}, will either stop at the best iteration if the model used early stopping, or use all +of the iterations (rounds) otherwise. - If passing "all", will use all of the rounds regardless of whether the model had early stopping or not. -}\if{html}{\out{
}}} +If passing "all", will use all of the rounds regardless of whether the model had early stopping or not.} \item{strict_shape}{Whether to always return an array with the same dimensions for the given prediction mode regardless of the model type - meaning that, for example, both a multi-class and a binary classification model would generate output arrays with the same number of dimensions, with the 'class' dimension having size equal to '1' for the binary model. -\if{html}{\out{
}}\preformatted{ If passing `FALSE` (the default), dimensions will be simplified according to the model type, so that a - binary classification model for example would not have a redundant dimension for 'class'. +If passing \code{FALSE} (the default), dimensions will be simplified according to the model type, so that a +binary classification model for example would not have a redundant dimension for 'class'. - See documentation for the return type for the exact shape of the output arrays for each prediction mode. -}\if{html}{\out{
}}} +See documentation for the return type for the exact shape of the output arrays for each prediction mode.} \item{avoid_transpose}{Whether to output the resulting predictions in the same memory layout in which they are generated by the core XGBoost library, without transposing them to match the expected output shape. -\if{html}{\out{
}}\preformatted{ Internally, XGBoost uses row-major order for the predictions it generates, while R arrays use column-major - order, hence the result needs to be transposed in order to have the expected shape when represented as - an R array or matrix, which might be a slow operation. +Internally, XGBoost uses row-major order for the predictions it generates, while R arrays use column-major +order, hence the result needs to be transposed in order to have the expected shape when represented as +an R array or matrix, which might be a slow operation. - If passing `TRUE`, then the result will have dimensions in reverse order - for example, rows - will be the last dimensions instead of the first dimension. -}\if{html}{\out{
}}} +If passing \code{TRUE}, then the result will have dimensions in reverse order - for example, rows +will be the last dimensions instead of the first dimension.} -\item{validate_features}{When \code{TRUE}, validate that the Booster's and newdata's feature_names -match (only applicable when both \code{object} and \code{newdata} have feature names). +\item{validate_features}{When \code{TRUE}, validate that the Booster's and newdata's +feature_names match (only applicable when both \code{object} and \code{newdata} have feature names). -\if{html}{\out{
}}\preformatted{ If the column names differ and `newdata` is not an `xgb.DMatrix`, will try to reorder - the columns in `newdata` to match with the booster's. +If the column names differ and \code{newdata} is not an \code{xgb.DMatrix}, will try to reorder +the columns in \code{newdata} to match with the booster's. - If the booster has feature types and `newdata` is either an `xgb.DMatrix` or `data.frame`, - will additionally verify that categorical columns are of the correct type in `newdata`, - throwing an error if they do not match. +If the booster has feature types and \code{newdata} is either an \code{xgb.DMatrix} or +\code{data.frame}, will additionally verify that categorical columns are of the +correct type in \code{newdata}, throwing an error if they do not match. - If passing `FALSE`, it is assumed that the feature names and types are the same, - and come in the same order as in the training data. +If passing \code{FALSE}, it is assumed that the feature names and types are the same, +and come in the same order as in the training data. - Note that this check might add some sizable latency to the predictions, so it's - recommended to disable it for performance-sensitive applications. -}\if{html}{\out{
}}} +Note that this check might add some sizable latency to the predictions, so it's +recommended to disable it for performance-sensitive applications.} \item{base_margin}{Base margin used for boosting from existing model. -\if{html}{\out{
}}\preformatted{ Note that, if `newdata` is an `xgb.DMatrix` object, this argument will - be ignored as it needs to be added to the DMatrix instead (e.g. by passing it as - an argument in its constructor, or by calling \link{setinfo.xgb.DMatrix}). -}\if{html}{\out{
}}}
+Note that, if \code{newdata} is an \code{xgb.DMatrix} object, this argument will
+be ignored as it needs to be added to the DMatrix instead (e.g. by passing it as
+an argument in its constructor, or by calling \code{\link[=setinfo.xgb.DMatrix]{setinfo.xgb.DMatrix()}}).}
 
 \item{...}{Not used.}
 }
@@ -173,7 +169,7 @@ example, for \code{predinteraction}, they will be \verb{[nfeats+1, nfeats+1, ngr
 instead of \verb{[nrows, ngroups, nfeats+1, nfeats+1]}.
 }
 \description{
-Predict values on data based on xgboost model.
+Predict values on data based on XGBoost model.
 }
 \details{
 Note that \code{iterationrange} would currently do nothing for predictions from "gblinear",
diff --git a/R-package/man/print.xgb.Booster.Rd b/R-package/man/print.xgb.Booster.Rd
index fc055318cd01..a1e1e7f7226b 100644
--- a/R-package/man/print.xgb.Booster.Rd
+++ b/R-package/man/print.xgb.Booster.Rd
@@ -33,5 +33,4 @@ bst <- xgb.train(
 
 attr(bst, "myattr") <- "memo"
 print(bst)
-
 }
diff --git a/R-package/man/variable.names.xgb.Booster.Rd b/R-package/man/variable.names.xgb.Booster.Rd
index aec09751d8a0..6a27140de074 100644
--- a/R-package/man/variable.names.xgb.Booster.Rd
+++ b/R-package/man/variable.names.xgb.Booster.Rd
@@ -13,8 +13,8 @@
 }
 \description{
 Returns the feature / variable / column names from a fitted
-booster object, which are set automatically during the call to \link{xgb.train}
-from the DMatrix names, or which can be set manually through \link{setinfo}.
+booster object, which are set automatically during the call to \code{\link[=xgb.train]{xgb.train()}}
+from the DMatrix names, or which can be set manually through \code{\link[=setinfo]{setinfo()}}.
 
 If the object doesn't have feature names, will return \code{NULL}.
 
diff --git a/R-package/man/xgb.Callback.Rd b/R-package/man/xgb.Callback.Rd
index b4edcd97842e..b1ddf2ae398b 100644
--- a/R-package/man/xgb.Callback.Rd
+++ b/R-package/man/xgb.Callback.Rd
@@ -53,12 +53,12 @@
 Return values of \code{NULL} will be interpreted as \code{FALSE}.}
 
 \item{f_after_training}{A function that will be executed after training is finished.
 This function can optionally output something non-NULL, which will become part of the R
-attributes of the booster (assuming one passes \code{keep_extra_attributes=TRUE} to \link{xgb.train})
-under the name supplied for parameter \code{cb_name} imn the case of \link{xgb.train}; or a part
-of the named elements in the result of \link{xgb.cv}.}
+attributes of the booster (assuming one passes \code{keep_extra_attributes=TRUE} to \code{\link[=xgb.train]{xgb.train()}})
+under the name supplied for parameter \code{cb_name} in the case of \code{\link[=xgb.train]{xgb.train()}}; or a part
+of the named elements in the result of \code{\link[=xgb.cv]{xgb.cv()}}.}
 }
 \value{
-An \code{xgb.Callback} object, which can be passed to \link{xgb.train} or \link{xgb.cv}.
+An \code{xgb.Callback} object, which can be passed to \code{\link[=xgb.train]{xgb.train()}} or \code{\link[=xgb.cv]{xgb.cv()}}.
 }
 \description{
 Constructor for defining the structure of callback functions that can be executed
@@ -66,8 +66,8 @@ at different stages of model training (before / after training, before / after e
 iteration).
 }
 \details{
-Arguments that will be passed to the supplied functions are as follows:\itemize{
-
+Arguments that will be passed to the supplied functions are as follows:
+\itemize{
 \item env The same environment that is passed under argument \code{env}. It may be
 modified by the functions in order to e.g.
keep tracking of what happens @@ -75,11 +75,10 @@ across iterations or similar. This environment is only used by the functions supplied to the callback, and will not be kept after the model fitting function terminates (see parameter \code{f_after_training}). +\item model The booster object when using \code{\link[=xgb.train]{xgb.train()}}, or the folds when using \code{\link[=xgb.cv]{xgb.cv()}}. -\item model The booster object when using \link{xgb.train}, or the folds when using -\link{xgb.cv}. - -For \link{xgb.cv}, folds are a list with a structure as follows:\itemize{ +For \code{\link[=xgb.cv]{xgb.cv()}}, folds are a list with a structure as follows: +\itemize{ \item \code{dtrain}: The training data for the fold (as an \code{xgb.DMatrix} object). \item \code{bst}: Rhe \code{xgb.Booster} object for the fold. \item \code{evals}: A list containing two DMatrices, with names \code{train} and \code{test} @@ -88,79 +87,71 @@ For \link{xgb.cv}, folds are a list with a structure as follows:\itemize{ from which the \code{test} entry in \code{evals} was obtained. } -This object should \bold{not} be in-place modified in ways that conflict with the +This object should \strong{not} be in-place modified in ways that conflict with the training (e.g. resetting the parameters for a training update in a way that resets the number of rounds to zero in order to overwrite rounds). Note that any R attributes that are assigned to the booster during the callback functions, will not be kept thereafter as the booster object variable is not re-assigned during training. It is however possible to set C-level attributes of the booster through -\link{xgb.attr} or \link{xgb.attributes}, which should remain available for the rest +\code{\link[=xgb.attr]{xgb.attr()}} or \code{\link[=xgb.attributes]{xgb.attributes()}}, which should remain available for the rest of the iterations and after the training is done. For keeping variables across iterations, it's recommended to use \code{env} instead. \item data The data to which the model is being fit, as an \code{xgb.DMatrix} object. -Note that, for \link{xgb.cv}, this will be the full data, while data for the specific +Note that, for \code{\link[=xgb.cv]{xgb.cv()}}, this will be the full data, while data for the specific folds can be found in the \code{model} object. +\item evals The evaluation data, as passed under argument \code{evals} to \code{\link[=xgb.train]{xgb.train()}}. -\item evals The evaluation data, as passed under argument \code{evals} to -\link{xgb.train}. - -For \link{xgb.cv}, this will always be \code{NULL}. - -\item begin_iteration Index of the first boosting iteration that will be executed -(base-1 indexing). +For \code{\link[=xgb.cv]{xgb.cv()}}, this will always be \code{NULL}. +\item begin_iteration Index of the first boosting iteration that will be executed (base-1 indexing). This will typically be '1', but when using training continuation, depending on the parameters for updates, boosting rounds will be continued from where the previous model ended, in which case this will be larger than 1. - \item end_iteration Index of the last boostign iteration that will be executed (base-1 indexing, inclusive of this end). -It should match with argument \code{nrounds} passed to \link{xgb.train} or \link{xgb.cv}. +It should match with argument \code{nrounds} passed to \code{\link[=xgb.train]{xgb.train()}} or \code{\link[=xgb.cv]{xgb.cv()}}. 
Note that boosting might be interrupted before reaching this last iteration, for example by using the early stopping callback \link{xgb.cb.early.stop}. - \item iteration Index of the iteration number that is being executed (first iteration will be the same as parameter \code{begin_iteration}, then next one will add +1, and so on). - \item iter_feval Evaluation metrics for \code{evals} that were supplied, either determined by the objective, or by parameter \code{feval}. -For \link{xgb.train}, this will be a named vector with one entry per element in +For \code{\link[=xgb.train]{xgb.train()}}, this will be a named vector with one entry per element in \code{evals}, where the names are determined as 'evals name' + '-' + 'metric name' - for example, if \code{evals} contains an entry named "tr" and the metric is "rmse", this will be a one-element vector with name "tr-rmse". -For \link{xgb.cv}, this will be a 2d matrix with dimensions \verb{[length(evals), nfolds]}, +For \code{\link[=xgb.cv]{xgb.cv()}}, this will be a 2d matrix with dimensions \verb{[length(evals), nfolds]}, where the row names will follow the same naming logic as the one-dimensional vector -that is passed in \link{xgb.train}. +that is passed in \code{\link[=xgb.train]{xgb.train()}}. Note that, internally, the built-in callbacks such as \link{xgb.cb.print.evaluation} summarize this table by calculating the row-wise means and standard deviations. - \item final_feval The evaluation results after the last boosting round is executed (same format as \code{iter_feval}, and will be the exact same input as passed under \code{iter_feval} to the last round that is executed during model fitting). - \item prev_cb_res Result from a previous run of a callback sharing the same name (as given by parameter \code{cb_name}) when conducting training continuation, if there was any in the booster R attributes. -Some times, one might want to append the new results to the previous one, and this will +Sometimes, one might want to append the new results to the previous one, and this will be done automatically by the built-in callbacks such as \link{xgb.cb.evaluation.log}, which will append the new rows to the previous table. If no such previous callback result is available (which it never will when fitting a model from start instead of updating an existing model), this will be \code{NULL}. -For \link{xgb.cv}, which doesn't support training continuation, this will always be \code{NULL}. +For \code{\link[=xgb.cv]{xgb.cv()}}, which doesn't support training continuation, this will always be \code{NULL}. 
} -The following names (\code{cb_name} values) are reserved for internal callbacks:\itemize{ +The following names (\code{cb_name} values) are reserved for internal callbacks: +\itemize{ \item print_evaluation \item evaluation_log \item reset_parameters @@ -170,7 +161,8 @@ The following names (\code{cb_name} values) are reserved for internal callbacks: \item gblinear_history } -The following names are reserved for other non-callback attributes:\itemize{ +The following names are reserved for other non-callback attributes: +\itemize{ \item names \item class \item call @@ -221,8 +213,10 @@ ssq_callback <- xgb.Callback( ) data(mtcars) + y <- mtcars$mpg x <- as.matrix(mtcars[, -1]) + dm <- xgb.DMatrix(x, label = y, nthread = 1) model <- xgb.train( data = dm, @@ -236,7 +230,8 @@ model <- xgb.train( attributes(model)$ssq } \seealso{ -Built-in callbacks:\itemize{ +Built-in callbacks: +\itemize{ \item \link{xgb.cb.print.evaluation} \item \link{xgb.cb.evaluation.log} \item \link{xgb.cb.reset.parameters} diff --git a/R-package/man/xgb.DMatrix.Rd b/R-package/man/xgb.DMatrix.Rd index 5f764ed45380..aaac3b235825 100644 --- a/R-package/man/xgb.DMatrix.Rd +++ b/R-package/man/xgb.DMatrix.Rd @@ -96,8 +96,7 @@ so it doesn't make sense to assign weights to individual data points.} \item{base_margin}{Base margin used for boosting from existing model. -\if{html}{\out{
}}\preformatted{ In the case of multi-output models, one can also pass multi-dimensional base_margin. -}\if{html}{\out{
}}} +In the case of multi-output models, one can also pass multi-dimensional base_margin.} \item{missing}{A float value to represents missing values in data (not used when creating DMatrix from text files). @@ -109,9 +108,8 @@ values in data.} \item{feature_names}{Set names for features. Overrides column names in data frame and matrix. -\if{html}{\out{
}}\preformatted{ Note: columns are not referenced by name when calling `predict`, so the column order there - must be the same as in the DMatrix construction, regardless of the column names. -}\if{html}{\out{
}}} +Note: columns are not referenced by name when calling \code{predict}, so the column order there +must be the same as in the DMatrix construction, regardless of the column names.} \item{feature_types}{Set types for features. diff --git a/R-package/man/xgb.DataBatch.Rd b/R-package/man/xgb.DataBatch.Rd index 063b82b031cc..63137ffd00d2 100644 --- a/R-package/man/xgb.DataBatch.Rd +++ b/R-package/man/xgb.DataBatch.Rd @@ -45,15 +45,13 @@ so it doesn't make sense to assign weights to individual data points.} \item{base_margin}{Base margin used for boosting from existing model. -\if{html}{\out{
}}\preformatted{ In the case of multi-output models, one can also pass multi-dimensional base_margin. -}\if{html}{\out{
}}} +In the case of multi-output models, one can also pass multi-dimensional base_margin.} \item{feature_names}{Set names for features. Overrides column names in data frame and matrix. -\if{html}{\out{
}}\preformatted{ Note: columns are not referenced by name when calling `predict`, so the column order there - must be the same as in the DMatrix construction, regardless of the column names. -}\if{html}{\out{
}}} +Note: columns are not referenced by name when calling \code{predict}, so the column order there +must be the same as in the DMatrix construction, regardless of the column names.} \item{feature_types}{Set types for features. diff --git a/R-package/man/xgb.attr.Rd b/R-package/man/xgb.attr.Rd index f23e9234018a..1c1607ecaa61 100644 --- a/R-package/man/xgb.attr.Rd +++ b/R-package/man/xgb.attr.Rd @@ -16,7 +16,7 @@ xgb.attributes(object) xgb.attributes(object) <- value } \arguments{ -\item{object}{Object of class \code{xgb.Booster}. \bold{Will be modified in-place} when assigning to it.} +\item{object}{Object of class \code{xgb.Booster}. \strong{Will be modified in-place} when assigning to it.} \item{name}{A non-empty character string specifying which attribute is to be accessed.} @@ -36,28 +36,28 @@ or \code{NULL} if a model has no stored attributes. } } \description{ -These methods allow to manipulate the key-value attribute strings of an xgboost model. +These methods allow to manipulate the key-value attribute strings of an XGBoost model. } \details{ -The primary purpose of xgboost model attributes is to store some meta data about the model. +The primary purpose of XGBoost model attributes is to store some meta data about the model. Note that they are a separate concept from the object attributes in R. -Specifically, they refer to key-value strings that can be attached to an xgboost model, +Specifically, they refer to key-value strings that can be attached to an XGBoost model, stored together with the model's binary representation, and accessed later (from R or any other interface). In contrast, any R attribute assigned to an R object of \code{xgb.Booster} class -would not be saved by \code{\link[=xgb.save]{xgb.save()}} because an xgboost model is an external memory object +would not be saved by \code{\link[=xgb.save]{xgb.save()}} because an XGBoost model is an external memory object and its serialization is handled externally. -Also, setting an attribute that has the same name as one of xgboost's parameters wouldn't +Also, setting an attribute that has the same name as one of XGBoost's parameters wouldn't change the value of that parameter for a model. Use \code{\link[=xgb.parameters<-]{xgb.parameters<-()}} to set or change model parameters. -The \verb{xgb.attributes<-} setter either updates the existing or adds one or several attributes, +The \code{\link[=xgb.attributes<-]{xgb.attributes<-()}} setter either updates the existing or adds one or several attributes, but it doesn't delete the other existing attributes. Important: since this modifies the booster's C object, semantics for assignment here will differ from R's, as any object reference to the same booster will be modified too, while assignment of R attributes through \verb{attributes(model)$ <- } -will follow the usual copy-on-write R semantics (see \link{xgb.copy.Booster} for an +will follow the usual copy-on-write R semantics (see \code{\link[=xgb.copy.Booster]{xgb.copy.Booster()}} for an example of these behaviors). 
} \examples{ diff --git a/R-package/man/xgb.cb.cv.predict.Rd b/R-package/man/xgb.cb.cv.predict.Rd index d2d9a084be13..aaef924853d9 100644 --- a/R-package/man/xgb.cb.cv.predict.Rd +++ b/R-package/man/xgb.cb.cv.predict.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/callbacks.R \name{xgb.cb.cv.predict} \alias{xgb.cb.cv.predict} -\title{Callback for returning cross-validation based predictions.} +\title{Callback for returning cross-validation based predictions} \usage{ xgb.cb.cv.predict(save_models = FALSE, outputmargin = FALSE) } @@ -13,8 +13,8 @@ xgb.cb.cv.predict(save_models = FALSE, outputmargin = FALSE) parameter to \link{predict.xgb.Booster}).} } \value{ -An \code{xgb.Callback} object, which can be passed to \link{xgb.cv}, -but \bold{not} to \link{xgb.train}. +An \code{xgb.Callback} object, which can be passed to \code{\link[=xgb.cv]{xgb.cv()}}, +but \strong{not} to \code{\link[=xgb.train]{xgb.train()}}. } \description{ This callback function saves predictions for all of the test folds, @@ -24,7 +24,7 @@ and also allows to save the folds' models. Predictions are saved inside of the \code{pred} element, which is either a vector or a matrix, depending on the number of prediction outputs per data row. The order of predictions corresponds to the order of rows in the original dataset. Note that when a custom \code{folds} list is -provided in \code{xgb.cv}, the predictions would only be returned properly when this list is a +provided in \code{\link[=xgb.cv]{xgb.cv()}}, the predictions would only be returned properly when this list is a non-overlapping list of k sets of indices, as in a standard k-fold CV. The predictions would not be meaningful when user-provided folds have overlapping indices as in, e.g., random sampling splits. When some of the indices in the training dataset are not included into user-provided \code{folds}, diff --git a/R-package/man/xgb.cb.early.stop.Rd b/R-package/man/xgb.cb.early.stop.Rd index 2a70f4943d92..2833dbac1630 100644 --- a/R-package/man/xgb.cb.early.stop.Rd +++ b/R-package/man/xgb.cb.early.stop.Rd @@ -23,7 +23,7 @@ stopping. If not set, the last column would be used. Let's say the test data in \code{evals} was labelled as \code{dtest}, and one wants to use the AUC in test data for early stopping regardless of where it is in the \code{evals}, then one of the following would need to be set: -\code{metric_name='dtest-auc'} or \code{metric_name='dtest_auc'}. +\code{metric_name = 'dtest-auc'} or \code{metric_name = 'dtest_auc'}. All dash '-' characters in metric names are considered equivalent to '_'.} \item{verbose}{Whether to print the early stopping information.} @@ -33,7 +33,7 @@ in the resulting object. If passing \code{FALSE}, will only keep the boosting ro up to the detected best iteration, discarding the ones that come after.} } \value{ -An \code{xgb.Callback} object, which can be passed to \link{xgb.train} or \link{xgb.cv}. +An \code{xgb.Callback} object, which can be passed to \code{\link[=xgb.train]{xgb.train()}} or \code{\link[=xgb.cv]{xgb.cv()}}. } \description{ This callback function determines the condition for early stopping. @@ -49,7 +49,7 @@ The same values are also stored as R attributes as a result of the callback, plu attribute \code{stopped_by_max_rounds} which indicates whether an early stopping by the \code{stopping_rounds} condition occurred. 
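A rough usage sketch of this callback (assuming \code{dtrain} and \code{dvalid} are pre-built \code{xgb.DMatrix} objects; the parameter values are illustrative only):

bst <- xgb.train(
  params = list(objective = "binary:logistic", nthread = 1),
  data = dtrain,
  nrounds = 100,
  evals = list(dtest = dvalid),
  callbacks = list(xgb.cb.early.stop(stopping_rounds = 3))
)

# Best iteration as stored by the callback (R attribute) and as the
# C-level booster attribute; see the note on indexing just below.
attributes(bst)$best_iteration
xgb.attr(bst, "best_iteration")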
Note that the \code{best_iteration} that is stored under R attributes will follow
base-1 indexing, so it will be larger by '1' than the C-level 'best_iteration' that is accessed
-through \link{xgb.attr} or \link{xgb.attributes}.
+through \code{\link[=xgb.attr]{xgb.attr()}} or \code{\link[=xgb.attributes]{xgb.attributes()}}.
 
 At least one dataset is required in \code{evals} for early stopping to work.
 }
diff --git a/R-package/man/xgb.cb.evaluation.log.Rd b/R-package/man/xgb.cb.evaluation.log.Rd
index 4cc6ef636c66..037dc7cbc2f4 100644
--- a/R-package/man/xgb.cb.evaluation.log.Rd
+++ b/R-package/man/xgb.cb.evaluation.log.Rd
@@ -7,14 +7,14 @@
 xgb.cb.evaluation.log()
 }
 \value{
-An \code{xgb.Callback} object, which can be passed to \link{xgb.train} or \link{xgb.cv}.
+An \code{xgb.Callback} object, which can be passed to \code{\link[=xgb.train]{xgb.train()}} or \code{\link[=xgb.cv]{xgb.cv()}}.
 }
 \description{
 Callback for logging the evaluation history
 }
 \details{
 This callback creates a table with per-iteration evaluation metrics (see parameters
-\code{evals} and \code{feval} in \link{xgb.train}).
+\code{evals} and \code{feval} in \code{\link[=xgb.train]{xgb.train()}}).
 
 Note: in the column names of the final data.table, the dash '-' character is replaced
 with the underscore '_' in order to make the column names more like regular R identifiers.
diff --git a/R-package/man/xgb.cb.gblinear.history.Rd b/R-package/man/xgb.cb.gblinear.history.Rd
index 0ebaa4685030..c2b7709aac62 100644
--- a/R-package/man/xgb.cb.gblinear.history.Rd
+++ b/R-package/man/xgb.cb.gblinear.history.Rd
@@ -7,13 +7,13 @@
 xgb.cb.gblinear.history(sparse = FALSE)
 }
 \arguments{
-\item{sparse}{when set to \code{FALSE}/\code{TRUE}, a dense/sparse matrix is used to store the result.
+\item{sparse}{When set to \code{FALSE}/\code{TRUE}, a dense/sparse matrix is used to store the result.
 
 Sparse format is useful when one expects only a subset of coefficients to be non-zero,
 when using the "thrifty" feature selector with fairly small number of top features
 selected per iteration.}
 }
 \value{
-An \code{xgb.Callback} object, which can be passed to \link{xgb.train} or \link{xgb.cv}.
+An \code{xgb.Callback} object, which can be passed to \code{\link[=xgb.train]{xgb.train()}} or \code{\link[=xgb.cv]{xgb.cv()}}.
 }
 \description{
 Callback for collecting coefficients history of a gblinear booster
@@ -37,11 +37,10 @@ will have column names matching with the feature names, otherwise (when there's
 one coefficient per feature) the names will be composed as 'column name' + ':' + 'class index'
 (so e.g. column 'c1' for class '0' will be named 'c1:0').
 
-With \code{xgb.train}, the output is either a dense or a sparse matrix.
-With with \code{xgb.cv}, it is a list (one element per each fold) of such
-matrices.
+With \code{\link[=xgb.train]{xgb.train()}}, the output is either a dense or a sparse matrix.
+With \code{\link[=xgb.cv]{xgb.cv()}}, it is a list (one element per fold) of such matrices.
 
-Function \link{xgb.gblinear.history} function provides an easy way to retrieve the
+Function \link{xgb.gblinear.history} provides an easy way to retrieve the
 outputs from this callback.
} \examples{ @@ -53,57 +52,109 @@ data.table::setDTthreads(nthread) # In the iris dataset, it is hard to linearly separate Versicolor class from the rest # without considering the 2nd order interactions: -x <- model.matrix(Species ~ .^2, iris)[,-1] +x <- model.matrix(Species ~ .^2, iris)[, -1] colnames(x) -dtrain <- xgb.DMatrix(scale(x), label = 1*(iris$Species == "versicolor"), nthread = nthread) -param <- list(booster = "gblinear", objective = "reg:logistic", eval_metric = "auc", - lambda = 0.0003, alpha = 0.0003, nthread = nthread) +dtrain <- xgb.DMatrix( + scale(x), + label = 1 * (iris$Species == "versicolor"), + nthread = nthread +) +param <- list( + booster = "gblinear", + objective = "reg:logistic", + eval_metric = "auc", + lambda = 0.0003, + alpha = 0.0003, + nthread = nthread +) + # For 'shotgun', which is a default linear updater, using high eta values may result in # unstable behaviour in some datasets. With this simple dataset, however, the high learning # rate does not break the convergence, but allows us to illustrate the typical pattern of # "stochastic explosion" behaviour of this lock-free algorithm at early boosting iterations. -bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 200, eta = 1., - callbacks = list(xgb.cb.gblinear.history())) +bst <- xgb.train( + param, + dtrain, + list(tr = dtrain), + nrounds = 200, + eta = 1., + callbacks = list(xgb.cb.gblinear.history()) +) + # Extract the coefficients' path and plot them vs boosting iteration number: coef_path <- xgb.gblinear.history(bst) -matplot(coef_path, type = 'l') +matplot(coef_path, type = "l") # With the deterministic coordinate descent updater, it is safer to use higher learning rates. # Will try the classical componentwise boosting which selects a single best feature per round: -bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 200, eta = 0.8, - updater = 'coord_descent', feature_selector = 'thrifty', top_k = 1, - callbacks = list(xgb.cb.gblinear.history())) -matplot(xgb.gblinear.history(bst), type = 'l') +bst <- xgb.train( + param, + dtrain, + list(tr = dtrain), + nrounds = 200, + eta = 0.8, + updater = "coord_descent", + feature_selector = "thrifty", + top_k = 1, + callbacks = list(xgb.cb.gblinear.history()) +) +matplot(xgb.gblinear.history(bst), type = "l") # Componentwise boosting is known to have similar effect to Lasso regularization. # Try experimenting with various values of top_k, eta, nrounds, # as well as different feature_selectors. 
# For xgb.cv: -bst <- xgb.cv(param, dtrain, nfold = 5, nrounds = 100, eta = 0.8, - callbacks = list(xgb.cb.gblinear.history())) +bst <- xgb.cv( + param, + dtrain, + nfold = 5, + nrounds = 100, + eta = 0.8, + callbacks = list(xgb.cb.gblinear.history()) +) # coefficients in the CV fold #3 -matplot(xgb.gblinear.history(bst)[[3]], type = 'l') +matplot(xgb.gblinear.history(bst)[[3]], type = "l") #### Multiclass classification: -# dtrain <- xgb.DMatrix(scale(x), label = as.numeric(iris$Species) - 1, nthread = nthread) -param <- list(booster = "gblinear", objective = "multi:softprob", num_class = 3, - lambda = 0.0003, alpha = 0.0003, nthread = nthread) + +param <- list( + booster = "gblinear", + objective = "multi:softprob", + num_class = 3, + lambda = 0.0003, + alpha = 0.0003, + nthread = nthread +) + # For the default linear updater 'shotgun' it sometimes is helpful # to use smaller eta to reduce instability -bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 50, eta = 0.5, - callbacks = list(xgb.cb.gblinear.history())) +bst <- xgb.train( + param, + dtrain, + list(tr = dtrain), + nrounds = 50, + eta = 0.5, + callbacks = list(xgb.cb.gblinear.history()) +) + # Will plot the coefficient paths separately for each class: -matplot(xgb.gblinear.history(bst, class_index = 0), type = 'l') -matplot(xgb.gblinear.history(bst, class_index = 1), type = 'l') -matplot(xgb.gblinear.history(bst, class_index = 2), type = 'l') +matplot(xgb.gblinear.history(bst, class_index = 0), type = "l") +matplot(xgb.gblinear.history(bst, class_index = 1), type = "l") +matplot(xgb.gblinear.history(bst, class_index = 2), type = "l") # CV: -bst <- xgb.cv(param, dtrain, nfold = 5, nrounds = 70, eta = 0.5, - callbacks = list(xgb.cb.gblinear.history(FALSE))) +bst <- xgb.cv( + param, + dtrain, + nfold = 5, + nrounds = 70, + eta = 0.5, + callbacks = list(xgb.cb.gblinear.history(FALSE)) +) # 1st fold of 1st class -matplot(xgb.gblinear.history(bst, class_index = 0)[[1]], type = 'l') +matplot(xgb.gblinear.history(bst, class_index = 0)[[1]], type = "l") } \seealso{ diff --git a/R-package/man/xgb.cb.print.evaluation.Rd b/R-package/man/xgb.cb.print.evaluation.Rd index c4f2e6991278..83c1f42164e4 100644 --- a/R-package/man/xgb.cb.print.evaluation.Rd +++ b/R-package/man/xgb.cb.print.evaluation.Rd @@ -7,12 +7,12 @@ xgb.cb.print.evaluation(period = 1, showsd = TRUE) } \arguments{ -\item{period}{results would be printed every number of periods} +\item{period}{Results would be printed every number of periods.} -\item{showsd}{whether standard deviations should be printed (when available)} +\item{showsd}{Whether standard deviations should be printed (when available).} } \value{ -An \code{xgb.Callback} object, which can be passed to \link{xgb.train} or \link{xgb.cv}. +An \code{xgb.Callback} object, which can be passed to \code{\link[=xgb.train]{xgb.train()}} or \code{\link[=xgb.cv]{xgb.cv()}}. } \description{ The callback function prints the result of evaluation at every \code{period} iterations. 
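A brief sketch of how this callback might be passed to the training function (assuming \code{dtrain} and \code{dvalid} are pre-built \code{xgb.DMatrix} objects; the parameter values are illustrative only):

# Print the evaluation result only every 10 rounds instead of every round.
bst <- xgb.train(
  params = list(objective = "reg:squarederror", nthread = 1),
  data = dtrain,
  nrounds = 50,
  evals = list(eval = dvalid),
  callbacks = list(xgb.cb.print.evaluation(period = 10))
)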
diff --git a/R-package/man/xgb.cb.reset.parameters.Rd b/R-package/man/xgb.cb.reset.parameters.Rd index c7e8638178ac..7805ec3207be 100644 --- a/R-package/man/xgb.cb.reset.parameters.Rd +++ b/R-package/man/xgb.cb.reset.parameters.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/callbacks.R \name{xgb.cb.reset.parameters} \alias{xgb.cb.reset.parameters} -\title{Callback for resetting the booster's parameters at each iteration.} +\title{Callback for resetting booster parameters at each iteration} \usage{ xgb.cb.reset.parameters(new_params) } \arguments{ -\item{new_params}{a list where each element corresponds to a parameter that needs to be reset. +\item{new_params}{List of parameters needed to be reset. Each element's value must be either a vector of values of length \code{nrounds} to be set at each iteration, or a function of two parameters \code{learning_rates(iteration, nrounds)} @@ -15,10 +15,10 @@ which returns a new parameter value by using the current iteration number and the total number of boosting rounds.} } \value{ -An \code{xgb.Callback} object, which can be passed to \link{xgb.train} or \link{xgb.cv}. +An \code{xgb.Callback} object, which can be passed to \code{\link[=xgb.train]{xgb.train()}} or \code{\link[=xgb.cv]{xgb.cv()}}. } \description{ -Callback for resetting the booster's parameters at each iteration. +Callback for resetting booster parameters at each iteration } \details{ Note that when training is resumed from some previous model, and a function is used to diff --git a/R-package/man/xgb.cb.save.model.Rd b/R-package/man/xgb.cb.save.model.Rd index 8ddba2f1a587..f4771541866c 100644 --- a/R-package/man/xgb.cb.save.model.Rd +++ b/R-package/man/xgb.cb.save.model.Rd @@ -2,23 +2,22 @@ % Please edit documentation in R/callbacks.R \name{xgb.cb.save.model} \alias{xgb.cb.save.model} -\title{Callback for saving a model file.} +\title{Callback for saving a model file} \usage{ xgb.cb.save.model(save_period = 0, save_name = "xgboost.ubj") } \arguments{ -\item{save_period}{Save the model to disk after every -\code{save_period} iterations; 0 means save the model at the end.} +\item{save_period}{Save the model to disk after every \code{save_period} iterations; +0 means save the model at the end.} \item{save_name}{The name or path for the saved model file. -It can contain a \code{\link[base]{sprintf}} formatting specifier -to include the integer iteration number in the file name. -E.g., with \code{save_name} = 'xgboost_\%04d.model', +It can contain a \code{\link[=sprintf]{sprintf()}} formatting specifier to include the integer +iteration number in the file name. E.g., with \code{save_name = 'xgboost_\%04d.model'}, the file saved at iteration 50 would be named "xgboost_0050.model".} } \value{ -An \code{xgb.Callback} object, which can be passed to \link{xgb.train}, -but \bold{not} to \link{xgb.cv}. +An \code{xgb.Callback} object, which can be passed to \code{\link[=xgb.train]{xgb.train()}}, +but \strong{not} to \code{\link[=xgb.cv]{xgb.cv()}}. } \description{ This callback function allows to save an xgb-model file, either periodically diff --git a/R-package/man/xgb.config.Rd b/R-package/man/xgb.config.Rd index dbad1d8cf043..5ac223b4d8a8 100644 --- a/R-package/man/xgb.config.Rd +++ b/R-package/man/xgb.config.Rd @@ -10,12 +10,12 @@ xgb.config(object) xgb.config(object) <- value } \arguments{ -\item{object}{Object of class \code{xgb.Booster}. 
\bold{Will be modified in-place} when assigning to it.} +\item{object}{Object of class \code{xgb.Booster}.\strong{Will be modified in-place} when assigning to it.} -\item{value}{An R list.} +\item{value}{A list.} } \value{ -\code{xgb.config} will return the parameters as an R list. +Parameters as a list. } \description{ Accessors for model parameters as JSON string @@ -25,7 +25,7 @@ Note that assignment is performed in-place on the booster C object, which unlike of R attributes, doesn't follow typical copy-on-write semantics for assignment - i.e. all references to the same booster will also get updated. -See \link{xgb.copy.Booster} for an example of this behavior. +See \code{\link[=xgb.copy.Booster]{xgb.copy.Booster()}} for an example of this behavior. } \examples{ data(agaricus.train, package = "xgboost") diff --git a/R-package/man/xgb.copy.Booster.Rd b/R-package/man/xgb.copy.Booster.Rd index 8426d039e5a0..2bab71cd2a52 100644 --- a/R-package/man/xgb.copy.Booster.Rd +++ b/R-package/man/xgb.copy.Booster.Rd @@ -16,14 +16,18 @@ functions called on that copy will not affect the \code{model} variable. \description{ Creates a deep copy of an 'xgb.Booster' object, such that the C object pointer contained will be a different object, and hence functions -like \link{xgb.attr} will not affect the object from which it was copied. +like \code{\link[=xgb.attr]{xgb.attr()}} will not affect the object from which it was copied. } \examples{ library(xgboost) + data(mtcars) + y <- mtcars$mpg x <- mtcars[, -1] + dm <- xgb.DMatrix(x, label = y, nthread = 1) + model <- xgb.train( data = dm, params = list(nthread = 1), diff --git a/R-package/man/xgb.create.features.Rd b/R-package/man/xgb.create.features.Rd index 995c27459a5e..282593ebd000 100644 --- a/R-package/man/xgb.create.features.Rd +++ b/R-package/man/xgb.create.features.Rd @@ -7,17 +7,18 @@ xgb.create.features(model, data, ...) } \arguments{ -\item{model}{decision tree boosting model learned on the original data} +\item{model}{Decision tree boosting model learned on the original data.} -\item{data}{original data (usually provided as a \code{dgCMatrix} matrix)} +\item{data}{Original data (usually provided as a \code{dgCMatrix} matrix).} -\item{...}{currently not used} +\item{...}{Currently not used.} } \value{ -\code{dgCMatrix} matrix including both the original data and the new features. +A \code{dgCMatrix} matrix including both the original data and the new features. } \description{ -May improve the learning by adding new features to the training data based on the decision trees from a previously learned model. +May improve the learning by adding new features to the training data based on the +decision trees from a previously learned model. } \details{ This is the function inspired from the paragraph 3.1 of the paper: @@ -44,11 +45,11 @@ For example, consider the boosted tree model in Figure 1 with 2 subtrees, where the first subtree has 3 leafs and the second 2 leafs. If an instance ends up in leaf 2 in the first subtree and leaf 1 in second subtree, the overall input to the linear classifier will -be the binary vector \code{[0, 1, 0, 1, 0]}, where the first 3 entries +be the binary vector \verb{[0, 1, 0, 1, 0]}, where the first 3 entries correspond to the leaves of the first subtree and last 2 to those of the second subtree. -\link{...} +... We can understand boosted decision tree based transformation as a supervised feature encoding that @@ -57,15 +58,16 @@ vector. A traversal from root node to a leaf node represents a rule on certain features." 
} \examples{ -data(agaricus.train, package='xgboost') -data(agaricus.test, package='xgboost') +data(agaricus.train, package = "xgboost") +data(agaricus.test, package = "xgboost") + dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2)) dtest <- with(agaricus.test, xgb.DMatrix(data, label = label, nthread = 2)) -param <- list(max_depth=2, eta=1, objective='binary:logistic') +param <- list(max_depth = 2, eta = 1, objective = 'binary:logistic') nrounds = 4 -bst = xgb.train(params = param, data = dtrain, nrounds = nrounds, nthread = 2) +bst <- xgb.train(params = param, data = dtrain, nrounds = nrounds, nthread = 2) # Model accuracy without new features accuracy.before <- sum((predict(bst, agaricus.test$data) >= 0.5) == agaricus.test$label) / diff --git a/R-package/man/xgb.dump.Rd b/R-package/man/xgb.dump.Rd index 199ede1583f8..8bd8e5d6c0d6 100644 --- a/R-package/man/xgb.dump.Rd +++ b/R-package/man/xgb.dump.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/xgb.dump.R \name{xgb.dump} \alias{xgb.dump} -\title{Dump an xgboost model in text format.} +\title{Dump an XGBoost model in text format.} \usage{ xgb.dump( model, @@ -14,43 +14,51 @@ xgb.dump( ) } \arguments{ -\item{model}{the model object.} +\item{model}{The model object.} -\item{fname}{the name of the text file where to save the model text dump. -If not provided or set to \code{NULL}, the model is returned as a \code{character} vector.} +\item{fname}{The name of the text file where to save the model text dump. +If not provided or set to \code{NULL}, the model is returned as a character vector.} -\item{fmap}{feature map file representing feature types. -See demo/ for walkthrough example in R, and -\url{https://github.com/dmlc/xgboost/blob/master/demo/data/featmap.txt} -for example Format.} +\item{fmap}{Feature map file representing feature types. See demo/ for a walkthrough +example in R, and \url{https://github.com/dmlc/xgboost/blob/master/demo/data/featmap.txt} +to see an example of the value.} -\item{with_stats}{whether to dump some additional statistics about the splits. +\item{with_stats}{Whether to dump some additional statistics about the splits. When this option is on, the model dump contains two additional values: gain is the approximate loss function gain we get in each split; cover is the sum of second order gradient in each node.} -\item{dump_format}{either 'text', 'json', or 'dot' (graphviz) format could be specified. +\item{dump_format}{Either 'text', 'json', or 'dot' (graphviz) format could be specified. Format 'dot' for a single tree can be passed directly to packages that consume this format -for graph visualization, such as function \code{\link[DiagrammeR:grViz]{DiagrammeR::grViz()}}} +for graph visualization, such as function \code{DiagrammeR::grViz()}} -\item{...}{currently not used} +\item{...}{Currently not used} } \value{ If fname is not provided or set to \code{NULL} the function will return the model -as a \code{character} vector. Otherwise it will return \code{TRUE}. +as a character vector. Otherwise it will return \code{TRUE}. } \description{ -Dump an xgboost model in text format. +Dump an XGBoost model in text format. 
} \examples{ \dontshow{RhpcBLASctl::omp_set_num_threads(1)} -data(agaricus.train, package='xgboost') -data(agaricus.test, package='xgboost') +data(agaricus.train, package = "xgboost") +data(agaricus.test, package = "xgboost") + train <- agaricus.train test <- agaricus.test -bst <- xgb.train(data = xgb.DMatrix(train$data, label = train$label), max_depth = 2, - eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic") + +bst <- xgb.train( + data = xgb.DMatrix(train$data, label = train$label), + max_depth = 2, + eta = 1, + nthread = 2, + nrounds = 2, + objective = "binary:logistic" +) + # save the model in file 'xgb.model.dump' dump_path = file.path(tempdir(), 'model.dump') xgb.dump(bst, dump_path, with_stats = TRUE) @@ -59,7 +67,7 @@ xgb.dump(bst, dump_path, with_stats = TRUE) print(xgb.dump(bst, with_stats = TRUE)) # print in JSON format: -cat(xgb.dump(bst, with_stats = TRUE, dump_format='json')) +cat(xgb.dump(bst, with_stats = TRUE, dump_format = "json")) # plot first tree leveraging the 'dot' format if (requireNamespace('DiagrammeR', quietly = TRUE)) { diff --git a/R-package/man/xgb.gblinear.history.Rd b/R-package/man/xgb.gblinear.history.Rd index 25aef7163e40..776c52e2c205 100644 --- a/R-package/man/xgb.gblinear.history.Rd +++ b/R-package/man/xgb.gblinear.history.Rd @@ -2,24 +2,24 @@ % Please edit documentation in R/callbacks.R \name{xgb.gblinear.history} \alias{xgb.gblinear.history} -\title{Extract gblinear coefficients history.} +\title{Extract gblinear coefficients history} \usage{ xgb.gblinear.history(model, class_index = NULL) } \arguments{ -\item{model}{either an \code{xgb.Booster} or a result of \code{xgb.cv()}, trained -using the \link{xgb.cb.gblinear.history} callback, but \bold{not} a booster -loaded from \link{xgb.load} or \link{xgb.load.raw}.} +\item{model}{Either an \code{xgb.Booster} or a result of \code{\link[=xgb.cv]{xgb.cv()}}, trained +using the \link{xgb.cb.gblinear.history} callback, but \strong{not} a booster +loaded from \code{\link[=xgb.load]{xgb.load()}} or \code{\link[=xgb.load.raw]{xgb.load.raw()}}.} \item{class_index}{zero-based class index to extract the coefficients for only that -specific class in a multinomial multiclass model. When it is NULL, all the +specific class in a multinomial multiclass model. When it is \code{NULL}, all the coefficients are returned. Has no effect in non-multiclass models.} } \value{ -For an \link{xgb.train} result, a matrix (either dense or sparse) with the columns +For an \code{\link[=xgb.train]{xgb.train()}} result, a matrix (either dense or sparse) with the columns corresponding to iteration's coefficients and the rows corresponding to boosting iterations. -For an \link{xgb.cv} result, a list of such matrices is returned with the elements +For an \code{\link[=xgb.cv]{xgb.cv()}} result, a list of such matrices is returned with the elements corresponding to CV folds. When there is more than one coefficient per feature (e.g. multi-class classification) @@ -31,15 +31,15 @@ coefficients N+1 through 2N for the second class, and so on). \description{ A helper function to extract the matrix of linear coefficients' history from a gblinear model created while using the \link{xgb.cb.gblinear.history} -callback (which must be added manually as by default it's not used). +callback (which must be added manually as by default it is not used). 
} \details{ Note that this is an R-specific function that relies on R attributes that -are not saved when using xgboost's own serialization functions like \link{xgb.load} -or \link{xgb.load.raw}. +are not saved when using XGBoost's own serialization functions like \code{\link[=xgb.load]{xgb.load()}} +or \code{\link[=xgb.load.raw]{xgb.load.raw()}}. In order for a serialized model to be accepted by this function, one must use R -serializers such as \link{saveRDS}. +serializers such as \code{\link[=saveRDS]{saveRDS()}}. } \seealso{ \link{xgb.cb.gblinear.history}, \link{coef.xgb.Booster}. diff --git a/R-package/man/xgb.get.num.boosted.rounds.Rd b/R-package/man/xgb.get.num.boosted.rounds.Rd index 551dc4a8376d..ba1c5e11a96b 100644 --- a/R-package/man/xgb.get.num.boosted.rounds.Rd +++ b/R-package/man/xgb.get.num.boosted.rounds.Rd @@ -13,13 +13,13 @@ xgb.get.num.boosted.rounds(model) \item{model, x}{A fitted \code{xgb.Booster} model.} } \value{ -The number of rounds saved in the model, as an integer. +The number of rounds saved in the model as an integer. } \description{ Get number of boosting in a fitted booster } \details{ Note that setting booster parameters related to training -continuation / updates through \link{xgb.parameters<-} will reset the +continuation / updates through \code{\link[=xgb.parameters<-]{xgb.parameters<-()}} will reset the number of rounds to zero. } diff --git a/R-package/man/xgb.is.same.Booster.Rd b/R-package/man/xgb.is.same.Booster.Rd index d2a2f4d179d8..4ef0182077ca 100644 --- a/R-package/man/xgb.is.same.Booster.Rd +++ b/R-package/man/xgb.is.same.Booster.Rd @@ -12,30 +12,33 @@ xgb.is.same.Booster(obj1, obj2) \item{obj2}{Booster model to compare with \code{obj1}.} } \value{ -Either \code{TRUE} or \code{FALSE} according to whether the two boosters share -the underlying C object. +Either \code{TRUE} or \code{FALSE} according to whether the two boosters share the +underlying C object. } \description{ Checks whether two booster objects refer to the same underlying C object. } \details{ -As booster objects (as returned by e.g. \link{xgb.train}) contain an R 'externalptr' +As booster objects (as returned by e.g. \code{\link[=xgb.train]{xgb.train()}}) contain an R 'externalptr' object, they don't follow typical copy-on-write semantics of other R objects - that is, if one assigns a booster to a different variable and modifies that new variable through in-place -methods like \link{xgb.attr<-}, the modification will be applied to both the old and the new +methods like \code{\link[=xgb.attr<-]{xgb.attr<-()}}, the modification will be applied to both the old and the new variable, unlike typical R assignments which would only modify the latter. This function allows checking whether two booster objects share the same 'externalptr', regardless of the R attributes that they might have. In order to duplicate a booster in such a way that the copy wouldn't share the same -'externalptr', one can use function \link{xgb.copy.Booster}. +'externalptr', one can use function \code{\link[=xgb.copy.Booster]{xgb.copy.Booster()}}. 
} \examples{ library(xgboost) + data(mtcars) + y <- mtcars$mpg x <- as.matrix(mtcars[, -1]) + model <- xgb.train( params = list(nthread = 1), data = xgb.DMatrix(x, label = y, nthread = 1), @@ -55,5 +58,5 @@ xgb.attr(model, "my_attr") # gets modified xgb.attr(model_deep_copy, "my_attr") # doesn't get modified } \seealso{ -\link{xgb.copy.Booster} +\code{\link[=xgb.copy.Booster]{xgb.copy.Booster()}} } diff --git a/R-package/man/xgb.load.Rd b/R-package/man/xgb.load.Rd index e18a900e3f13..bb898d6f5bfb 100644 --- a/R-package/man/xgb.load.Rd +++ b/R-package/man/xgb.load.Rd @@ -2,32 +2,32 @@ % Please edit documentation in R/xgb.load.R \name{xgb.load} \alias{xgb.load} -\title{Load xgboost model from binary file} +\title{Load XGBoost model from binary file} \usage{ xgb.load(modelfile) } \arguments{ -\item{modelfile}{the name of the binary input file.} +\item{modelfile}{The name of the binary input file.} } \value{ An object of \code{xgb.Booster} class. } \description{ -Load xgboost model from the binary model file. +Load XGBoost model from binary model file. } \details{ -The input file is expected to contain a model saved in an xgboost model format -using either \code{\link{xgb.save}} or \code{\link{xgb.cb.save.model}} in R, or using some -appropriate methods from other xgboost interfaces. E.g., a model trained in Python and -saved from there in xgboost format, could be loaded from R. +The input file is expected to contain a model saved in an XGBoost model format +using either \code{\link[=xgb.save]{xgb.save()}} in R, or using some +appropriate methods from other XGBoost interfaces. E.g., a model trained in Python and +saved from there in XGBoost format, could be loaded from R. -Note: a model saved as an R-object, has to be loaded using corresponding R-methods, -not \code{xgb.load}. +Note: a model saved as an R object has to be loaded using corresponding R-methods, +not by \code{\link[=xgb.load]{xgb.load()}}. } \examples{ \dontshow{RhpcBLASctl::omp_set_num_threads(1)} -data(agaricus.train, package='xgboost') -data(agaricus.test, package='xgboost') +data(agaricus.train, package = "xgboost") +data(agaricus.test, package = "xgboost") ## Keep the number of threads to 1 for examples nthread <- 1 @@ -35,6 +35,7 @@ data.table::setDTthreads(nthread) train <- agaricus.train test <- agaricus.test + bst <- xgb.train( data = xgb.DMatrix(train$data, label = train$label), max_depth = 2, @@ -49,5 +50,5 @@ xgb.save(bst, fname) bst <- xgb.load(fname) } \seealso{ -\code{\link{xgb.save}} +\code{\link[=xgb.save]{xgb.save()}} } diff --git a/R-package/man/xgb.load.raw.Rd b/R-package/man/xgb.load.raw.Rd index f0248cd9e002..c8543c462d4a 100644 --- a/R-package/man/xgb.load.raw.Rd +++ b/R-package/man/xgb.load.raw.Rd @@ -2,13 +2,13 @@ % Please edit documentation in R/xgb.load.raw.R \name{xgb.load.raw} \alias{xgb.load.raw} -\title{Load serialised xgboost model from R's raw vector} +\title{Load serialised XGBoost model from R's raw vector} \usage{ xgb.load.raw(buffer) } \arguments{ -\item{buffer}{the buffer returned by xgb.save.raw} +\item{buffer}{The buffer returned by \code{\link[=xgb.save.raw]{xgb.save.raw()}}.} } \description{ -User can generate raw memory buffer by calling xgb.save.raw +User can generate raw memory buffer by calling \code{\link[=xgb.save.raw]{xgb.save.raw()}}. 
} diff --git a/R-package/man/xgb.model.dt.tree.Rd b/R-package/man/xgb.model.dt.tree.Rd index e9536767986c..97533c883874 100644 --- a/R-package/man/xgb.model.dt.tree.Rd +++ b/R-package/man/xgb.model.dt.tree.Rd @@ -13,15 +13,14 @@ xgb.model.dt.tree( ) } \arguments{ -\item{model}{Object of class \code{xgb.Booster}. If it contains feature names (they can be set through -\link{setinfo}), they will be used in the output from this function.} +\item{model}{Object of class \code{xgb.Booster}. If it contains feature names (they can +be set through \code{\link[=setinfo]{setinfo()}}), they will be used in the output from this function.} \item{text}{Character vector previously generated by the function \code{\link[=xgb.dump]{xgb.dump()}} (called with parameter \code{with_stats = TRUE}). \code{text} takes precedence over \code{model}.} -\item{trees}{An integer vector of tree indices that should be used. -The default (\code{NULL}) uses all trees. -Useful, e.g., in multiclass classification to get only +\item{trees}{An integer vector of tree indices that should be used. The default +(\code{NULL}) uses all trees. Useful, e.g., in multiclass classification to get only the trees of one class. \emph{Important}: the tree index in XGBoost models is zero-based (e.g., use \code{trees = 0:4} for the first five trees).} diff --git a/R-package/man/xgb.parameters.Rd b/R-package/man/xgb.parameters.Rd index 82977dc122d4..65426792e0dd 100644 --- a/R-package/man/xgb.parameters.Rd +++ b/R-package/man/xgb.parameters.Rd @@ -7,7 +7,7 @@ xgb.parameters(object) <- value } \arguments{ -\item{object}{Object of class \code{xgb.Booster}. \bold{Will be modified in-place}.} +\item{object}{Object of class \code{xgb.Booster}. \strong{Will be modified in-place}.} \item{value}{A list (or an object coercible to a list) with the names of parameters to set and the elements corresponding to parameter values.} @@ -16,21 +16,22 @@ and the elements corresponding to parameter values.} The same booster \code{object}, which gets modified in-place. } \description{ -Only the setter for xgboost parameters is currently implemented. +Only the setter for XGBoost parameters is currently implemented. } \details{ -Just like \link{xgb.attr}, this function will make in-place modifications +Just like \code{\link[=xgb.attr]{xgb.attr()}}, this function will make in-place modifications on the booster object which do not follow typical R assignment semantics - that is, all references to the same booster will also be updated, unlike assingment of R attributes which follow copy-on-write semantics. -See \link{xgb.copy.Booster} for an example of this behavior. +See \code{\link[=xgb.copy.Booster]{xgb.copy.Booster()}} for an example of this behavior. Be aware that setting parameters of a fitted booster related to training continuation / updates will reset its number of rounds indicator to zero. 
} \examples{ data(agaricus.train, package = "xgboost") + train <- agaricus.train bst <- xgb.train( diff --git a/R-package/man/xgb.plot.deepness.Rd b/R-package/man/xgb.plot.deepness.Rd index 3da8e384e4a1..536bb98c8436 100644 --- a/R-package/man/xgb.plot.deepness.Rd +++ b/R-package/man/xgb.plot.deepness.Rd @@ -18,7 +18,8 @@ xgb.plot.deepness( ) } \arguments{ -\item{model}{Either an \code{xgb.Booster} model, or the "data.table" returned by \code{\link[=xgb.model.dt.tree]{xgb.model.dt.tree()}}.} +\item{model}{Either an \code{xgb.Booster} model, or the "data.table" returned +by \code{\link[=xgb.model.dt.tree]{xgb.model.dt.tree()}}.} \item{which}{Which distribution to plot (see details).} diff --git a/R-package/man/xgb.plot.importance.Rd b/R-package/man/xgb.plot.importance.Rd index a9ebcbd2732a..6b26bec2a86d 100644 --- a/R-package/man/xgb.plot.importance.Rd +++ b/R-package/man/xgb.plot.importance.Rd @@ -67,8 +67,8 @@ Represents previously calculated feature importance as a bar graph. } } \details{ -The graph represents each feature as a horizontal bar of length proportional to the importance of a feature. -Features are sorted by decreasing importance. +The graph represents each feature as a horizontal bar of length proportional to the +importance of a feature. Features are sorted by decreasing importance. It works for both "gblinear" and "gbtree" models. When \code{rel_to_first = FALSE}, the values would be plotted as in \code{importance_matrix}. diff --git a/R-package/man/xgb.plot.multi.trees.Rd b/R-package/man/xgb.plot.multi.trees.Rd index eae84d98edfd..6355d4043e85 100644 --- a/R-package/man/xgb.plot.multi.trees.Rd +++ b/R-package/man/xgb.plot.multi.trees.Rd @@ -21,11 +21,11 @@ xgb.plot.multi.trees( by default 5.} \item{plot_width, plot_height}{Width and height of the graph in pixels. -The values are passed to \code{\link[DiagrammeR:render_graph]{DiagrammeR::render_graph()}}.} +The values are passed to \code{DiagrammeR::render_graph()}.} \item{render}{Should the graph be rendered or not? The default is \code{TRUE}.} -\item{...}{currently not used.} +\item{...}{Currently not used.} } \value{ The value depends on the \code{render} parameter: @@ -35,7 +35,7 @@ class \code{grViz}. Similar to "ggplot" objects, it needs to be printed when not running from the command line. \item If \code{render = FALSE}: Graph object which is of DiagrammeR's class \code{dgr_graph}. This could be useful if one wants to modify some of the graph attributes -before rendering the graph with \code{\link[DiagrammeR:render_graph]{DiagrammeR::render_graph()}}. +before rendering the graph with \code{DiagrammeR::render_graph()}. } } \description{ diff --git a/R-package/man/xgb.plot.shap.Rd b/R-package/man/xgb.plot.shap.Rd index f2d2ea2a05e6..c94fb2bb34c4 100644 --- a/R-package/man/xgb.plot.shap.Rd +++ b/R-package/man/xgb.plot.shap.Rd @@ -38,12 +38,11 @@ xgb.plot.shap( \item{shap_contrib}{Matrix of SHAP contributions of \code{data}. The default (\code{NULL}) computes it from \code{model} and \code{data}.} -\item{features}{Vector of column indices or feature names to plot. -When \code{NULL} (default), the \code{top_n} most important features are selected -by \code{\link[=xgb.importance]{xgb.importance()}}.} +\item{features}{Vector of column indices or feature names to plot. When \code{NULL} +(default), the \code{top_n} most important features are selected by \code{\link[=xgb.importance]{xgb.importance()}}.} \item{top_n}{How many of the most important features (<= 100) should be selected? 
-By default 1 for SHAP dependence and 10 for SHAP summary). +By default 1 for SHAP dependence and 10 for SHAP summary. Only used when \code{features = NULL}.} \item{model}{An \code{xgb.Booster} model. Only required when \code{shap_contrib = NULL} or @@ -173,6 +172,7 @@ mbst <- xgb.train( ) trees0 <- seq(from = 0, by = nclass, length.out = nrounds) col <- rgb(0, 0, 1, 0.5) + xgb.plot.shap( x, model = mbst, diff --git a/R-package/man/xgb.plot.shap.summary.Rd b/R-package/man/xgb.plot.shap.summary.Rd index b0ad20dd790a..7fbca6fd9c10 100644 --- a/R-package/man/xgb.plot.shap.summary.Rd +++ b/R-package/man/xgb.plot.shap.summary.Rd @@ -35,12 +35,11 @@ xgb.plot.shap.summary( \item{shap_contrib}{Matrix of SHAP contributions of \code{data}. The default (\code{NULL}) computes it from \code{model} and \code{data}.} -\item{features}{Vector of column indices or feature names to plot. -When \code{NULL} (default), the \code{top_n} most important features are selected -by \code{\link[=xgb.importance]{xgb.importance()}}.} +\item{features}{Vector of column indices or feature names to plot. When \code{NULL} +(default), the \code{top_n} most important features are selected by \code{\link[=xgb.importance]{xgb.importance()}}.} \item{top_n}{How many of the most important features (<= 100) should be selected? -By default 1 for SHAP dependence and 10 for SHAP summary). +By default 1 for SHAP dependence and 10 for SHAP summary. Only used when \code{features = NULL}.} \item{model}{An \code{xgb.Booster} model. Only required when \code{shap_contrib = NULL} or diff --git a/R-package/man/xgb.plot.tree.Rd b/R-package/man/xgb.plot.tree.Rd index 6064107fc184..993a0d772acc 100644 --- a/R-package/man/xgb.plot.tree.Rd +++ b/R-package/man/xgb.plot.tree.Rd @@ -26,13 +26,14 @@ the trees of one class. \emph{Important}: the tree index in XGBoost models is zero-based (e.g., use \code{trees = 0:2} for the first three trees).} \item{plot_width, plot_height}{Width and height of the graph in pixels. -The values are passed to \code{\link[DiagrammeR:render_graph]{DiagrammeR::render_graph()}}.} +The values are passed to \code{DiagrammeR::render_graph()}.} \item{render}{Should the graph be rendered or not? The default is \code{TRUE}.} \item{show_node_id}{a logical flag for whether to show node id's in the graph.} -\item{style}{Style to use for the plot. Options are:\itemize{ +\item{style}{Style to use for the plot: +\itemize{ \item \code{"xgboost"}: will use the plot style defined in the core XGBoost library, which is shared between different interfaces through the 'dot' format. This style was not available before version 2.1.0 in R. It always plots the trees @@ -42,13 +43,14 @@ the introducition of the standardized style from the core library. It might plot the trees horizontally (from left to right). } -Note that \code{style="xgboost"} is only supported when all of the following conditions are met:\itemize{ +Note that \code{style="xgboost"} is only supported when all of the following conditions are met: +\itemize{ \item Only a single tree is being plotted. \item Node IDs are not added to the graph. \item The graph is being returned as \code{htmlwidget} (\code{render=TRUE}). }} -\item{...}{currently not used.} +\item{...}{Currently not used.} } \value{ The value depends on the \code{render} parameter: @@ -58,7 +60,7 @@ class \code{grViz}. Similar to "ggplot" objects, it needs to be printed when not running from the command line. \item If \code{render = FALSE}: Graph object which is of DiagrammeR's class \code{dgr_graph}. 
This could be useful if one wants to modify some of the graph attributes -before rendering the graph with \code{\link[DiagrammeR:render_graph]{DiagrammeR::render_graph()}}. +before rendering the graph with \code{DiagrammeR::render_graph()}. } } \description{ diff --git a/R-package/man/xgb.save.Rd b/R-package/man/xgb.save.Rd index bcfbd0bb4520..ec9ab63f717c 100644 --- a/R-package/man/xgb.save.Rd +++ b/R-package/man/xgb.save.Rd @@ -2,21 +2,20 @@ % Please edit documentation in R/xgb.save.R \name{xgb.save} \alias{xgb.save} -\title{Save xgboost model to binary file} +\title{Save XGBoost model to binary file} \usage{ xgb.save(model, fname) } \arguments{ \item{model}{Model object of \code{xgb.Booster} class.} -\item{fname}{Name of the file to write. - -Note that the extension of this file name determined the serialization format to use:\itemize{ -\item Extension ".ubj" will use the universal binary JSON format (recommended). +\item{fname}{Name of the file to write. Its extension determines the serialization format: +\itemize{ +\item ".ubj": Use the universal binary JSON format (recommended). This format uses binary types for e.g. floating point numbers, thereby preventing any loss of precision when converting to a human-readable JSON text or similar. -\item Extension ".json" will use plain JSON, which is a human-readable format. -\item Extension ".deprecated" will use a \bold{deprecated} binary format. This format will +\item ".json": Use plain JSON, which is a human-readable format. +\item ".deprecated": Use \strong{deprecated} binary format. This format will not be able to save attributes introduced after v1 of XGBoost, such as the "best_iteration" attribute that boosters might keep, nor feature names or user-specifiec attributes. \item If the format is not specified by passing one of the file extensions above, will @@ -24,26 +23,25 @@ default to UBJ. }} } \description{ -Save xgboost model to a file in binary or JSON format. +Save XGBoost model to a file in binary or JSON format. } \details{ -This methods allows to save a model in an xgboost-internal binary or text format which is universal -among the various xgboost interfaces. In R, the saved model file could be read-in later -using either the \code{\link{xgb.load}} function or the \code{xgb_model} parameter -of \code{\link{xgb.train}}. +This methods allows to save a model in an XGBoost-internal binary or text format which is universal +among the various xgboost interfaces. In R, the saved model file could be read later +using either the \code{\link[=xgb.load]{xgb.load()}} function or the \code{xgb_model} parameter of \code{\link[=xgb.train]{xgb.train()}}. -Note: a model can also be saved as an R-object (e.g., by using \code{\link[base]{readRDS}} -or \code{\link[base]{save}}). However, it would then only be compatible with R, and -corresponding R-methods would need to be used to load it. Moreover, persisting the model with -\code{\link[base]{readRDS}} or \code{\link[base]{save}}) might cause compatibility problems in -future versions of XGBoost. Consult \code{\link{a-compatibility-note-for-saveRDS-save}} to learn -how to persist models in a future-proof way, i.e. to make the model accessible in future +Note: a model can also be saved as an R object (e.g., by using \code{\link[=readRDS]{readRDS()}} +or \code{\link[=save]{save()}}). However, it would then only be compatible with R, and +corresponding R methods would need to be used to load it. 
Moreover, persisting the model with +\code{\link[=readRDS]{readRDS()}} or \code{\link[=save]{save()}} might cause compatibility problems in +future versions of XGBoost. Consult \link{a-compatibility-note-for-saveRDS-save} to learn +how to persist models in a future-proof way, i.e., to make the model accessible in future releases of XGBoost. } \examples{ \dontshow{RhpcBLASctl::omp_set_num_threads(1)} -data(agaricus.train, package='xgboost') -data(agaricus.test, package='xgboost') +data(agaricus.train, package = "xgboost") +data(agaricus.test, package = "xgboost") ## Keep the number of threads to 1 for examples nthread <- 1 @@ -51,6 +49,7 @@ data.table::setDTthreads(nthread) train <- agaricus.train test <- agaricus.test + bst <- xgb.train( data = xgb.DMatrix(train$data, label = train$label), max_depth = 2, @@ -59,10 +58,11 @@ bst <- xgb.train( nrounds = 2, objective = "binary:logistic" ) + fname <- file.path(tempdir(), "xgb.ubj") xgb.save(bst, fname) bst <- xgb.load(fname) } \seealso{ -\code{\link{xgb.load}} +\code{\link[=xgb.load]{xgb.load()}} } diff --git a/R-package/man/xgb.save.raw.Rd b/R-package/man/xgb.save.raw.Rd index 6cdafd3d950c..d5b0d7cc9d6c 100644 --- a/R-package/man/xgb.save.raw.Rd +++ b/R-package/man/xgb.save.raw.Rd @@ -2,37 +2,44 @@ % Please edit documentation in R/xgb.save.raw.R \name{xgb.save.raw} \alias{xgb.save.raw} -\title{Save xgboost model to R's raw vector, -user can call xgb.load.raw to load the model back from raw vector} +\title{Save XGBoost model to R's raw vector} \usage{ xgb.save.raw(model, raw_format = "ubj") } \arguments{ -\item{model}{the model object.} +\item{model}{The model object.} -\item{raw_format}{The format for encoding the booster. Available options are +\item{raw_format}{The format for encoding the booster: \itemize{ -\item \code{json}: Encode the booster into JSON text document. -\item \code{ubj}: Encode the booster into Universal Binary JSON. -\item \code{deprecated}: Encode the booster into old customized binary format. +\item "json": Encode the booster into JSON text document. +\item "ubj": Encode the booster into Universal Binary JSON. +\item "deprecated": Encode the booster into old customized binary format. }} } \description{ -Save xgboost model from xgboost or xgb.train +Save XGBoost model from \code{\link[=xgboost]{xgboost()}} or \code{\link[=xgb.train]{xgb.train()}}. +Call \code{\link[=xgb.load.raw]{xgb.load.raw()}} to load the model back from raw vector. 
} \examples{ \dontshow{RhpcBLASctl::omp_set_num_threads(1)} -data(agaricus.train, package='xgboost') -data(agaricus.test, package='xgboost') +data(agaricus.train, package = "xgboost") +data(agaricus.test, package = "xgboost") -## Keep the number of threads to 2 for examples -nthread <- 2 +## Keep the number of threads to 1 for examples +nthread <- 1 data.table::setDTthreads(nthread) train <- agaricus.train test <- agaricus.test -bst <- xgb.train(data = xgb.DMatrix(train$data, label = train$label), max_depth = 2, - eta = 1, nthread = nthread, nrounds = 2,objective = "binary:logistic") + +bst <- xgb.train( + data = xgb.DMatrix(train$data, label = train$label), + max_depth = 2, + eta = 1, + nthread = nthread, + nrounds = 2, + objective = "binary:logistic" +) raw <- xgb.save.raw(bst) bst <- xgb.load.raw(raw) diff --git a/R-package/man/xgb.slice.Booster.Rd b/R-package/man/xgb.slice.Booster.Rd index 759139901f06..d245ced1bccf 100644 --- a/R-package/man/xgb.slice.Booster.Rd +++ b/R-package/man/xgb.slice.Booster.Rd @@ -18,10 +18,9 @@ xgb.slice.Booster( \item{model, x}{A fitted \code{xgb.Booster} object, which is to be sliced by taking only a subset of its rounds / iterations.} -\item{start}{Start of the slice (base-1 and inclusive, like R's \link{seq}).} - -\item{end}{End of the slice (base-1 and inclusive, like R's \link{seq}). +\item{start}{Start of the slice (base-1 and inclusive, like R's \code{\link[=seq]{seq()}}).} +\item{end}{End of the slice (base-1 and inclusive, like R's \code{\link[=seq]{seq()}}). Passing a value of zero here is equivalent to passing the full number of rounds in the booster object.} @@ -43,8 +42,10 @@ the resulting object. } \examples{ data(mtcars) + y <- mtcars$mpg x <- as.matrix(mtcars[, -1]) + dm <- xgb.DMatrix(x, label = y, nthread = 1) model <- xgb.train(data = dm, params = list(nthread = 1), nrounds = 5) model_slice <- xgb.slice.Booster(model, 1, 3) From 508ac13243b95f7fa8006d244a6c1a93cd099e11 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Wed, 21 Aug 2024 02:50:26 +0800 Subject: [PATCH 11/19] Check cub errors. (#10721) - Make sure cuda error returned by cub scan is caught. - Avoid temporary buffer allocation in thrust device vector. 
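For context, the pattern this patch enforces is the usual two-phase cub call with every
return code routed through an error check. Below is a minimal, self-contained sketch of
that pattern; it is not part of the patch, and the CheckCuda helper merely stands in for
the dh::safe_cuda wrapper used in the hunks that follow.

    #include <cub/cub.cuh>
    #include <cuda_runtime.h>

    #include <cstdio>
    #include <cstdlib>

    // Stand-in for dh::safe_cuda: surface the error instead of silently dropping it.
    static void CheckCuda(cudaError_t status) {
      if (status != cudaSuccess) {
        std::fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
        std::abort();
      }
    }

    int main() {
      int const n = 8;
      int *d_in = nullptr, *d_out = nullptr;
      CheckCuda(cudaMalloc(&d_in, n * sizeof(int)));
      CheckCuda(cudaMalloc(&d_out, n * sizeof(int)));
      CheckCuda(cudaMemset(d_in, 0, n * sizeof(int)));  // zero-fill the demo input

      // Phase 1: ask cub how much temporary storage the scan needs.
      void *d_temp = nullptr;
      std::size_t temp_bytes = 0;
      CheckCuda(cub::DeviceScan::InclusiveSum(d_temp, temp_bytes, d_in, d_out, n));
      CheckCuda(cudaMalloc(&d_temp, temp_bytes));

      // Phase 2: run the scan; this is the return code that must not be ignored.
      CheckCuda(cub::DeviceScan::InclusiveSum(d_temp, temp_bytes, d_in, d_out, n));
      CheckCuda(cudaDeviceSynchronize());

      CheckCuda(cudaFree(d_temp));
      CheckCuda(cudaFree(d_in));
      CheckCuda(cudaFree(d_out));
      return 0;
    }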
--- src/data/ellpack_page.cu | 17 ++++++++++------- src/tree/gpu_hist/evaluate_splits.cu | 4 +++- src/tree/gpu_hist/histogram.cu | 4 ++-- src/tree/gpu_hist/row_partitioner.cuh | 14 ++++++++------ tests/cpp/tree/gpu_hist/test_row_partitioner.cu | 9 ++++----- 5 files changed, 27 insertions(+), 21 deletions(-) diff --git a/src/data/ellpack_page.cu b/src/data/ellpack_page.cu index b7ec72ad393c..bb279b3d8aa1 100644 --- a/src/data/ellpack_page.cu +++ b/src/data/ellpack_page.cu @@ -309,9 +309,9 @@ ELLPACK_BATCH_SPECIALIZE(data::CudfAdapterBatch) ELLPACK_BATCH_SPECIALIZE(data::CupyAdapterBatch) namespace { -void CopyGHistToEllpack(GHistIndexMatrix const& page, common::Span d_row_ptr, - size_t row_stride, common::CompressedByteT* d_compressed_buffer, - size_t null) { +void CopyGHistToEllpack(Context const* ctx, GHistIndexMatrix const& page, + common::Span d_row_ptr, size_t row_stride, + common::CompressedByteT* d_compressed_buffer, size_t null) { dh::device_vector data(page.index.begin(), page.index.end()); auto d_data = dh::ToSpan(data); @@ -323,7 +323,8 @@ void CopyGHistToEllpack(GHistIndexMatrix const& page, common::Span common::CompressedBufferWriter writer{page.cut.TotalBins() + static_cast(1)}; // +1 for null value - dh::LaunchN(row_stride * page.Size(), [=] __device__(size_t idx) mutable { + auto cuctx = ctx->CUDACtx(); + dh::LaunchN(row_stride * page.Size(), cuctx->Stream(), [=] __device__(bst_idx_t idx) mutable { auto ridx = idx / row_stride; auto ifeature = idx % row_stride; @@ -336,7 +337,7 @@ void CopyGHistToEllpack(GHistIndexMatrix const& page, common::Span return; } - size_t offset = 0; + bst_idx_t offset = 0; if (!d_csc_indptr.empty()) { // is dense, ifeature is the actual feature index. offset = d_csc_indptr[ifeature]; @@ -362,7 +363,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag row_stride = *std::max_element(it, it + page.Size()); CHECK(ctx->IsCUDA()); - InitCompressedData(ctx); + this->InitCompressedData(ctx); // copy gidx common::CompressedByteT* d_compressed_buffer = gidx_buffer.data(); @@ -373,7 +374,9 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag auto accessor = this->GetDeviceAccessor(ctx->Device(), ft); auto null = accessor.NullValue(); - CopyGHistToEllpack(page, d_row_ptr, row_stride, d_compressed_buffer, null); + this->monitor_.Start("CopyGHistToEllpack"); + CopyGHistToEllpack(ctx, page, d_row_ptr, row_stride, d_compressed_buffer, null); + this->monitor_.Stop("CopyGHistToEllpack"); } // A functor that copies the data from one EllpackPage to another. 
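The hunk above also moves the kernel launch off the default stream and onto the stream
owned by the CUDA context. As a rough stand-alone illustration of a stream-aware launch
(not XGBoost code; FillKernel and the launch parameters are made up for the example):

    #include <cuda_runtime.h>

    #include <cstddef>

    // Grid-stride kernel: each thread covers indices i, i + stride, i + 2*stride, ...
    __global__ void FillKernel(float *data, std::size_t n, float value) {
      std::size_t stride = static_cast<std::size_t>(gridDim.x) * blockDim.x;
      for (std::size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += stride) {
        data[i] = value;
      }
    }

    int main() {
      std::size_t const n = 1 << 20;
      float *d_data = nullptr;
      cudaStream_t stream;
      cudaStreamCreate(&stream);
      cudaMalloc(&d_data, n * sizeof(float));

      // Launching on an explicit stream keeps this work ordered with everything else
      // queued on that stream instead of serializing on the default stream.
      int const block = 256;
      int const grid = static_cast<int>((n + block - 1) / block);
      FillKernel<<<grid, block, 0, stream>>>(d_data, n, 1.0f);

      cudaStreamSynchronize(stream);
      cudaFree(d_data);
      cudaStreamDestroy(stream);
      return 0;
    }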
diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu index 631f2bd8f27c..0131f166fa18 100644 --- a/src/tree/gpu_hist/evaluate_splits.cu +++ b/src/tree/gpu_hist/evaluate_splits.cu @@ -472,7 +472,9 @@ void GPUHistEvaluator::EvaluateSplits(Context const *ctx, const std::vector inputs = std::vector{input}; + dh::device_vector inputs(1); + dh::safe_cuda(cudaMemcpyAsync(inputs.data().get(), &input, sizeof(input), cudaMemcpyDefault)); + dh::TemporaryArray out_entries(1); this->EvaluateSplits(ctx, {input.nidx}, input.feature_set.size(), dh::ToSpan(inputs), shared_inputs, dh::ToSpan(out_entries)); diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu index e90b6831fcfd..731e71367990 100644 --- a/src/tree/gpu_hist/histogram.cu +++ b/src/tree/gpu_hist/histogram.cu @@ -325,7 +325,7 @@ class DeviceHistogramBuilderImpl { void BuildHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const& matrix, FeatureGroupsAccessor const& feature_groups, common::Span gpair, - common::Span d_ridx, + common::Span d_ridx, common::Span histogram, GradientQuantiser rounding) { CHECK(kernel_); // Otherwise launch blocks such that each block has a minimum amount of work to do @@ -369,7 +369,7 @@ void DeviceHistogramBuilder::BuildHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const& matrix, FeatureGroupsAccessor const& feature_groups, common::Span gpair, - common::Span ridx, + common::Span ridx, common::Span histogram, GradientQuantiser rounding) { this->p_impl_->BuildHistogram(ctx, matrix, feature_groups, gpair, ridx, histogram, rounding); diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index 5f8f0a30b31a..c754f84c06cd 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -132,7 +132,7 @@ void SortPositionBatch(common::Span> d_batch_info, common::Span ridx, common::Span ridx_tmp, common::Span d_counts, bst_idx_t total_rows, OpT op, - dh::device_vector* tmp) { + dh::DeviceUVector* tmp) { dh::LDGIterator> batch_info_itr(d_batch_info.data()); WriteResultsFunctor write_results{batch_info_itr, ridx.data(), ridx_tmp.data(), d_counts.data()}; @@ -150,14 +150,16 @@ void SortPositionBatch(common::Span> d_batch_info, go_left}; }); std::size_t temp_bytes = 0; + // Restriction imposed by cub. + CHECK_LE(total_rows, static_cast(std::numeric_limits::max())); if (tmp->empty()) { - cub::DeviceScan::InclusiveScan(nullptr, temp_bytes, input_iterator, discard_write_iterator, - IndexFlagOp{}, total_rows); + dh::safe_cuda(cub::DeviceScan::InclusiveScan( + nullptr, temp_bytes, input_iterator, discard_write_iterator, IndexFlagOp{}, total_rows)); tmp->resize(temp_bytes); } temp_bytes = tmp->size(); - cub::DeviceScan::InclusiveScan(tmp->data().get(), temp_bytes, input_iterator, - discard_write_iterator, IndexFlagOp{}, total_rows); + dh::safe_cuda(cub::DeviceScan::InclusiveScan(tmp->data(), temp_bytes, input_iterator, + discard_write_iterator, IndexFlagOp{}, total_rows)); constexpr int kBlockSize = 256; @@ -236,7 +238,7 @@ class RowPartitioner { dh::DeviceUVector ridx_; // Staging area for sorting ridx dh::DeviceUVector ridx_tmp_; - dh::device_vector tmp_; + dh::DeviceUVector tmp_; dh::PinnedMemory pinned_; dh::PinnedMemory pinned2_; bst_node_t n_nodes_{0}; // Counter for internal checks. 
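The row-partitioner change above keeps a single scratch buffer alive across calls: it is
sized on the first scan and simply handed back to cub afterwards, and both InclusiveScan
calls now go through the error check. A small sketch of that reuse pattern, using a plain
thrust::device_vector<char> in place of dh::DeviceUVector and the same CheckCuda stand-in
as in the earlier sketch (not part of the patch):

    #include <cub/cub.cuh>
    #include <thrust/device_vector.h>

    #include <cstdio>
    #include <cstdlib>

    static void CheckCuda(cudaError_t status) {
      if (status != cudaSuccess) {
        std::fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
        std::abort();
      }
    }

    // Reuse one scratch allocation across repeated scans; the size query runs only on
    // the first call, so later calls skip the allocation (assuming n never grows).
    void InclusiveSumCached(int const *d_in, int *d_out, int n,
                            thrust::device_vector<char> *scratch) {
      std::size_t bytes = 0;
      if (scratch->empty()) {
        CheckCuda(cub::DeviceScan::InclusiveSum(nullptr, bytes, d_in, d_out, n));
        scratch->resize(bytes);
      }
      bytes = scratch->size();
      CheckCuda(cub::DeviceScan::InclusiveSum(thrust::raw_pointer_cast(scratch->data()),
                                              bytes, d_in, d_out, n));
    }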
diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index 86080a797beb..ec8372815a7c 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -49,9 +49,9 @@ void TestUpdatePositionBatch() { TEST(RowPartitioner, Batch) { TestUpdatePositionBatch(); } void TestSortPositionBatch(const std::vector& ridx_in, const std::vector& segments) { - thrust::device_vector ridx = ridx_in; - thrust::device_vector ridx_tmp(ridx_in.size()); - thrust::device_vector counts(segments.size()); + thrust::device_vector ridx = ridx_in; + thrust::device_vector ridx_tmp(ridx_in.size()); + thrust::device_vector counts(segments.size()); auto op = [=] __device__(auto ridx, int split_index, int data) { return ridx % 2 == 0; }; std::vector op_data(segments.size()); @@ -66,7 +66,7 @@ void TestSortPositionBatch(const std::vector& ridx_in, const std::vector), cudaMemcpyDefault, nullptr)); - dh::device_vector tmp; + dh::DeviceUVector tmp; SortPositionBatch(dh::ToSpan(d_batch_info), dh::ToSpan(ridx), dh::ToSpan(ridx_tmp), dh::ToSpan(counts), total_rows, op, &tmp); @@ -91,5 +91,4 @@ TEST(GpuHist, SortPositionBatch) { TestSortPositionBatch({0, 1, 2, 3, 4, 5}, {{0, 6}}); TestSortPositionBatch({0, 1, 2, 3, 4, 5}, {{3, 6}, {0, 2}}); } - } // namespace xgboost::tree From adf87b27c52fb386fa01e5c417cee7f682d5b68f Mon Sep 17 00:00:00 2001 From: david-cortes Date: Tue, 20 Aug 2024 20:52:50 +0200 Subject: [PATCH 12/19] [doc] Fix tutorial for advanced objectives (#10725) --- doc/tutorials/advanced_custom_obj.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/tutorials/advanced_custom_obj.rst b/doc/tutorials/advanced_custom_obj.rst index b78cdc292eff..5b81b47167e1 100644 --- a/doc/tutorials/advanced_custom_obj.rst +++ b/doc/tutorials/advanced_custom_obj.rst @@ -360,7 +360,7 @@ point, which means it will be a minimum rather than a maximum or saddle point). But note that this is still not usable for XGBoost, since the expected Hessian, just like the true Hessian, has shape ``[nrows, k, k]``, while -XGBoost requires something with shape ``[k, k]``. +XGBoost requires something with shape ``[nrows, k]``. One may use the diagonal of the expected Hessian for each row, but it's possible to do better: one can use instead an upper bound with diagonal From e9f1abc1f0637c13923c8282de54c4298a43c533 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Tue, 20 Aug 2024 23:49:02 +0200 Subject: [PATCH 13/19] [R] keep row names in predictions (#10727) --- R-package/R/xgb.Booster.R | 18 +++++++++ R-package/src/init.c | 2 + R-package/src/xgboost_R.cc | 5 +++ R-package/src/xgboost_R.h | 8 ++++ R-package/tests/testthat/test_basic.R | 50 +++++++++++++++++++++++-- R-package/tests/testthat/test_dmatrix.R | 6 ++- 6 files changed, 84 insertions(+), 5 deletions(-) diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R index 0e6313d88e71..a15285091850 100644 --- a/R-package/R/xgb.Booster.R +++ b/R-package/R/xgb.Booster.R @@ -354,6 +354,11 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA " Should be passed as argument to 'xgb.DMatrix' constructor." 
) } + if (is_dmatrix) { + rnames <- NULL + } else { + rnames <- row.names(newdata) + } use_as_df <- FALSE use_as_dense_matrix <- FALSE @@ -501,6 +506,19 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA .Call(XGSetArrayDimNamesInplace_R, arr, dim_names) } + if (NROW(rnames)) { + if (is.null(dim(arr))) { + .Call(XGSetVectorNamesInplace_R, arr, rnames) + } else { + dim_names <- dimnames(arr) + if (is.null(dim_names)) { + dim_names <- vector(mode = "list", length = length(dim(arr))) + } + dim_names[[length(dim_names)]] <- rnames + .Call(XGSetArrayDimNamesInplace_R, arr, dim_names) + } + } + if (!avoid_transpose && is.array(arr)) { arr <- aperm(arr) } diff --git a/R-package/src/init.c b/R-package/src/init.c index 16c1d3b14189..523e5118a6f5 100644 --- a/R-package/src/init.c +++ b/R-package/src/init.c @@ -46,6 +46,7 @@ extern SEXP XGBoosterSetParam_R(SEXP, SEXP, SEXP); extern SEXP XGBoosterUpdateOneIter_R(SEXP, SEXP, SEXP); extern SEXP XGCheckNullPtr_R(SEXP); extern SEXP XGSetArrayDimNamesInplace_R(SEXP, SEXP); +extern SEXP XGSetVectorNamesInplace_R(SEXP, SEXP); extern SEXP XGDMatrixCreateFromCSC_R(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); extern SEXP XGDMatrixCreateFromCSR_R(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); extern SEXP XGDMatrixCreateFromURI_R(SEXP, SEXP, SEXP); @@ -108,6 +109,7 @@ static const R_CallMethodDef CallEntries[] = { {"XGBoosterUpdateOneIter_R", (DL_FUNC) &XGBoosterUpdateOneIter_R, 3}, {"XGCheckNullPtr_R", (DL_FUNC) &XGCheckNullPtr_R, 1}, {"XGSetArrayDimNamesInplace_R", (DL_FUNC) &XGSetArrayDimNamesInplace_R, 2}, + {"XGSetVectorNamesInplace_R", (DL_FUNC) &XGSetVectorNamesInplace_R, 2}, {"XGDMatrixCreateFromCSC_R", (DL_FUNC) &XGDMatrixCreateFromCSC_R, 6}, {"XGDMatrixCreateFromCSR_R", (DL_FUNC) &XGDMatrixCreateFromCSR_R, 6}, {"XGDMatrixCreateFromURI_R", (DL_FUNC) &XGDMatrixCreateFromURI_R, 3}, diff --git a/R-package/src/xgboost_R.cc b/R-package/src/xgboost_R.cc index 5faae8a9fda0..0e7234a18708 100644 --- a/R-package/src/xgboost_R.cc +++ b/R-package/src/xgboost_R.cc @@ -335,6 +335,11 @@ XGB_DLL SEXP XGSetArrayDimNamesInplace_R(SEXP arr, SEXP dim_names) { return R_NilValue; } +XGB_DLL SEXP XGSetVectorNamesInplace_R(SEXP arr, SEXP names) { + Rf_setAttrib(arr, R_NamesSymbol, names); + return R_NilValue; +} + namespace { void _DMatrixFinalizer(SEXP ext) { R_API_BEGIN(); diff --git a/R-package/src/xgboost_R.h b/R-package/src/xgboost_R.h index 08f16bac109c..bfccd9f152db 100644 --- a/R-package/src/xgboost_R.h +++ b/R-package/src/xgboost_R.h @@ -34,6 +34,14 @@ XGB_DLL SEXP XGCheckNullPtr_R(SEXP handle); */ XGB_DLL SEXP XGSetArrayDimNamesInplace_R(SEXP arr, SEXP dim_names); +/*! + * \brief set the names of a vector in-place + * \param arr + * \param names names for the dimensions to set + * \return NULL value + */ +XGB_DLL SEXP XGSetVectorNamesInplace_R(SEXP arr, SEXP names); + /*! 
* \brief Set global configuration * \param json_str a JSON string representing the list of key-value pairs diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index f0ebd7a1c9b5..840ff263523f 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -678,7 +678,7 @@ test_that("Can predict on data.frame objects", { pred_mat <- predict(model, xgb.DMatrix(x_mat)) pred_df <- predict(model, x_df) - expect_equal(pred_mat, pred_df) + expect_equal(pred_mat, unname(pred_df)) }) test_that("'base_margin' gives the same result in DMatrix as in inplace_predict", { @@ -702,7 +702,7 @@ test_that("'base_margin' gives the same result in DMatrix as in inplace_predict" pred_from_dm <- predict(model, dm_w_base) pred_from_mat <- predict(model, x, base_margin = base_margin) - expect_equal(pred_from_dm, pred_from_mat) + expect_equal(pred_from_dm, unname(pred_from_mat)) }) test_that("Coefficients from gblinear have the expected shape and names", { @@ -725,7 +725,7 @@ test_that("Coefficients from gblinear have the expected shape and names", { expect_equal(names(coefs), c("(Intercept)", colnames(x))) pred_auto <- predict(model, x) pred_manual <- as.numeric(mm %*% coefs) - expect_equal(pred_manual, pred_auto, tolerance = 1e-5) + expect_equal(pred_manual, unname(pred_auto), tolerance = 1e-5) # Multi-column coefficients data(iris) @@ -949,3 +949,47 @@ test_that("xgb.cv works for ranking", { ) expect_equal(length(res$folds), 2L) }) + +test_that("Row names are preserved in outputs", { + data(iris) + x <- iris[, -5] + y <- as.numeric(iris$Species) - 1 + dm <- xgb.DMatrix(x, label = y, nthread = 1) + model <- xgb.train( + data = dm, + params = list( + objective = "multi:softprob", + num_class = 3, + max_depth = 2, + nthread = 1 + ), + nrounds = 3 + ) + row.names(x) <- paste0("r", seq(1, nrow(x))) + pred <- predict(model, x) + expect_equal(row.names(pred), row.names(x)) + pred <- predict(model, x, avoid_transpose = TRUE) + expect_equal(colnames(pred), row.names(x)) + + data(mtcars) + y <- mtcars[, 1] + x <- as.matrix(mtcars[, -1]) + dm <- xgb.DMatrix(data = x, label = y) + model <- xgb.train( + data = dm, + params = list( + max_depth = 2, + nthread = 1 + ), + nrounds = 3 + ) + row.names(x) <- paste0("r", seq(1, nrow(x))) + pred <- predict(model, x) + expect_equal(names(pred), row.names(x)) + pred <- predict(model, x, avoid_transpose = TRUE) + expect_equal(names(pred), row.names(x)) + pred <- predict(model, x, predleaf = TRUE) + expect_equal(row.names(pred), row.names(x)) + pred <- predict(model, x, predleaf = TRUE, avoid_transpose = TRUE) + expect_equal(colnames(pred), row.names(x)) +}) diff --git a/R-package/tests/testthat/test_dmatrix.R b/R-package/tests/testthat/test_dmatrix.R index cca7b88da5a9..887f602be009 100644 --- a/R-package/tests/testthat/test_dmatrix.R +++ b/R-package/tests/testthat/test_dmatrix.R @@ -493,6 +493,7 @@ test_that("xgb.DMatrix: ExternalDMatrix produces the same results as regular DMa nrounds = 5 ) pred <- predict(model, x) + pred <- unname(pred) iterator_env <- as.environment( list( @@ -538,7 +539,7 @@ test_that("xgb.DMatrix: ExternalDMatrix produces the same results as regular DMa ) pred_model1_edm <- predict(model, edm) - pred_model2_mat <- predict(model_ext, x) + pred_model2_mat <- predict(model_ext, x) |> unname() pred_model2_edm <- predict(model_ext, edm) expect_equal(pred_model1_edm, pred) @@ -567,6 +568,7 @@ test_that("xgb.DMatrix: External QDM produces same results as regular QDM", { nrounds = 5 ) pred <- 
predict(model, x) + pred <- unname(pred) iterator_env <- as.environment( list( @@ -616,7 +618,7 @@ test_that("xgb.DMatrix: External QDM produces same results as regular QDM", { ) pred_model1_qdm <- predict(model, qdm) - pred_model2_mat <- predict(model_ext, x) + pred_model2_mat <- predict(model_ext, x) |> unname() pred_model2_qdm <- predict(model_ext, qdm) expect_equal(pred_model1_qdm, pred) From 402e7837fb263b2a48397acc9f92656167fd01d0 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Wed, 21 Aug 2024 16:50:31 +0800 Subject: [PATCH 14/19] Fix potential race in feature constraint. (#10719) --- src/common/bitfield.h | 16 +++++++++++----- src/tree/constraints.cu | 10 ++++------ tests/cpp/tree/test_constraints.cu | 15 +++++++-------- 3 files changed, 22 insertions(+), 19 deletions(-) diff --git a/src/common/bitfield.h b/src/common/bitfield.h index 62107876490f..6ecd7fcdf5a0 100644 --- a/src/common/bitfield.h +++ b/src/common/bitfield.h @@ -108,9 +108,11 @@ struct BitFieldContainer { #if defined(__CUDA_ARCH__) __device__ BitFieldContainer& operator|=(BitFieldContainer const& rhs) { auto tid = blockIdx.x * blockDim.x + threadIdx.x; - size_t min_size = min(NumValues(), rhs.NumValues()); + std::size_t min_size = std::min(this->Capacity(), rhs.Capacity()); if (tid < min_size) { - Data()[tid] |= rhs.Data()[tid]; + if (this->Check(tid) || rhs.Check(tid)) { + this->Set(tid); + } } return *this; } @@ -126,16 +128,20 @@ struct BitFieldContainer { #if defined(__CUDA_ARCH__) __device__ BitFieldContainer& operator&=(BitFieldContainer const& rhs) { - size_t min_size = min(NumValues(), rhs.NumValues()); auto tid = blockIdx.x * blockDim.x + threadIdx.x; + std::size_t min_size = std::min(this->Capacity(), rhs.Capacity()); if (tid < min_size) { - Data()[tid] &= rhs.Data()[tid]; + if (this->Check(tid) && rhs.Check(tid)) { + this->Set(tid); + } else { + this->Clear(tid); + } } return *this; } #else BitFieldContainer& operator&=(BitFieldContainer const& rhs) { - size_t min_size = std::min(NumValues(), rhs.NumValues()); + std::size_t min_size = std::min(NumValues(), rhs.NumValues()); for (size_t i = 0; i < min_size; ++i) { Data()[i] &= rhs.Data()[i]; } diff --git a/src/tree/constraints.cu b/src/tree/constraints.cu index ae1d3073c7cc..121d800946b7 100644 --- a/src/tree/constraints.cu +++ b/src/tree/constraints.cu @@ -6,7 +6,6 @@ #include #include -#include #include #include @@ -279,10 +278,6 @@ __global__ void InteractionConstraintSplitKernel(LBitField64 feature, } // enable constraints from feature node |= feature; - // clear the buffer after use - if (tid < feature.Capacity()) { - feature.Clear(tid); - } // enable constraints from parent left |= node; @@ -304,7 +299,7 @@ void FeatureInteractionConstraintDevice::Split( << " Split node: " << node_id << " and its left child: " << left_id << " cannot be the same."; CHECK_NE(node_id, right_id) - << " Split node: " << node_id << " and its left child: " + << " Split node: " << node_id << " and its right child: " << right_id << " cannot be the same."; CHECK_LT(right_id, s_node_constraints_.size()); CHECK_NE(s_node_constraints_.size(), 0); @@ -330,6 +325,9 @@ void FeatureInteractionConstraintDevice::Split( feature_buffer_, feature_id, node, left, right); + + // clear the buffer after use + thrust::fill_n(dh::CachingThrustPolicy(), feature_buffer_.Data(), feature_buffer_.NumValues(), 0); } } // namespace xgboost diff --git a/tests/cpp/tree/test_constraints.cu b/tests/cpp/tree/test_constraints.cu index 09e72a1d2bfa..2af54d892903 100644 --- 
a/tests/cpp/tree/test_constraints.cu +++ b/tests/cpp/tree/test_constraints.cu @@ -1,16 +1,17 @@ /** - * Copyright 2019-2023, XGBoost contributors + * Copyright 2019-2024, XGBoost contributors */ #include #include #include -#include -#include -#include + +#include #include +#include + +#include "../../../src/common/device_helpers.cuh" #include "../../../src/tree/constraints.cuh" #include "../../../src/tree/param.h" -#include "../../../src/common/device_helpers.cuh" namespace xgboost { namespace { @@ -36,9 +37,7 @@ std::string GetConstraintsStr() { } tree::TrainParam GetParameter() { - std::vector> args{ - {"interaction_constraints", GetConstraintsStr()} - }; + Args args{{"interaction_constraints", GetConstraintsStr()}}; tree::TrainParam param; param.Init(args); return param; From 9b88495840785d5f8bd51985139592bae50d2ce7 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 22 Aug 2024 02:06:47 +0800 Subject: [PATCH 15/19] [multi] Implement weight feature importance. (#10700) --- src/gbm/gbtree.h | 25 +++++++++++++++---------- tests/python/test_with_sklearn.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 10 deletions(-) diff --git a/src/gbm/gbtree.h b/src/gbm/gbtree.h index d6ed851c835c..e8765d5c5447 100644 --- a/src/gbm/gbtree.h +++ b/src/gbm/gbtree.h @@ -236,12 +236,11 @@ class GBTree : public GradientBooster { auto add_score = [&](auto fn) { for (auto idx : trees) { CHECK_LE(idx, total_n_trees) << "Invalid tree index."; - auto const& p_tree = model_.trees[idx]; - p_tree->WalkTree([&](bst_node_t nidx) { - auto const& node = (*p_tree)[nidx]; - if (!node.IsLeaf()) { - split_counts[node.SplitIndex()]++; - fn(p_tree, nidx, node.SplitIndex()); + auto const& tree = *model_.trees[idx]; + tree.WalkTree([&](bst_node_t nidx) { + if (!tree.IsLeaf(nidx)) { + split_counts[tree.SplitIndex(nidx)]++; + fn(tree, nidx, tree.SplitIndex(nidx)); } return true; }); @@ -253,12 +252,18 @@ class GBTree : public GradientBooster { gain_map[split] = split_counts[split]; }); } else if (importance_type == "gain" || importance_type == "total_gain") { - add_score([&](auto const &p_tree, bst_node_t nidx, bst_feature_t split) { - gain_map[split] += p_tree->Stat(nidx).loss_chg; + if (!model_.trees.empty() && model_.trees.front()->IsMultiTarget()) { + LOG(FATAL) << "gain/total_gain " << MTNotImplemented(); + } + add_score([&](auto const& tree, bst_node_t nidx, bst_feature_t split) { + gain_map[split] += tree.Stat(nidx).loss_chg; }); } else if (importance_type == "cover" || importance_type == "total_cover") { - add_score([&](auto const &p_tree, bst_node_t nidx, bst_feature_t split) { - gain_map[split] += p_tree->Stat(nidx).sum_hess; + if (!model_.trees.empty() && model_.trees.front()->IsMultiTarget()) { + LOG(FATAL) << "cover/total_cover " << MTNotImplemented(); + } + add_score([&](auto const& tree, bst_node_t nidx, bst_feature_t split) { + gain_map[split] += tree.Stat(nidx).sum_hess; }); } else { LOG(FATAL) diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py index 248c473551f4..bea201cafe45 100644 --- a/tests/python/test_with_sklearn.py +++ b/tests/python/test_with_sklearn.py @@ -336,6 +336,36 @@ def test_feature_importances_weight(): cls.feature_importances_ +def test_feature_importances_weight_vector_leaf() -> None: + from sklearn.datasets import make_multilabel_classification + + X, y = make_multilabel_classification(random_state=1994) + with pytest.raises(ValueError, match="gain/total_gain"): + clf = xgb.XGBClassifier(multi_strategy="multi_output_tree") + 
clf.fit(X, y) + clf.feature_importances_ + + with pytest.raises(ValueError, match="cover/total_cover"): + clf = xgb.XGBClassifier( + multi_strategy="multi_output_tree", importance_type="cover" + ) + clf.fit(X, y) + clf.feature_importances_ + + clf = xgb.XGBClassifier( + multi_strategy="multi_output_tree", + importance_type="weight", + colsample_bynode=0.2, + ) + clf.fit(X, y, feature_weights=np.arange(0, X.shape[1])) + fi = clf.feature_importances_ + assert fi[0] == 0.0 + assert fi[-1] > fi[1] * 5 + + w = np.polynomial.Polynomial.fit(np.arange(0, X.shape[1]), fi, deg=1) + assert w.coef[1] > 0.03 + + @pytest.mark.skipif(**tm.no_pandas()) def test_feature_importances_gain(): from sklearn.datasets import load_digits From 24d225c1abe1c97fa039c7718c4f294a59a531c3 Mon Sep 17 00:00:00 2001 From: Dmitry Razdoburdin Date: Wed, 21 Aug 2024 20:07:44 +0200 Subject: [PATCH 16/19] [SYCL] Implement UpdatePredictionCache and connect updater with leraner. (#10701) --------- Co-authored-by: Dmitry Razdoburdin <> --- plugin/sycl/tree/hist_updater.cc | 95 ++++++++++++++++++ plugin/sycl/tree/hist_updater.h | 19 +++- plugin/sycl/tree/updater_quantile_hist.cc | 92 +++++++++++++++++- plugin/sycl/tree/updater_quantile_hist.h | 31 +++++- src/gbm/gbtree.cc | 3 +- tests/cpp/plugin/test_sycl_hist_updater.cc | 38 ++------ .../cpp/plugin/test_sycl_prediction_cache.cc | 23 +++++ tests/cpp/tree/test_prediction_cache.cc | 91 +---------------- tests/cpp/tree/test_prediction_cache.h | 97 +++++++++++++++++++ .../test_sycl_training_continuation.py | 59 +++++++++++ tests/python-sycl/test_sycl_updaters.py | 80 +++++++++++++++ 11 files changed, 502 insertions(+), 126 deletions(-) create mode 100644 tests/cpp/plugin/test_sycl_prediction_cache.cc create mode 100644 tests/cpp/tree/test_prediction_cache.h create mode 100644 tests/python-sycl/test_sycl_training_continuation.py create mode 100644 tests/python-sycl/test_sycl_updaters.py diff --git a/plugin/sycl/tree/hist_updater.cc b/plugin/sycl/tree/hist_updater.cc index efaddafdb9d8..097e2da7384f 100644 --- a/plugin/sycl/tree/hist_updater.cc +++ b/plugin/sycl/tree/hist_updater.cc @@ -307,6 +307,99 @@ void HistUpdater::ExpandWithLossGuide( builder_monitor_.Stop("ExpandWithLossGuide"); } +template +void HistUpdater::Update( + xgboost::tree::TrainParam const *param, + const common::GHistIndexMatrix &gmat, + const USMVector& gpair, + DMatrix *p_fmat, + xgboost::common::Span> out_position, + RegTree *p_tree) { + builder_monitor_.Start("Update"); + + tree_evaluator_.Reset(qu_, param_, p_fmat->Info().num_col_); + interaction_constraints_.Reset(); + + this->InitData(gmat, gpair, *p_fmat, *p_tree); + if (param_.grow_policy == xgboost::tree::TrainParam::kLossGuide) { + ExpandWithLossGuide(gmat, p_tree, gpair); + } else { + ExpandWithDepthWise(gmat, p_tree, gpair); + } + + for (int nid = 0; nid < p_tree->NumNodes(); ++nid) { + p_tree->Stat(nid).loss_chg = snode_host_[nid].best.loss_chg; + p_tree->Stat(nid).base_weight = snode_host_[nid].weight; + p_tree->Stat(nid).sum_hess = static_cast(snode_host_[nid].stats.GetHess()); + } + + builder_monitor_.Stop("Update"); +} + +template +bool HistUpdater::UpdatePredictionCache( + const DMatrix* data, + linalg::MatrixView out_preds) { + // p_last_fmat_ is a valid pointer as long as UpdatePredictionCache() is called in + // conjunction with Update(). 
+ if (!p_last_fmat_ || !p_last_tree_ || data != p_last_fmat_) { + return false; + } + builder_monitor_.Start("UpdatePredictionCache"); + CHECK_GT(out_preds.Size(), 0U); + + const size_t stride = out_preds.Stride(0); + const bool is_first_group = (out_pred_ptr == nullptr); + const size_t gid = out_pred_ptr == nullptr ? 0 : &out_preds(0) - out_pred_ptr; + const bool is_last_group = (gid + 1 == stride); + + const int buffer_size = out_preds.Size() *stride; + if (buffer_size == 0) return true; + + ::sycl::event event; + if (is_first_group) { + out_preds_buf_.ResizeNoCopy(&qu_, buffer_size); + out_pred_ptr = &out_preds(0); + event = qu_.memcpy(out_preds_buf_.Data(), out_pred_ptr, buffer_size * sizeof(bst_float), event); + } + auto* out_preds_buf_ptr = out_preds_buf_.Data(); + + size_t n_nodes = row_set_collection_.Size(); + std::vector<::sycl::event> events(n_nodes); + for (size_t node = 0; node < n_nodes; node++) { + const common::RowSetCollection::Elem& rowset = row_set_collection_[node]; + if (rowset.begin != nullptr && rowset.end != nullptr && rowset.Size() != 0) { + int nid = rowset.node_id; + // if a node is marked as deleted by the pruner, traverse upward to locate + // a non-deleted leaf. + if ((*p_last_tree_)[nid].IsDeleted()) { + while ((*p_last_tree_)[nid].IsDeleted()) { + nid = (*p_last_tree_)[nid].Parent(); + } + CHECK((*p_last_tree_)[nid].IsLeaf()); + } + bst_float leaf_value = (*p_last_tree_)[nid].LeafValue(); + const size_t* rid = rowset.begin; + const size_t num_rows = rowset.Size(); + + events[node] = qu_.submit([&](::sycl::handler& cgh) { + cgh.depends_on(event); + cgh.parallel_for<>(::sycl::range<1>(num_rows), [=](::sycl::item<1> pid) { + out_preds_buf_ptr[rid[pid.get_id(0)]*stride + gid] += leaf_value; + }); + }); + } + } + if (is_last_group) { + qu_.memcpy(out_pred_ptr, out_preds_buf_ptr, buffer_size * sizeof(bst_float), events); + out_pred_ptr = nullptr; + } + qu_.wait(); + + builder_monitor_.Stop("UpdatePredictionCache"); + return true; +} + template void HistUpdater::InitSampling( const USMVector &gpair, @@ -479,6 +572,8 @@ void HistUpdater::InitData( } } + // store a pointer to the tree + p_last_tree_ = &tree; column_sampler_->Init(ctx_, info.num_col_, info.feature_weights.ConstHostVector(), param_.colsample_bynode, param_.colsample_bylevel, param_.colsample_bytree); diff --git a/plugin/sycl/tree/hist_updater.h b/plugin/sycl/tree/hist_updater.h index 5e0ca6645974..fd5fdda9433d 100644 --- a/plugin/sycl/tree/hist_updater.h +++ b/plugin/sycl/tree/hist_updater.h @@ -11,10 +11,10 @@ #include #pragma GCC diagnostic pop -#include #include #include #include +#include #include "../common/partition_builder.h" #include "split_evaluator.h" @@ -54,12 +54,10 @@ class HistUpdater { explicit HistUpdater(const Context* ctx, ::sycl::queue qu, const xgboost::tree::TrainParam& param, - std::unique_ptr pruner, FeatureInteractionConstraintHost int_constraints_, DMatrix const* fmat) : ctx_(ctx), qu_(qu), param_(param), tree_evaluator_(qu, param, fmat->Info().num_col_), - pruner_(std::move(pruner)), interaction_constraints_{std::move(int_constraints_)}, p_last_tree_(nullptr), p_last_fmat_(fmat) { builder_monitor_.Init("SYCL::Quantile::HistUpdater"); @@ -73,6 +71,17 @@ class HistUpdater { sub_group_size_ = sub_group_sizes.back(); } + // update one tree, growing + void Update(xgboost::tree::TrainParam const *param, + const common::GHistIndexMatrix &gmat, + const USMVector& gpair, + DMatrix *p_fmat, + xgboost::common::Span> out_position, + RegTree *p_tree); + + bool UpdatePredictionCache(const 
DMatrix* data, + linalg::MatrixView p_out_preds); + void SetHistSynchronizer(HistSynchronizer* sync); void SetHistRowsAdder(HistRowsAdder* adder); @@ -200,7 +209,6 @@ class HistUpdater { std::vector> best_splits_host_; TreeEvaluator tree_evaluator_; - std::unique_ptr pruner_; FeatureInteractionConstraintHost interaction_constraints_; // back pointers to tree and data matrix @@ -247,6 +255,9 @@ class HistUpdater { std::unique_ptr> hist_synchronizer_; std::unique_ptr> hist_rows_adder_; + USMVector out_preds_buf_; + bst_float* out_pred_ptr = nullptr; + ::sycl::queue qu_; }; diff --git a/plugin/sycl/tree/updater_quantile_hist.cc b/plugin/sycl/tree/updater_quantile_hist.cc index 98a42c3c8ba0..ee7a7ad0f101 100644 --- a/plugin/sycl/tree/updater_quantile_hist.cc +++ b/plugin/sycl/tree/updater_quantile_hist.cc @@ -3,6 +3,7 @@ * \file updater_quantile_hist.cc */ #include +#include #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wtautological-constant-compare" @@ -29,6 +30,50 @@ void QuantileHistMaker::Configure(const Args& args) { param_.UpdateAllowUnknown(args); hist_maker_param_.UpdateAllowUnknown(args); + + bool has_fp64_support = qu_.get_device().has(::sycl::aspect::fp64); + if (hist_maker_param_.single_precision_histogram || !has_fp64_support) { + if (!hist_maker_param_.single_precision_histogram) { + LOG(WARNING) << "Target device doesn't support fp64, using single_precision_histogram=True"; + } + hist_precision_ = HistPrecision::fp32; + } else { + hist_precision_ = HistPrecision::fp64; + } +} + +template +void QuantileHistMaker::SetPimpl(std::unique_ptr>* pimpl, + DMatrix *dmat) { + pimpl->reset(new HistUpdater( + ctx_, + qu_, + param_, + int_constraint_, dmat)); + if (collective::IsDistributed()) { + LOG(FATAL) << "Distributed mode is not yet upstreamed for sycl"; + } else { + (*pimpl)->SetHistSynchronizer(new BatchHistSynchronizer()); + (*pimpl)->SetHistRowsAdder(new BatchHistRowsAdder()); + } +} + +template +void QuantileHistMaker::CallUpdate( + const std::unique_ptr>& pimpl, + xgboost::tree::TrainParam const *param, + linalg::Matrix *gpair, + DMatrix *dmat, + xgboost::common::Span> out_position, + const std::vector &trees) { + const auto* gpair_h = gpair->Data(); + gpair_device_.Resize(&qu_, gpair_h->Size()); + qu_.memcpy(gpair_device_.Data(), gpair_h->HostPointer(), gpair_h->Size() * sizeof(GradientPair)); + qu_.wait(); + + for (auto tree : trees) { + pimpl->Update(param, gmat_, gpair_device_, dmat, out_position, tree); + } } void QuantileHistMaker::Update(xgboost::tree::TrainParam const *param, @@ -36,12 +81,55 @@ void QuantileHistMaker::Update(xgboost::tree::TrainParam const *param, DMatrix *dmat, xgboost::common::Span> out_position, const std::vector &trees) { - LOG(FATAL) << "Not Implemented yet"; + if (dmat != p_last_dmat_ || is_gmat_initialized_ == false) { + updater_monitor_.Start("DeviceMatrixInitialization"); + sycl::DeviceMatrix dmat_device; + dmat_device.Init(qu_, dmat); + updater_monitor_.Stop("DeviceMatrixInitialization"); + updater_monitor_.Start("GmatInitialization"); + gmat_.Init(qu_, ctx_, dmat_device, static_cast(param_.max_bin)); + updater_monitor_.Stop("GmatInitialization"); + is_gmat_initialized_ = true; + } + // rescale learning rate according to size of trees + float lr = param_.learning_rate; + param_.learning_rate = lr / trees.size(); + int_constraint_.Configure(param_, dmat->Info().num_col_); + // build tree + if (hist_precision_ == HistPrecision::fp32) { + if (!pimpl_fp32) { + SetPimpl(&pimpl_fp32, dmat); + } + CallUpdate(pimpl_fp32, param, gpair, 
dmat, out_position, trees); + } else { + if (!pimpl_fp64) { + SetPimpl(&pimpl_fp64, dmat); + } + CallUpdate(pimpl_fp64, param, gpair, dmat, out_position, trees); + } + + param_.learning_rate = lr; + + p_last_dmat_ = dmat; } bool QuantileHistMaker::UpdatePredictionCache(const DMatrix* data, linalg::MatrixView out_preds) { - LOG(FATAL) << "Not Implemented yet"; + if (param_.subsample < 1.0f) return false; + + if (hist_precision_ == HistPrecision::fp32) { + if (pimpl_fp32) { + return pimpl_fp32->UpdatePredictionCache(data, out_preds); + } else { + return false; + } + } else { + if (pimpl_fp64) { + return pimpl_fp64->UpdatePredictionCache(data, out_preds); + } else { + return false; + } + } } XGBOOST_REGISTER_TREE_UPDATER(QuantileHistMaker, "grow_quantile_histmaker_sycl") diff --git a/plugin/sycl/tree/updater_quantile_hist.h b/plugin/sycl/tree/updater_quantile_hist.h index 93a50de3e449..693255b26157 100644 --- a/plugin/sycl/tree/updater_quantile_hist.h +++ b/plugin/sycl/tree/updater_quantile_hist.h @@ -9,6 +9,7 @@ #include #include +#include #include "../data/gradient_index.h" #include "../common/hist_util.h" @@ -16,8 +17,9 @@ #include "../common/partition_builder.h" #include "split_evaluator.h" #include "../device_manager.h" - +#include "hist_updater.h" #include "xgboost/data.h" + #include "xgboost/json.h" #include "../../src/tree/constraints.h" #include "../../src/common/random.h" @@ -75,12 +77,39 @@ class QuantileHistMaker: public TreeUpdater { HistMakerTrainParam hist_maker_param_; // training parameter xgboost::tree::TrainParam param_; + // quantized data matrix + common::GHistIndexMatrix gmat_; + // (optional) data matrix with feature grouping + // column accessor + DMatrix const* p_last_dmat_ {nullptr}; + bool is_gmat_initialized_ {false}; xgboost::common::Monitor updater_monitor_; + template + void SetPimpl(std::unique_ptr>*, DMatrix *dmat); + + template + void CallUpdate(const std::unique_ptr>& builder, + xgboost::tree::TrainParam const *param, + linalg::Matrix *gpair, + DMatrix *dmat, + xgboost::common::Span> out_position, + const std::vector &trees); + + enum class HistPrecision {fp32, fp64}; + HistPrecision hist_precision_; + + std::unique_ptr> pimpl_fp32; + std::unique_ptr> pimpl_fp64; + + FeatureInteractionConstraintHost int_constraint_; + ::sycl::queue qu_; DeviceManager device_manager; ObjInfo const *task_{nullptr}; + + USMVector gpair_device_; }; diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc index 26c768fafea7..fe640ee000c3 100644 --- a/src/gbm/gbtree.cc +++ b/src/gbm/gbtree.cc @@ -52,7 +52,8 @@ std::string MapTreeMethodToUpdaters(Context const* ctx, TreeMethod tree_method) case TreeMethod::kAuto: // Use hist as default in 2.0 case TreeMethod::kHist: { return ctx->DispatchDevice([] { return "grow_quantile_histmaker"; }, - [] { return "grow_gpu_hist"; }); + [] { return "grow_gpu_hist"; }, + [] { return "grow_quantile_histmaker_sycl"; }); } case TreeMethod::kApprox: { return ctx->DispatchDevice([] { return "grow_histmaker"; }, [] { return "grow_gpu_approx"; }); diff --git a/tests/cpp/plugin/test_sycl_hist_updater.cc b/tests/cpp/plugin/test_sycl_hist_updater.cc index 7789b44381dd..a341f4645e60 100644 --- a/tests/cpp/plugin/test_sycl_hist_updater.cc +++ b/tests/cpp/plugin/test_sycl_hist_updater.cc @@ -21,10 +21,8 @@ class TestHistUpdater : public HistUpdater { TestHistUpdater(const Context* ctx, ::sycl::queue qu, const xgboost::tree::TrainParam& param, - std::unique_ptr pruner, FeatureInteractionConstraintHost int_constraints_, DMatrix const* fmat) : HistUpdater(ctx, qu, 
param, - std::move(pruner), int_constraints_, fmat) {} void TestInitSampling(const USMVector &gpair, @@ -110,14 +108,12 @@ void TestHistUpdaterSampling(const xgboost::tree::TrainParam& param) { DeviceManager device_manager; auto qu = device_manager.GetQueue(ctx.Device()); - ObjInfo task{ObjInfo::kRegression}; auto p_fmat = RandomDataGenerator{num_rows, num_columns, 0.0}.GenerateDMatrix(); FeatureInteractionConstraintHost int_constraints; - std::unique_ptr pruner{TreeUpdater::Create("prune", &ctx, &task)}; - TestHistUpdater updater(&ctx, qu, param, std::move(pruner), int_constraints, p_fmat.get()); + TestHistUpdater updater(&ctx, qu, param, int_constraints, p_fmat.get()); USMVector row_indices_0(&qu, num_rows); USMVector row_indices_1(&qu, num_rows); @@ -165,14 +161,12 @@ void TestHistUpdaterInitData(const xgboost::tree::TrainParam& param, bool has_ne DeviceManager device_manager; auto qu = device_manager.GetQueue(ctx.Device()); - ObjInfo task{ObjInfo::kRegression}; auto p_fmat = RandomDataGenerator{num_rows, num_columns, 0.0}.GenerateDMatrix(); FeatureInteractionConstraintHost int_constraints; - std::unique_ptr pruner{TreeUpdater::Create("prune", &ctx, &task)}; - TestHistUpdater updater(&ctx, qu, param, std::move(pruner), int_constraints, p_fmat.get()); + TestHistUpdater updater(&ctx, qu, param, int_constraints, p_fmat.get()); USMVector gpair(&qu, num_rows); GenerateRandomGPairs(&qu, gpair.Data(), num_rows, has_neg_hess); @@ -221,14 +215,12 @@ void TestHistUpdaterBuildHistogramsLossGuide(const xgboost::tree::TrainParam& pa DeviceManager device_manager; auto qu = device_manager.GetQueue(ctx.Device()); - ObjInfo task{ObjInfo::kRegression}; auto p_fmat = RandomDataGenerator{num_rows, num_columns, sparsity}.GenerateDMatrix(); FeatureInteractionConstraintHost int_constraints; - std::unique_ptr pruner{TreeUpdater::Create("prune", &ctx, &task)}; - TestHistUpdater updater(&ctx, qu, param, std::move(pruner), int_constraints, p_fmat.get()); + TestHistUpdater updater(&ctx, qu, param, int_constraints, p_fmat.get()); updater.SetHistSynchronizer(new BatchHistSynchronizer()); updater.SetHistRowsAdder(new BatchHistRowsAdder()); @@ -285,14 +277,12 @@ void TestHistUpdaterInitNewNode(const xgboost::tree::TrainParam& param, float sp DeviceManager device_manager; auto qu = device_manager.GetQueue(ctx.Device()); - ObjInfo task{ObjInfo::kRegression}; auto p_fmat = RandomDataGenerator{num_rows, num_columns, sparsity}.GenerateDMatrix(); FeatureInteractionConstraintHost int_constraints; - std::unique_ptr pruner{TreeUpdater::Create("prune", &ctx, &task)}; - TestHistUpdater updater(&ctx, qu, param, std::move(pruner), int_constraints, p_fmat.get()); + TestHistUpdater updater(&ctx, qu, param, int_constraints, p_fmat.get()); updater.SetHistSynchronizer(new BatchHistSynchronizer()); updater.SetHistRowsAdder(new BatchHistRowsAdder()); @@ -345,14 +335,12 @@ void TestHistUpdaterEvaluateSplits(const xgboost::tree::TrainParam& param) { DeviceManager device_manager; auto qu = device_manager.GetQueue(ctx.Device()); - ObjInfo task{ObjInfo::kRegression}; auto p_fmat = RandomDataGenerator{num_rows, num_columns, 0.0f}.GenerateDMatrix(); FeatureInteractionConstraintHost int_constraints; - std::unique_ptr pruner{TreeUpdater::Create("prune", &ctx, &task)}; - TestHistUpdater updater(&ctx, qu, param, std::move(pruner), int_constraints, p_fmat.get()); + TestHistUpdater updater(&ctx, qu, param, int_constraints, p_fmat.get()); updater.SetHistSynchronizer(new BatchHistSynchronizer()); updater.SetHistRowsAdder(new BatchHistRowsAdder()); @@ 
-423,8 +411,6 @@ void TestHistUpdaterApplySplit(const xgboost::tree::TrainParam& param, float spa DeviceManager device_manager; auto qu = device_manager.GetQueue(ctx.Device()); - ObjInfo task{ObjInfo::kRegression}; - auto p_fmat = RandomDataGenerator{num_rows, num_columns, sparsity}.GenerateDMatrix(); sycl::DeviceMatrix dmat; dmat.Init(qu, p_fmat.get()); @@ -439,8 +425,7 @@ void TestHistUpdaterApplySplit(const xgboost::tree::TrainParam& param, float spa nodes.emplace_back(tree::ExpandEntry(0, tree.GetDepth(0))); FeatureInteractionConstraintHost int_constraints; - std::unique_ptr pruner{TreeUpdater::Create("prune", &ctx, &task)}; - TestHistUpdater updater(&ctx, qu, param, std::move(pruner), int_constraints, p_fmat.get()); + TestHistUpdater updater(&ctx, qu, param, int_constraints, p_fmat.get()); USMVector gpair(&qu, num_rows); GenerateRandomGPairs(&qu, gpair.Data(), num_rows, false); @@ -455,8 +440,7 @@ void TestHistUpdaterApplySplit(const xgboost::tree::TrainParam& param, float spa std::vector row_indices_desired_host(num_rows); size_t n_left, n_right; { - std::unique_ptr pruner4verification{TreeUpdater::Create("prune", &ctx, &task)}; - TestHistUpdater updater4verification(&ctx, qu, param, std::move(pruner4verification), int_constraints, p_fmat.get()); + TestHistUpdater updater4verification(&ctx, qu, param, int_constraints, p_fmat.get()); auto* row_set_collection4verification = updater4verification.TestInitData(gmat, gpair, *p_fmat, tree); size_t n_nodes = nodes.size(); @@ -526,9 +510,7 @@ void TestHistUpdaterExpandWithLossGuide(const xgboost::tree::TrainParam& param) RegTree tree; FeatureInteractionConstraintHost int_constraints; - ObjInfo task{ObjInfo::kRegression}; - std::unique_ptr pruner{TreeUpdater::Create("prune", &ctx, &task)}; - TestHistUpdater updater(&ctx, qu, param, std::move(pruner), int_constraints, p_fmat.get()); + TestHistUpdater updater(&ctx, qu, param, int_constraints, p_fmat.get()); updater.SetHistSynchronizer(new BatchHistSynchronizer()); updater.SetHistRowsAdder(new BatchHistRowsAdder()); auto* row_set_collection = updater.TestInitData(gmat, gpair, *p_fmat, tree); @@ -576,9 +558,7 @@ void TestHistUpdaterExpandWithDepthWise(const xgboost::tree::TrainParam& param) RegTree tree; FeatureInteractionConstraintHost int_constraints; - ObjInfo task{ObjInfo::kRegression}; - std::unique_ptr pruner{TreeUpdater::Create("prune", &ctx, &task)}; - TestHistUpdater updater(&ctx, qu, param, std::move(pruner), int_constraints, p_fmat.get()); + TestHistUpdater updater(&ctx, qu, param, int_constraints, p_fmat.get()); updater.SetHistSynchronizer(new BatchHistSynchronizer()); updater.SetHistRowsAdder(new BatchHistRowsAdder()); auto* row_set_collection = updater.TestInitData(gmat, gpair, *p_fmat, tree); diff --git a/tests/cpp/plugin/test_sycl_prediction_cache.cc b/tests/cpp/plugin/test_sycl_prediction_cache.cc new file mode 100644 index 000000000000..43f99dc63925 --- /dev/null +++ b/tests/cpp/plugin/test_sycl_prediction_cache.cc @@ -0,0 +1,23 @@ +/** + * Copyright 2020-2024 by XGBoost contributors + */ +#include + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wtautological-constant-compare" +#pragma GCC diagnostic ignored "-W#pragma-messages" +#include "../tree/test_prediction_cache.h" +#pragma GCC diagnostic pop + +namespace xgboost::sycl::tree { + +class SyclPredictionCache : public xgboost::TestPredictionCache {}; + +TEST_F(SyclPredictionCache, Hist) { + Context ctx; + ctx.UpdateAllowUnknown(Args{{"device", "sycl"}}); + + this->RunTest(&ctx, "grow_quantile_histmaker_sycl", 
"one_output_per_tree"); +} + +} // namespace xgboost::sycl::tree diff --git a/tests/cpp/tree/test_prediction_cache.cc b/tests/cpp/tree/test_prediction_cache.cc index fc1d0508797c..5c22ace418cc 100644 --- a/tests/cpp/tree/test_prediction_cache.cc +++ b/tests/cpp/tree/test_prediction_cache.cc @@ -2,97 +2,10 @@ * Copyright 2021-2023 by XGBoost contributors */ #include -#include -#include -#include - -#include "../../../src/tree/param.h" // for TrainParam -#include "../helpers.h" -#include "xgboost/task.h" // for ObjInfo +#include "test_prediction_cache.h" namespace xgboost { - -class TestPredictionCache : public ::testing::Test { - std::shared_ptr Xy_; - std::size_t n_samples_{2048}; - - protected: - void SetUp() override { - std::size_t n_features = 13; - bst_target_t n_targets = 3; - Xy_ = RandomDataGenerator{n_samples_, n_features, 0}.Targets(n_targets).GenerateDMatrix(true); - } - - void RunLearnerTest(Context const* ctx, std::string updater_name, float subsample, - std::string const& grow_policy, std::string const& strategy) { - std::unique_ptr learner{Learner::Create({Xy_})}; - learner->SetParam("device", ctx->DeviceName()); - learner->SetParam("updater", updater_name); - learner->SetParam("multi_strategy", strategy); - learner->SetParam("grow_policy", grow_policy); - learner->SetParam("subsample", std::to_string(subsample)); - learner->SetParam("nthread", "0"); - learner->Configure(); - - for (size_t i = 0; i < 8; ++i) { - learner->UpdateOneIter(i, Xy_); - } - - HostDeviceVector out_prediction_cached; - learner->Predict(Xy_, false, &out_prediction_cached, 0, 0); - - Json model{Object()}; - learner->SaveModel(&model); - - HostDeviceVector out_prediction; - { - std::unique_ptr learner{Learner::Create({Xy_})}; - learner->LoadModel(model); - learner->Predict(Xy_, false, &out_prediction, 0, 0); - } - - auto const h_predt_cached = out_prediction_cached.ConstHostSpan(); - auto const h_predt = out_prediction.ConstHostSpan(); - - ASSERT_EQ(h_predt.size(), h_predt_cached.size()); - for (size_t i = 0; i < h_predt.size(); ++i) { - ASSERT_NEAR(h_predt[i], h_predt_cached[i], kRtEps); - } - } - - void RunTest(Context* ctx, std::string const& updater_name, std::string const& strategy) { - { - ctx->InitAllowUnknown(Args{{"nthread", "8"}}); - - ObjInfo task{ObjInfo::kRegression}; - std::unique_ptr updater{TreeUpdater::Create(updater_name, ctx, &task)}; - RegTree tree; - std::vector trees{&tree}; - auto gpair = GenerateRandomGradients(ctx, n_samples_, 1); - tree::TrainParam param; - param.UpdateAllowUnknown(Args{{"max_bin", "64"}}); - - updater->Configure(Args{}); - std::vector> position(1); - updater->Update(¶m, &gpair, Xy_.get(), position, trees); - HostDeviceVector out_prediction_cached; - out_prediction_cached.SetDevice(ctx->Device()); - out_prediction_cached.Resize(n_samples_); - auto cache = - linalg::MakeTensorView(ctx, &out_prediction_cached, out_prediction_cached.Size(), 1); - ASSERT_TRUE(updater->UpdatePredictionCache(Xy_.get(), cache)); - } - - for (auto policy : {"depthwise", "lossguide"}) { - for (auto subsample : {1.0f, 0.4f}) { - this->RunLearnerTest(ctx, updater_name, subsample, policy, strategy); - this->RunLearnerTest(ctx, updater_name, subsample, policy, strategy); - } - } - } -}; - TEST_F(TestPredictionCache, Approx) { Context ctx; this->RunTest(&ctx, "grow_histmaker", "one_output_per_tree"); @@ -119,4 +32,4 @@ TEST_F(TestPredictionCache, GpuApprox) { this->RunTest(&ctx, "grow_gpu_approx", "one_output_per_tree"); } #endif // defined(XGBOOST_USE_CUDA) -} // namespace xgboost +} // 
namespace xgboost \ No newline at end of file diff --git a/tests/cpp/tree/test_prediction_cache.h b/tests/cpp/tree/test_prediction_cache.h new file mode 100644 index 000000000000..a92c3023710a --- /dev/null +++ b/tests/cpp/tree/test_prediction_cache.h @@ -0,0 +1,97 @@ +/** + * Copyright 2021-2024 by XGBoost contributors. + */ +#pragma once + +#include + +#include +#include + +#include + +#include "../../../src/tree/param.h" // for TrainParam +#include "../helpers.h" +#include "xgboost/task.h" // for ObjInfo + +namespace xgboost { +class TestPredictionCache : public ::testing::Test { + std::shared_ptr Xy_; + std::size_t n_samples_{2048}; + + protected: + void SetUp() override { + std::size_t n_features = 13; + bst_target_t n_targets = 3; + Xy_ = RandomDataGenerator{n_samples_, n_features, 0}.Targets(n_targets).GenerateDMatrix(true); + } + + void RunLearnerTest(Context const* ctx, std::string updater_name, float subsample, + std::string const& grow_policy, std::string const& strategy) { + std::unique_ptr learner{Learner::Create({Xy_})}; + learner->SetParam("device", ctx->DeviceName()); + learner->SetParam("updater", updater_name); + learner->SetParam("multi_strategy", strategy); + learner->SetParam("grow_policy", grow_policy); + learner->SetParam("subsample", std::to_string(subsample)); + learner->SetParam("nthread", "0"); + learner->Configure(); + + for (size_t i = 0; i < 8; ++i) { + learner->UpdateOneIter(i, Xy_); + } + + HostDeviceVector out_prediction_cached; + learner->Predict(Xy_, false, &out_prediction_cached, 0, 0); + + Json model{Object()}; + learner->SaveModel(&model); + + HostDeviceVector out_prediction; + { + std::unique_ptr learner{Learner::Create({Xy_})}; + learner->LoadModel(model); + learner->Predict(Xy_, false, &out_prediction, 0, 0); + } + + auto const h_predt_cached = out_prediction_cached.ConstHostSpan(); + auto const h_predt = out_prediction.ConstHostSpan(); + + ASSERT_EQ(h_predt.size(), h_predt_cached.size()); + for (size_t i = 0; i < h_predt.size(); ++i) { + ASSERT_NEAR(h_predt[i], h_predt_cached[i], kRtEps); + } + } + + void RunTest(Context* ctx, std::string const& updater_name, std::string const& strategy) { + { + ctx->InitAllowUnknown(Args{{"nthread", "8"}}); + + ObjInfo task{ObjInfo::kRegression}; + std::unique_ptr updater{TreeUpdater::Create(updater_name, ctx, &task)}; + RegTree tree; + std::vector trees{&tree}; + auto gpair = GenerateRandomGradients(ctx, n_samples_, 1); + tree::TrainParam param; + param.UpdateAllowUnknown(Args{{"max_bin", "64"}}); + + updater->Configure(Args{}); + std::vector> position(1); + updater->Update(¶m, &gpair, Xy_.get(), position, trees); + HostDeviceVector out_prediction_cached; + out_prediction_cached.SetDevice(ctx->Device()); + out_prediction_cached.Resize(n_samples_); + auto cache = + linalg::MakeTensorView(ctx, &out_prediction_cached, out_prediction_cached.Size(), 1); + ASSERT_TRUE(updater->UpdatePredictionCache(Xy_.get(), cache)); + } + + for (auto policy : {"depthwise", "lossguide"}) { + for (auto subsample : {1.0f, 0.4f}) { + this->RunLearnerTest(ctx, updater_name, subsample, policy, strategy); + this->RunLearnerTest(ctx, updater_name, subsample, policy, strategy); + } + } + } +}; +} // namespace xgboost diff --git a/tests/python-sycl/test_sycl_training_continuation.py b/tests/python-sycl/test_sycl_training_continuation.py new file mode 100644 index 000000000000..e2a11c987bb4 --- /dev/null +++ b/tests/python-sycl/test_sycl_training_continuation.py @@ -0,0 +1,59 @@ +import numpy as np +import xgboost as xgb +import json + +rng = 
np.random.RandomState(1994) + + +class TestSYCLTrainingContinuation: + def run_training_continuation(self, use_json): + kRows = 64 + kCols = 32 + X = np.random.randn(kRows, kCols) + y = np.random.randn(kRows) + dtrain = xgb.DMatrix(X, y) + params = { + "device": "sycl", + "max_depth": "2", + "gamma": "0.1", + "alpha": "0.01", + "enable_experimental_json_serialization": use_json, + } + bst_0 = xgb.train(params, dtrain, num_boost_round=64) + dump_0 = bst_0.get_dump(dump_format="json") + + bst_1 = xgb.train(params, dtrain, num_boost_round=32) + bst_1 = xgb.train(params, dtrain, num_boost_round=32, xgb_model=bst_1) + dump_1 = bst_1.get_dump(dump_format="json") + + def recursive_compare(obj_0, obj_1): + if isinstance(obj_0, float): + assert np.isclose(obj_0, obj_1, atol=1e-6) + elif isinstance(obj_0, str): + assert obj_0 == obj_1 + elif isinstance(obj_0, int): + assert obj_0 == obj_1 + elif isinstance(obj_0, dict): + keys_0 = list(obj_0.keys()) + keys_1 = list(obj_1.keys()) + values_0 = list(obj_0.values()) + values_1 = list(obj_1.values()) + for i in range(len(obj_0.items())): + assert keys_0[i] == keys_1[i] + if list(obj_0.keys())[i] != "missing": + recursive_compare(values_0[i], values_1[i]) + else: + for i in range(len(obj_0)): + recursive_compare(obj_0[i], obj_1[i]) + + assert len(dump_0) == len(dump_1) + for i in range(len(dump_0)): + obj_0 = json.loads(dump_0[i]) + obj_1 = json.loads(dump_1[i]) + recursive_compare(obj_0, obj_1) + + def test_sycl_training_continuation_binary(self): + self.run_training_continuation(False) + + def test_sycl_training_continuation_json(self): + self.run_training_continuation(True) diff --git a/tests/python-sycl/test_sycl_updaters.py b/tests/python-sycl/test_sycl_updaters.py new file mode 100644 index 000000000000..57ca8d783bd7 --- /dev/null +++ b/tests/python-sycl/test_sycl_updaters.py @@ -0,0 +1,80 @@ +import numpy as np +import gc +import pytest +import xgboost as xgb +from hypothesis import given, strategies, assume, settings, note + +import sys +import os + +# sys.path.append("tests/python") +# import testing as tm +from xgboost import testing as tm + +parameter_strategy = strategies.fixed_dictionaries( + { + "max_depth": strategies.integers(0, 11), + "max_leaves": strategies.integers(0, 256), + "max_bin": strategies.integers(2, 1024), + "grow_policy": strategies.sampled_from(["lossguide", "depthwise"]), + "single_precision_histogram": strategies.booleans(), + "min_child_weight": strategies.floats(0.5, 2.0), + "seed": strategies.integers(0, 10), + # We cannot enable subsampling as the training loss can increase + # 'subsample': strategies.floats(0.5, 1.0), + "colsample_bytree": strategies.floats(0.5, 1.0), + "colsample_bylevel": strategies.floats(0.5, 1.0), + } +).filter( + lambda x: (x["max_depth"] > 0 or x["max_leaves"] > 0) + and (x["max_depth"] > 0 or x["grow_policy"] == "lossguide") +) + + +def train_result(param, dmat, num_rounds): + result = {} + xgb.train( + param, + dmat, + num_rounds, + [(dmat, "train")], + verbose_eval=False, + evals_result=result, + ) + return result + + +class TestSYCLUpdaters: + @given(parameter_strategy, strategies.integers(1, 5), tm.make_dataset_strategy()) + @settings(deadline=None) + def test_sycl_hist(self, param, num_rounds, dataset): + param["tree_method"] = "hist" + param["device"] = "sycl" + param["verbosity"] = 0 + param = dataset.set_params(param) + result = train_result(param, dataset.get_dmat(), num_rounds) + note(result) + assert tm.non_increasing(result["train"][dataset.metric]) + + 
@given(tm.make_dataset_strategy(), strategies.integers(0, 1)) + @settings(deadline=None) + def test_specified_device_id_sycl_update(self, dataset, device_id): + # Read the list of sycl-devicese + sycl_ls = os.popen("sycl-ls").read() + devices = sycl_ls.split("\n") + + # Test should launch only on gpu + # Find gpus in the list of devices + # and use the id in the list insteard of device_id + target_device_type = "opencl:gpu" + found_devices = 0 + for idx in range(len(devices)): + if len(devices[idx]) >= len(target_device_type): + if devices[idx][1 : 1 + len(target_device_type)] == target_device_type: + if found_devices == device_id: + param = {"device": f"sycl:gpu:{idx}"} + param = dataset.set_params(param) + result = train_result(param, dataset.get_dmat(), 10) + assert tm.non_increasing(result["train"][dataset.metric]) + else: + found_devices += 1 From 03bd1183bcf5b8294e0fb79f242c78a2ec71808b Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 21 Aug 2024 13:16:55 -0500 Subject: [PATCH 17/19] [doc] prefer 'cmake -B' and 'cmake --build' everywhere (#10717) --- demo/rmm_plugin/README.rst | 8 ++-- doc/build.rst | 68 +++++++++++--------------------- doc/contrib/unit_tests.rst | 7 ++-- doc/tutorials/c_api_tutorial.rst | 12 ++---- plugin/sycl/README.md | 8 ++-- 5 files changed, 37 insertions(+), 66 deletions(-) diff --git a/demo/rmm_plugin/README.rst b/demo/rmm_plugin/README.rst index 809d7aebd22b..c7cff09b52aa 100644 --- a/demo/rmm_plugin/README.rst +++ b/demo/rmm_plugin/README.rst @@ -18,8 +18,8 @@ run CMake with option ``-DPLUGIN_RMM=ON`` (``-DUSE_CUDA=ON`` also required): .. code-block:: sh - cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON - make -j$(nproc) + cmake -B build -S . -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON + cmake --build build -j$(nproc) CMake will attempt to locate the RMM library in your build environment. You may choose to build RMM from the source, or install it using the Conda package manager. If CMake cannot find RMM, you @@ -28,9 +28,9 @@ should specify the location of RMM with the CMake prefix: .. code-block:: sh # If using Conda: - cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON -DCMAKE_PREFIX_PATH=$CONDA_PREFIX + cmake -B build -S . -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON -DCMAKE_PREFIX_PATH=$CONDA_PREFIX # If using RMM installed with a custom location - cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON -DCMAKE_PREFIX_PATH=/path/to/rmm + cmake -B build -S . -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON -DCMAKE_PREFIX_PATH=/path/to/rmm ******************************** Informing XGBoost about RMM pool diff --git a/doc/build.rst b/doc/build.rst index fda64820ad04..6be864587818 100644 --- a/doc/build.rst +++ b/doc/build.rst @@ -68,10 +68,8 @@ After obtaining the source code, one builds XGBoost by running CMake: .. code-block:: bash cd xgboost - mkdir build - cd build - cmake .. - make -j$(nproc) + cmake -B build -S . + cmake --build build -j$(nproc) Building on MacOS ================= @@ -94,12 +92,10 @@ following from the root of the XGBoost directory: .. code-block:: bash - mkdir build - cd build - cmake .. -G"Visual Studio 14 2015 Win64" - # for VS15: cmake .. -G"Visual Studio 15 2017" -A x64 - # for VS16: cmake .. -G"Visual Studio 16 2019" -A x64 - cmake --build . --config Release + cmake -B build -S . -G"Visual Studio 14 2015 Win64" + # for VS15: cmake -B build -S . -G"Visual Studio 15 2017" -A x64 + # for VS16: cmake -B build -S . 
-G"Visual Studio 16 2019" -A x64 + cmake --build build --config Release This specifies an out of source build using the Visual Studio 64 bit generator. (Change the ``-G`` option appropriately if you have a different version of Visual Studio installed.) @@ -127,10 +123,8 @@ From the command line on Linux starting from the XGBoost directory: .. code-block:: bash - mkdir build - cd build - cmake .. -DUSE_CUDA=ON - make -j4 + cmake -B build -S . -DUSE_CUDA=ON + cmake --build build -j4 .. note:: Specifying compute capability @@ -142,10 +136,8 @@ From the command line on Linux starting from the XGBoost directory: .. code-block:: bash - mkdir build - cd build - cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DNCCL_ROOT=/path/to/nccl2 - make -j4 + cmake -B build -S . -DUSE_CUDA=ON -DUSE_NCCL=ON -DNCCL_ROOT=/path/to/nccl2 + cmake --build build -j4 Some additional flags are available for NCCL, ``BUILD_WITH_SHARED_NCCL`` enables building XGBoost with NCCL as a shared library, while ``USE_DLOPEN_NCCL`` enables XGBoost to load NCCL at runtime using ``dlopen``. @@ -153,9 +145,7 @@ On Windows, run CMake as follows: .. code-block:: bash - mkdir build - cd build - cmake .. -G"Visual Studio 17 2022" -A x64 -DUSE_CUDA=ON + cmake -B build -S . -G"Visual Studio 17 2022" -A x64 -DUSE_CUDA=ON (Change the ``-G`` option appropriately if you have a different version of Visual Studio installed.) @@ -163,7 +153,7 @@ The above cmake configuration run will create an ``xgboost.sln`` solution file i .. code-block:: bash - cmake --build . --target xgboost --config Release + cmake --buildbuild. --target xgboost --config Release To speed up compilation, run multiple jobs in parallel by appending option ``-- /MP``. @@ -250,14 +240,11 @@ There are several ways to build and install the package from source: .. code-block:: bash - # Under xgboost source directory - mkdir build - cd build # Build shared library libxgboost.so - cmake .. -GNinja - ninja + cmake -B build -S . -GNinja + cmake --build build # Install as editable installation - cd ../python-package + cd ./python-package pip install -e . 4. Use ``libxgboost.so`` on system path. @@ -336,11 +323,8 @@ above snippet can be replaced by: .. code-block:: bash - mkdir build - cd build - cmake .. -DR_LIB=ON - make -j$(nproc) - make install + cmake -B build -S . -DR_LIB=ON + cmake --build build --target install -j$(nproc) Installing the development version with Visual Studio (Windows) @@ -368,10 +352,8 @@ Open the Command Prompt and navigate to the XGBoost directory, and then run the .. code-block:: bash cd C:\path\to\xgboost - mkdir build - cd build - cmake .. -G"Visual Studio 16 2019" -A x64 -DR_LIB=ON -DR_VERSION=4.0.0 - cmake --build . --target install --config Release + cmake -B build -S . -G"Visual Studio 16 2019" -A x64 -DR_LIB=ON -DR_VERSION=4.0.0 + cmake --build build --target install --config Release .. _r_gpu_support: @@ -385,10 +367,8 @@ On Linux, starting from the XGBoost directory type: .. code-block:: bash - mkdir build - cd build - cmake .. -DUSE_CUDA=ON -DR_LIB=ON - make install -j$(nproc) + cmake -B build -S . -DUSE_CUDA=ON -DR_LIB=ON + cmake --build build --target install -j$(nproc) When default target is used, an R package shared library would be built in the ``build`` area. The ``install`` target, in addition, assembles the package files with this shared library under ``build/R-package`` and runs ``R CMD INSTALL``. @@ -413,10 +393,8 @@ Open the Command Prompt and navigate to the XGBoost directory, and then run the .. 
code-block:: bash cd C:\path\to\xgboost - mkdir build - cd build - cmake .. -G"Visual Studio 16 2019" -A x64 -DUSE_CUDA=ON -DR_LIB=ON -DR_VERSION=4.0.0 - cmake --build . --target install --config Release + cmake -B build -S . -G"Visual Studio 16 2019" -A x64 -DUSE_CUDA=ON -DR_LIB=ON -DR_VERSION=4.0.0 + cmake --build build --target install --config Release If CMake can't find your R during the configuration step, you might provide the location of R to CMake like this: ``-DLIBR_HOME="C:\Program Files\R\R-4.0.0"``. diff --git a/doc/contrib/unit_tests.rst b/doc/contrib/unit_tests.rst index 908e5ed99fa9..aa58cd337020 100644 --- a/doc/contrib/unit_tests.rst +++ b/doc/contrib/unit_tests.rst @@ -130,10 +130,9 @@ To build and run C++ unit tests enable tests while running CMake: .. code-block:: bash - mkdir build - cd build - cmake -GNinja -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DUSE_CUDA=ON -DUSE_NCCL=ON .. - ninja + cmake -B build -S . -GNinja -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DUSE_CUDA=ON -DUSE_NCCL=ON + cmake --build build + cd ./build ./testxgboost Flags like ``USE_CUDA``, ``USE_DMLC_GTEST`` are optional. For more info about how to build diff --git a/doc/tutorials/c_api_tutorial.rst b/doc/tutorials/c_api_tutorial.rst index bb1db82496ff..2346ff9ac9d8 100644 --- a/doc/tutorials/c_api_tutorial.rst +++ b/doc/tutorials/c_api_tutorial.rst @@ -26,14 +26,12 @@ Run the following commands on your terminal. The below commands will install the # clone the XGBoost repository & its submodules git clone --recursive https://github.com/dmlc/xgboost cd xgboost - mkdir build - cd build # Activate the Conda environment, into which we'll install XGBoost conda activate [env_name] # Build the compiled version of XGBoost inside the build folder - cmake .. -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX + cmake -B build -S . -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX # install XGBoost in your conda environment (usually under [your home directory]/miniconda3) - make install + cmake --build build --target install ********************************************************************* Configure CMakeList.txt file of your application to link with XGBoost @@ -55,14 +53,12 @@ To ensure that CMake can locate the XGBoost library, supply ``-DCMAKE_PREFIX_PAT .. code-block:: bash - # Navigate to the build directory for your application - cd build # Activate the Conda environment where we previously installed XGBoost conda activate [env_name] # Invoke CMake with CMAKE_PREFIX_PATH - cmake .. -DCMAKE_PREFIX_PATH=$CONDA_PREFIX + cmake -B build -S . -DCMAKE_PREFIX_PATH=$CONDA_PREFIX # Build your application - make + cmake --build build ************************ Useful Tips To Remember diff --git a/plugin/sycl/README.md b/plugin/sycl/README.md index b5dc07a1a490..98ccd48c0368 100755 --- a/plugin/sycl/README.md +++ b/plugin/sycl/README.md @@ -33,8 +33,6 @@ See also [Intel® oneAPI Programming Guide](https://www.intel.com/content/www/us From the ``xgboost`` directory, run: ```bash -$ mkdir build -$ cd build -$ cmake .. -DPLUGIN_SYCL=ON -$ make -j -``` \ No newline at end of file +$ cmake -B build -S . -DPLUGIN_SYCL=ON +$ cmake --build build -j +``` From cb54374550002efa7e4f2279c8941b4c7c196188 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 22 Aug 2024 04:12:18 +0800 Subject: [PATCH 18/19] Update clang-tidy. (#10730) - Install cmake using pip. - Fix compile command generation. - Clean up the tidy script and remove the need to load the yaml file. - Fix modernized type traits. - Fix span class. 
Polymorphism support is dropped --- include/xgboost/collective/socket.h | 5 +- include/xgboost/host_device_vector.h | 2 +- include/xgboost/json.h | 110 ++++---- include/xgboost/json_io.h | 9 +- include/xgboost/linalg.h | 20 +- include/xgboost/parameter.h | 2 +- include/xgboost/span.h | 58 ++-- src/collective/in_memory_handler.cc | 4 +- src/common/device_helpers.cuh | 6 +- src/common/json.cc | 24 +- src/common/math.h | 29 +- src/common/numeric.cc | 8 +- src/common/numeric.h | 19 +- src/common/observer.h | 6 +- src/common/quantile.cu | 7 +- src/common/stats.cuh | 16 +- src/common/stats.h | 2 +- src/common/transform.h | 7 +- src/data/array_interface.h | 11 +- src/data/simple_dmatrix.cc | 6 +- src/objective/lambdarank_obj.cc | 2 +- src/tree/constraints.cu | 2 +- src/tree/gpu_hist/evaluate_splits.cu | 2 +- src/tree/io_utils.h | 20 +- src/tree/tree_model.cc | 10 +- tests/ci_build/Dockerfile.clang_tidy | 18 +- tests/ci_build/lint_python.py | 3 + tests/ci_build/tidy.py | 293 +++++++++++---------- tests/cpp/common/test_intrusive_ptr.cc | 9 +- tests/cpp/common/test_span.cc | 14 +- tests/cpp/data/test_adapter.cc | 10 +- tests/cpp/data/test_sparse_page_dmatrix.cc | 6 +- tests/cpp/data/test_sparse_page_dmatrix.cu | 2 +- tests/cpp/tree/test_quantile_hist.cc | 8 +- 34 files changed, 362 insertions(+), 388 deletions(-) diff --git a/include/xgboost/collective/socket.h b/include/xgboost/collective/socket.h index bf5fffdaf155..a025edddd409 100644 --- a/include/xgboost/collective/socket.h +++ b/include/xgboost/collective/socket.h @@ -686,8 +686,11 @@ class TCPSocket { * \return size of data actually received return -1 if error occurs */ auto Recv(void *buf, std::size_t len, std::int32_t flags = 0) { - char *_buf = reinterpret_cast(buf); + char *_buf = static_cast(buf); + // See https://github.com/llvm/llvm-project/issues/104241 for skipped tidy analysis + // NOLINTBEGIN(clang-analyzer-unix.BlockInCriticalSection) return recv(handle_, _buf, len, flags); + // NOLINTEND(clang-analyzer-unix.BlockInCriticalSection) } /** * \brief Send string, format is matched with the Python socket wrapper in RABIT. 
diff --git a/include/xgboost/host_device_vector.h b/include/xgboost/host_device_vector.h index 36c7ed32b83d..d9dfeadbc7eb 100644 --- a/include/xgboost/host_device_vector.h +++ b/include/xgboost/host_device_vector.h @@ -85,7 +85,7 @@ enum GPUAccess { template class HostDeviceVector { - static_assert(std::is_standard_layout::value, "HostDeviceVector admits only POD types"); + static_assert(std::is_standard_layout_v, "HostDeviceVector admits only POD types"); public: explicit HostDeviceVector(size_t size = 0, T v = T(), DeviceOrd device = DeviceOrd::CPU()); diff --git a/include/xgboost/json.h b/include/xgboost/json.h index 1416b8899785..82bc56f76b52 100644 --- a/include/xgboost/json.h +++ b/include/xgboost/json.h @@ -11,9 +11,8 @@ #include #include -#include #include -#include // std::enable_if,std::enable_if_t +#include // std::enable_if_t #include #include @@ -223,6 +222,14 @@ class JsonObject : public Value { ~JsonObject() override = default; }; +namespace detail { +template +using IsSameT = std::enable_if_t, std::remove_cv_t>>; + +template +using IsF64T = std::enable_if_t>; +} // namespace detail + class JsonNumber : public Value { public: using Float = float; @@ -232,15 +239,11 @@ class JsonNumber : public Value { public: JsonNumber() : Value(ValueKind::kNumber) {} - template ::value>::type* = nullptr> - JsonNumber(FloatT value) : Value(ValueKind::kNumber) { // NOLINT - number_ = value; - } - template ::value>::type* = nullptr> - JsonNumber(FloatT value) : Value{ValueKind::kNumber}, // NOLINT - number_{static_cast(value)} {} + template * = nullptr> + JsonNumber(FloatT value) : Value(ValueKind::kNumber), number_{value} {} // NOLINT + template * = nullptr> + JsonNumber(FloatT value) // NOLINT + : Value{ValueKind::kNumber}, number_{static_cast(value)} {} JsonNumber(JsonNumber const& that) = delete; JsonNumber(JsonNumber&& that) noexcept : Value{ValueKind::kNumber}, number_{that.number_} {} @@ -258,6 +261,13 @@ class JsonNumber : public Value { } }; +namespace detail { +template +using Not32SizeT = std::enable_if_t && + !std::is_same_v>; +} + + class JsonInteger : public Value { public: using Int = int64_t; @@ -267,24 +277,18 @@ class JsonInteger : public Value { public: JsonInteger() : Value(ValueKind::kInteger) {} // NOLINT + template * = nullptr> + JsonInteger(IntT value) : Value(ValueKind::kInteger), integer_{value} {} // NOLINT + template * = nullptr> + JsonInteger(IntT value) // NOLINT + : Value(ValueKind::kInteger), integer_{static_cast(value)} {} + template * = nullptr> + JsonInteger(IntT value) // NOLINT + : Value(ValueKind::kInteger), integer_{static_cast(value)} {} template ::value>::type* = nullptr> - JsonInteger(IntT value) : Value(ValueKind::kInteger), integer_{value} {} // NOLINT - template ::value>::type* = nullptr> - JsonInteger(IntT value) : Value(ValueKind::kInteger), // NOLINT - integer_{static_cast(value)} {} - template ::value>::type* = nullptr> - JsonInteger(IntT value) : Value(ValueKind::kInteger), // NOLINT - integer_{static_cast(value)} {} - template ::value && - !std::is_same::value>::type * = nullptr> + typename detail::Not32SizeT* = nullptr> JsonInteger(IntT value) // NOLINT - : Value(ValueKind::kInteger), - integer_{static_cast(value)} {} + : Value(ValueKind::kInteger), integer_{static_cast(value)} {} JsonInteger(JsonInteger &&that) noexcept : Value{ValueKind::kInteger}, integer_{that.integer_} {} @@ -325,12 +329,8 @@ class JsonBoolean : public Value { public: JsonBoolean() : Value(ValueKind::kBoolean) {} // NOLINT // Ambigious with JsonNumber. 
- template ::value || - std::is_same::value>::type* = nullptr> - JsonBoolean(Bool value) : // NOLINT - Value(ValueKind::kBoolean), boolean_{value} {} + template , bool>* = nullptr> + JsonBoolean(Bool value) : Value(ValueKind::kBoolean), boolean_{value} {} // NOLINT JsonBoolean(JsonBoolean&& value) noexcept: // NOLINT Value(ValueKind::kBoolean), boolean_{value.boolean_} {} @@ -506,71 +506,52 @@ bool IsA(Json const& j) { namespace detail { // Number -template ::value>::type* = nullptr> +template >* = nullptr> JsonNumber::Float& GetImpl(T& val) { // NOLINT return val.GetNumber(); } -template ::value>::type* = nullptr> +template >* = nullptr> JsonNumber::Float const& GetImpl(T& val) { // NOLINT return val.GetNumber(); } // Integer -template ::value>::type* = nullptr> +template >* = nullptr> JsonInteger::Int& GetImpl(T& val) { // NOLINT return val.GetInteger(); } -template ::value>::type* = nullptr> +template >* = nullptr> JsonInteger::Int const& GetImpl(T& val) { // NOLINT return val.GetInteger(); } // String -template ::value>::type* = nullptr> +template >* = nullptr> std::string& GetImpl(T& val) { // NOLINT return val.GetString(); } -template ::value>::type* = nullptr> +template >* = nullptr> std::string const& GetImpl(T& val) { // NOLINT return val.GetString(); } // Boolean -template ::value>::type* = nullptr> +template >* = nullptr> bool& GetImpl(T& val) { // NOLINT return val.GetBoolean(); } template ::value>::type* = nullptr> + typename std::enable_if_t>* = nullptr> bool const& GetImpl(T& val) { // NOLINT return val.GetBoolean(); } // Array -template ::value>::type* = nullptr> +template >* = nullptr> std::vector& GetImpl(T& val) { // NOLINT return val.GetArray(); } -template ::value>::type* = nullptr> +template >* = nullptr> std::vector const& GetImpl(T& val) { // NOLINT return val.GetArray(); } @@ -586,12 +567,11 @@ std::vector const& GetImpl(JsonTypedArray const& val) { } // Object -template ::value>::type* = nullptr> +template >* = nullptr> JsonObject::Map& GetImpl(T& val) { // NOLINT return val.GetObject(); } -template ::value>::type* = nullptr> +template >* = nullptr> JsonObject::Map const& GetImpl(T& val) { // NOLINT return val.GetObject(); } diff --git a/include/xgboost/json_io.h b/include/xgboost/json_io.h index ce3d25c37e19..57f8005ab777 100644 --- a/include/xgboost/json_io.h +++ b/include/xgboost/json_io.h @@ -1,5 +1,5 @@ /** - * Copyright 2019-2023, XGBoost Contributors + * Copyright 2019-2024, XGBoost Contributors */ #ifndef XGBOOST_JSON_IO_H_ #define XGBOOST_JSON_IO_H_ @@ -7,11 +7,8 @@ #include #include -#include +#include // for int8_t #include -#include -#include -#include #include #include #include @@ -111,7 +108,7 @@ class JsonReader { }; class JsonWriter { - template ::value>* = nullptr> + template >* = nullptr> void Save(T const& v) { this->Save(Json{v}); } diff --git a/include/xgboost/linalg.h b/include/xgboost/linalg.h index 553486dac330..384ba0942a90 100644 --- a/include/xgboost/linalg.h +++ b/include/xgboost/linalg.h @@ -43,9 +43,9 @@ namespace detail { struct ArrayInterfaceHandler { template static constexpr char TypeChar() { - return (std::is_floating_point::value + return (std::is_floating_point_v ? 'f' - : (std::is_integral::value ? (std::is_signed::value ? 'i' : 'u') : '\0')); + : (std::is_integral_v ? (std::is_signed_v ? 'i' : 'u') : '\0')); } }; @@ -93,7 +93,7 @@ struct RangeTag { */ template constexpr int32_t CalcSliceDim() { - return std::is_same::value ? 0 : 1; + return std::is_same_v ? 
0 : 1; } template @@ -114,7 +114,7 @@ template using RemoveCRType = std::remove_const_t>; template -using IndexToTag = std::conditional_t>::value, IntTag, S>; +using IndexToTag = std::conditional_t>, IntTag, S>; template LINALG_HD constexpr auto UnrollLoop(Fn fn) { @@ -159,7 +159,7 @@ inline LINALG_HD int Popc(uint64_t v) { template LINALG_HD void IndexToArr(std::size_t (&arr)[D], Head head) { - static_assert(std::is_integral>::value, "Invalid index type."); + static_assert(std::is_integral_v>, "Invalid index type."); arr[D - 1] = head; } @@ -169,7 +169,7 @@ LINALG_HD void IndexToArr(std::size_t (&arr)[D], Head head) { template LINALG_HD void IndexToArr(std::size_t (&arr)[D], Head head, Rest &&...index) { static_assert(sizeof...(Rest) < D, "Index overflow."); - static_assert(std::is_integral>::value, "Invalid index type."); + static_assert(std::is_integral_v>, "Invalid index type."); arr[D - sizeof...(Rest) - 1] = head; IndexToArr(arr, std::forward(index)...); } @@ -193,7 +193,7 @@ constexpr auto ArrToTuple(T (&arr)[N]) { template LINALG_HD auto UnravelImpl(I idx, common::Span shape) { std::size_t index[D]{0}; - static_assert(std::is_signed::value, + static_assert(std::is_signed_v, "Don't change the type without changing the for loop."); auto const sptr = shape.data(); for (int32_t dim = D; --dim > 0;) { @@ -379,7 +379,7 @@ class TensorView { * \brief Slice dimension for Index tag. */ template - LINALG_HD std::enable_if_t::value, size_t> MakeSliceDim( + LINALG_HD std::enable_if_t, size_t> MakeSliceDim( size_t new_shape[D], size_t new_stride[D], Index i, S &&...slices) const { static_assert(old_dim < kDim); auto offset = stride_[old_dim] * i; @@ -547,7 +547,7 @@ class TensorView { */ [[nodiscard]] LINALG_HD bool CContiguous() const { StrideT stride; - static_assert(std::is_same::value); + static_assert(std::is_same_v); // It's contiguous if the stride can be calculated from shape. detail::CalcStride(shape_, stride); return common::Span{stride_} == common::Span{stride}; @@ -557,7 +557,7 @@ class TensorView { */ [[nodiscard]] LINALG_HD bool FContiguous() const { StrideT stride; - static_assert(std::is_same::value); + static_assert(std::is_same_v); // It's contiguous if the stride can be calculated from shape. detail::CalcStride(shape_, stride); return common::Span{stride_} == common::Span{stride}; diff --git a/include/xgboost/parameter.h b/include/xgboost/parameter.h index 063f1aca7cd0..d8188c030360 100644 --- a/include/xgboost/parameter.h +++ b/include/xgboost/parameter.h @@ -55,7 +55,7 @@ class FieldEntry : public FieldEntry { \ public: \ FieldEntry() { \ static_assert( \ - std::is_same::type>::value, \ + std::is_same_v>, \ "enum class must be backed by int"); \ is_enum_ = true; \ } \ diff --git a/include/xgboost/span.h b/include/xgboost/span.h index 7471c2e44ed6..579737a59033 100644 --- a/include/xgboost/span.h +++ b/include/xgboost/span.h @@ -1,5 +1,5 @@ /** - * Copyright 2018-2023, XGBoost contributors + * Copyright 2018-2024, XGBoost contributors * \brief span class based on ISO++20 span * * About NOLINTs in this file: @@ -129,9 +129,8 @@ namespace detail { * represent ptrdiff_t, which is just int64_t. So we make it deterministic * here. 
*/ -using ptrdiff_t = typename std::conditional< // NOLINT - std::is_same::value, - std::ptrdiff_t, std::int64_t>::type; +using ptrdiff_t = typename std::conditional_t< // NOLINT + std::is_same_v, std::ptrdiff_t, std::int64_t>; } // namespace detail #if defined(_MSC_VER) && _MSC_VER < 1910 @@ -169,8 +168,8 @@ class SpanIterator { span_(_span), index_(_idx) {} friend SpanIterator; - template ::type* = nullptr> - XGBOOST_DEVICE constexpr SpanIterator( // NOLINT + template * = nullptr> + XGBOOST_DEVICE constexpr SpanIterator( // NOLINT const SpanIterator& other_) __span_noexcept : SpanIterator(other_.span_, other_.index_) {} @@ -303,8 +302,8 @@ struct IsAllowedExtentConversion : public std::integral_constant< bool, From == To || From == dynamic_extent || To == dynamic_extent> {}; template -struct IsAllowedElementTypeConversion : public std::integral_constant< - bool, std::is_convertible::value> {}; +struct IsAllowedElementTypeConversion + : public std::integral_constant> {}; // NOLINT template struct IsSpanOracle : std::false_type {}; @@ -313,7 +312,7 @@ template struct IsSpanOracle> : std::true_type {}; template -struct IsSpan : public IsSpanOracle::type> {}; +struct IsSpan : public IsSpanOracle> {}; // Re-implement std algorithms here to adopt CUDA. template @@ -452,35 +451,34 @@ class Span { __span_noexcept : size_(N), data_(&arr[0]) {} template ::value && - !detail::IsSpan::value && - std::is_convertible::value && - std::is_convertible().data())>::value>::type> - Span(Container& _cont) : // NOLINT - size_(_cont.size()), data_(_cont.data()) { + class = typename std::enable_if_t< + !std::is_const_v && !detail::IsSpan::value && + std::is_convertible_v && + std::is_convertible_v().data())>>> + Span(Container& _cont) // NOLINT + : size_(_cont.size()), data_(_cont.data()) { static_assert(!detail::IsSpan::value, "Wrong constructor of Span is called."); } template ::value && - !detail::IsSpan::value && - std::is_convertible::value && - std::is_convertible().data())>::value>::type> - Span(const Container& _cont) : size_(_cont.size()), // NOLINT - data_(_cont.data()) { + class = typename std::enable_if_t< + std::is_const_v && !detail::IsSpan::value && + std::is_convertible_v && + std::is_convertible_v().data())>>> + Span(const Container& _cont) // NOLINT + : size_(_cont.size()), data_(_cont.data()) { static_assert(!detail::IsSpan::value, "Wrong constructor of Span is called."); } template ::value && - detail::IsAllowedExtentConversion::value>> - XGBOOST_DEVICE constexpr Span(const Span& _other) // NOLINT - __span_noexcept : size_(_other.size()), data_(_other.data()) {} + class = typename std::enable_if_t< + detail::IsAllowedElementTypeConversion::value && + detail::IsAllowedExtentConversion::value>> + XGBOOST_DEVICE constexpr Span(const Span& _other) // NOLINT + __span_noexcept : size_(_other.size()), + data_(_other.data()) {} XGBOOST_DEVICE constexpr Span(const Span& _other) __span_noexcept : size_(_other.size()), data_(_other.data()) {} diff --git a/src/collective/in_memory_handler.cc b/src/collective/in_memory_handler.cc index 468f09c53048..37be3f9c7127 100644 --- a/src/collective/in_memory_handler.cc +++ b/src/collective/in_memory_handler.cc @@ -82,7 +82,7 @@ class AllreduceFunctor { } private: - template ::value>* = nullptr> + template >* = nullptr> void AccumulateBitwise(T* buffer, T const* input, std::size_t size, Op reduce_operation) const { switch (reduce_operation) { case Op::kBitwiseAND: @@ -99,7 +99,7 @@ class AllreduceFunctor { } } - template ::value>* = nullptr> + template >* = 
nullptr> void AccumulateBitwise(T*, T const*, std::size_t, Op) const { LOG(FATAL) << "Floating point types do not support bitwise operations."; } diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index 3adc39e73777..00b2a65f85af 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -81,8 +81,8 @@ struct AtomicDispatcher { // atomicAdd is not defined for size_t. template ::value && - !std::is_same::value> * = // NOLINT + std::enable_if_t && + !std::is_same_v> * = // NOLINT nullptr> XGBOOST_DEV_INLINE T atomicAdd(T *addr, T v) { // NOLINT using Type = typename dh::detail::AtomicDispatcher::Type; @@ -381,7 +381,7 @@ void CopyTo(Src const &src, Dst *dst) { dst->resize(src.size()); using SVT = std::remove_cv_t; using DVT = std::remove_cv_t; - static_assert(std::is_same::value, + static_assert(std::is_same_v, "Host and device containers must have same value type."); dh::safe_cuda(cudaMemcpyAsync(thrust::raw_pointer_cast(dst->data()), src.data(), src.size() * sizeof(SVT), cudaMemcpyDefault)); diff --git a/src/common/json.cc b/src/common/json.cc index 2887eeccf855..da3df5366169 100644 --- a/src/common/json.cc +++ b/src/common/json.cc @@ -224,11 +224,11 @@ void JsonArray::Save(JsonWriter* writer) const { writer->Visit(this); } namespace { // error C2668: 'fpclassify': ambiguous call to overloaded function template -std::enable_if_t::value, bool> IsInfMSVCWar(T v) { +std::enable_if_t, bool> IsInfMSVCWar(T v) { return std::isinf(v); } template -std::enable_if_t::value, bool> IsInfMSVCWar(T) { +std::enable_if_t, bool> IsInfMSVCWar(T) { return false; } } // namespace @@ -247,7 +247,7 @@ bool JsonTypedArray::operator==(Value const& rhs) const { if (vec_.size() != arr.size()) { return false; } - if (std::is_same::value) { + if (std::is_same_v) { for (size_t i = 0; i < vec_.size(); ++i) { bool equal{false}; if (common::CheckNAN(vec_[i])) { @@ -693,10 +693,10 @@ void Json::Dump(Json json, JsonWriter* writer) { writer->Save(json); } -static_assert(std::is_nothrow_move_constructible::value); -static_assert(std::is_nothrow_move_constructible::value); -static_assert(std::is_nothrow_move_constructible::value); -static_assert(std::is_nothrow_move_constructible::value); +static_assert(std::is_nothrow_move_constructible_v); +static_assert(std::is_nothrow_move_constructible_v); +static_assert(std::is_nothrow_move_constructible_v); +static_assert(std::is_nothrow_move_constructible_v); Json UBJReader::ParseArray() { auto marker = PeekNextChar(); @@ -887,17 +887,17 @@ template void WriteTypedArray(JsonTypedArray const* arr, std::vector* stream) { stream->emplace_back('['); stream->push_back('$'); - if (std::is_same::value) { + if (std::is_same_v) { stream->push_back('d'); } else if (std::is_same_v) { stream->push_back('D'); - } else if (std::is_same::value) { + } else if (std::is_same_v) { stream->push_back('i'); - } else if (std::is_same::value) { + } else if (std::is_same_v) { stream->push_back('U'); - } else if (std::is_same::value) { + } else if (std::is_same_v) { stream->push_back('l'); - } else if (std::is_same::value) { + } else if (std::is_same_v) { stream->push_back('L'); } else { LOG(FATAL) << "Not implemented"; diff --git a/src/common/math.h b/src/common/math.h index be5ff7abd500..3c64ec39937b 100644 --- a/src/common/math.h +++ b/src/common/math.h @@ -12,7 +12,7 @@ #include // for max #include // for exp, abs, log, lgamma #include // for numeric_limits -#include // for is_floating_point, conditional, is_signed, is_same, declval, enable_if +#include // for 
is_floating_point_v, conditional, is_signed, is_same, declval #include // for pair namespace xgboost { @@ -43,15 +43,11 @@ XGBOOST_DEVICE inline double Sigmoid(double x) { */ template XGBOOST_DEVICE constexpr bool CloseTo(T a, U b) { - using Casted = - typename std::conditional< - std::is_floating_point::value || std::is_floating_point::value, - double, - typename std::conditional< - std::is_signed::value || std::is_signed::value, - int64_t, - uint64_t>::type>::type; - return std::is_floating_point::value ? + using Casted = typename std::conditional_t< + std::is_floating_point_v || std::is_floating_point_v, double, + typename std::conditional_t || std::is_signed_v, std::int64_t, + std::uint64_t>>; + return std::is_floating_point_v ? std::abs(static_cast(a) -static_cast(b)) < 1e-6 : a == b; } @@ -65,11 +61,10 @@ XGBOOST_DEVICE constexpr bool CloseTo(T a, U b) { */ template XGBOOST_DEVICE inline void Softmax(Iterator start, Iterator end) { - static_assert(std::is_same().operator*())>::type - >::value, - "Values should be of type bst_float"); + static_assert( + std::is_same_v< + float, typename std::remove_reference_t().operator*())>>, + "Values should be of type bst_float"); bst_float wmax = *start; for (Iterator i = start+1; i != end; ++i) { wmax = fmaxf(*i, wmax); @@ -137,9 +132,7 @@ inline float LogSum(Iterator begin, Iterator end) { // Redefined here to workaround a VC bug that doesn't support overloading for integer // types. template -XGBOOST_DEVICE typename std::enable_if< - std::numeric_limits::is_integer, bool>::type -CheckNAN(T) { +XGBOOST_DEVICE typename std::enable_if_t::is_integer, bool> CheckNAN(T) { return false; } diff --git a/src/common/numeric.cc b/src/common/numeric.cc index f1993231187d..e1abd3ad388d 100644 --- a/src/common/numeric.cc +++ b/src/common/numeric.cc @@ -1,9 +1,9 @@ -/*! - * Copyright 2022 by XGBoost Contributors +/** + * Copyright 2022-2024, XGBoost Contributors */ #include "numeric.h" -#include // std::is_same +#include // std::is_same_v #include "xgboost/context.h" // Context #include "xgboost/host_device_vector.h" // HostDeviceVector @@ -16,7 +16,7 @@ double Reduce(Context const* ctx, HostDeviceVector const& values) { } else { auto const& h_values = values.ConstHostVector(); auto result = cpu_impl::Reduce(ctx, h_values.cbegin(), h_values.cend(), 0.0); - static_assert(std::is_same::value); + static_assert(std::is_same_v); return result; } } diff --git a/src/common/numeric.h b/src/common/numeric.h index 5b45bba8c03e..d54ba015cada 100644 --- a/src/common/numeric.h +++ b/src/common/numeric.h @@ -1,17 +1,18 @@ /** - * Copyright 2022-2023 by XGBoost contributors. + * Copyright 2022-2024, XGBoost contributors. 
*/ #ifndef XGBOOST_COMMON_NUMERIC_H_ #define XGBOOST_COMMON_NUMERIC_H_ #include // OMPException -#include // for std::max -#include // for size_t -#include // for int32_t -#include // for iterator_traits -#include // for accumulate -#include +#include // for max +#include // for size_t +#include // for int32_t +#include // for iterator_traits +#include // for accumulate +#include // for is_same_v +#include // for vector #include "common.h" // AssertGPUSupport #include "threading_utils.h" // MemStackAllocator, DefaultMaxThreads @@ -44,8 +45,8 @@ void RunLengthEncode(Iter begin, Iter end, std::vector* p_out) { */ template void PartialSum(int32_t n_threads, InIt begin, InIt end, T init, OutIt out_it) { - static_assert(std::is_same::value_type>::value); - static_assert(std::is_same::value_type>::value); + static_assert(std::is_same_v::value_type>); + static_assert(std::is_same_v::value_type>); // The number of threads is pegged to the batch size. If the OMP block is parallelized // on anything other than the batch/block size, it should be reassigned auto n = static_cast(std::distance(begin, end)); diff --git a/src/common/observer.h b/src/common/observer.h index 33c10d53d17a..2abebc7a4470 100644 --- a/src/common/observer.h +++ b/src/common/observer.h @@ -105,9 +105,9 @@ class TrainingObserver { /*\brief Observe objects with `XGBoostParamer' type. */ template , Parameter>::value>::type* = nullptr> - void Observe(const Parameter &p, std::string name) const { + typename std::enable_if_t, Parameter>>* = + nullptr> + void Observe(const Parameter& p, std::string name) const { if (XGBOOST_EXPECT(!kObserve, true)) { return; } Json obj {toJson(p)}; diff --git a/src/common/quantile.cu b/src/common/quantile.cu index eab37f45ed30..c6c665258c8a 100644 --- a/src/common/quantile.cu +++ b/src/common/quantile.cu @@ -8,8 +8,9 @@ #include #include -#include // std::numeric_limits -#include // for partial_sum +#include // for numeric_limits +#include // for partial_sum +#include // for is_same_v #include #include "../collective/allgather.h" @@ -108,7 +109,7 @@ void PruneImpl(common::Span cuts_ptr, template void CopyTo(Span out, Span src) { CHECK_EQ(out.size(), src.size()); - static_assert(std::is_same, std::remove_cv_t>::value); + static_assert(std::is_same_v, std::remove_cv_t>); dh::safe_cuda(cudaMemcpyAsync(out.data(), src.data(), out.size_bytes(), cudaMemcpyDefault)); diff --git a/src/common/stats.cuh b/src/common/stats.cuh index 66ab3953ec87..4e38458b157f 100644 --- a/src/common/stats.cuh +++ b/src/common/stats.cuh @@ -15,7 +15,7 @@ #include // std::size_t #include // std::distance #include // std::numeric_limits -#include // std::is_floating_point,std::iterator_traits +#include // std::is_floating_point_v,std::iterator_traits #include "algorithm.cuh" // SegmentedArgMergeSort #include "cuda_context.cuh" // CUDAContext @@ -37,9 +37,9 @@ struct QuantileSegmentOp { AlphaIt alpha_it; Span d_results; - static_assert(std::is_floating_point::value_type>::value, + static_assert(std::is_floating_point_v::value_type>, "Invalid value for quantile."); - static_assert(std::is_floating_point::value_type>::value, + static_assert(std::is_floating_point_v::value_type>, "Invalid alpha."); XGBOOST_DEVICE void operator()(std::size_t seg_idx) { @@ -102,9 +102,9 @@ struct WeightedQuantileSegOp { Span d_weight_cdf; Span d_sorted_idx; Span d_results; - static_assert(std::is_floating_point::value_type>::value, + static_assert(std::is_floating_point_v::value_type>, "Invalid alpha."); - 
static_assert(std::is_floating_point::value_type>::value, + static_assert(std::is_floating_point_v::value_type>, "Invalid value for quantile."); XGBOOST_DEVICE void operator()(std::size_t seg_idx) { @@ -146,7 +146,7 @@ auto MakeWQSegOp(SegIt seg_it, ValIt val_it, AlphaIt alpha_it, Span * std::distance(seg_begin, seg_end) should be equal to n_segments + 1 */ template ::value>* = nullptr> + std::enable_if_t>* = nullptr> void SegmentedQuantile(Context const* ctx, AlphaIt alpha_it, SegIt seg_begin, SegIt seg_end, ValIt val_begin, ValIt val_end, HostDeviceVector* quantiles) { dh::device_vector sorted_idx; @@ -197,8 +197,8 @@ void SegmentedQuantile(Context const* ctx, double alpha, SegIt seg_begin, SegIt * @param w_begin Iterator for weight for each input element */ template ::value_type, void>::value>* = nullptr> + typename std::enable_if_t< + !std::is_same_v::value_type, void>>* = nullptr> void SegmentedWeightedQuantile(Context const* ctx, AlphaIt alpha_it, SegIt seg_beg, SegIt seg_end, ValIt val_begin, ValIt val_end, WIter w_begin, WIter w_end, HostDeviceVector* quantiles) { diff --git a/src/common/stats.h b/src/common/stats.h index 2f42a698e3d7..89321759bd12 100644 --- a/src/common/stats.h +++ b/src/common/stats.h @@ -49,7 +49,7 @@ float Quantile(Context const* ctx, double alpha, Iter const& begin, Iter const& } auto val = [&](size_t i) { return *(begin + sorted_idx[i]); }; - static_assert(std::is_same::value); + static_assert(std::is_same_v); if (alpha <= (1 / (n + 1))) { return val(0); diff --git a/src/common/transform.h b/src/common/transform.h index 3329439a5323..688bc48da21c 100644 --- a/src/common/transform.h +++ b/src/common/transform.h @@ -128,7 +128,7 @@ class Transform { } #if defined(__CUDACC__) - template ::type* = nullptr, + template * = nullptr, typename... HDV> void LaunchCUDA(Functor _func, HDV*... _vectors) const { UnpackShard(device_, _vectors...); @@ -151,9 +151,8 @@ class Transform { } #else /*! \brief Dummy function defined when compiling for CPU. */ - template ::type* = nullptr, - typename... HDV> - void LaunchCUDA(Functor _func, HDV*...) const { + template * = nullptr, typename... HDV> + void LaunchCUDA(Functor _func, HDV *...) const { // Remove unused parameter compiler warning. 
(void) _func; diff --git a/src/data/array_interface.h b/src/data/array_interface.h index f96ecd0c86a8..93fb55dd5626 100644 --- a/src/data/array_interface.h +++ b/src/data/array_interface.h @@ -12,7 +12,7 @@ #include // for numeric_limits #include // for map #include // for string -#include // for alignment_of, remove_pointer_t, invoke_result_t +#include // for alignment_of_v, remove_pointer_t, invoke_result_t #include // for vector #include "../common/bitfield.h" // for RBitField8 @@ -334,7 +334,7 @@ struct ToDType { }; template struct ToDType::value && sizeof(long double) == 16>> { + std::enable_if_t && sizeof(long double) == 16>> { static constexpr ArrayInterfaceHandler::Type kType = ArrayInterfaceHandler::kF16; }; // uint @@ -555,7 +555,7 @@ class ArrayInterface { } [[nodiscard]] XGBOOST_DEVICE std::size_t ElementAlignment() const { return this->DispatchCall([](auto *typed_data_ptr) { - return std::alignment_of>::value; + return std::alignment_of_v>; }); } @@ -567,9 +567,8 @@ class ArrayInterface { #if defined(XGBOOST_USE_CUDA) // No operator defined for half -> size_t using Type = std::conditional_t< - std::is_same<__half, - std::remove_cv_t>>::value && - std::is_same>::value, + std::is_same_v<__half, std::remove_cv_t>> && + std::is_same_v>, unsigned long long, T>; // NOLINT return static_cast(static_cast(p_values[offset])); #else diff --git a/src/data/simple_dmatrix.cc b/src/data/simple_dmatrix.cc index e4b82b7de59f..a9bac5062d03 100644 --- a/src/data/simple_dmatrix.cc +++ b/src/data/simple_dmatrix.cc @@ -294,16 +294,14 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread, IteratorAdapter; // If AdapterT is either IteratorAdapter or FileAdapter type, use the total batch size to // determine the correct number of rows, as offset_vec may be too short - if (std::is_same::value || - std::is_same::value) { + if (std::is_same_v || std::is_same_v) { info_.num_row_ = total_batch_size; // Ensure offset_vec.size() - 1 == [number of rows] while (offset_vec.size() - 1 < total_batch_size) { offset_vec.emplace_back(offset_vec.back()); } } else { - CHECK((std::is_same::value || - std::is_same::value)) + CHECK((std::is_same_v || std::is_same_v)) << "Expecting CSCAdapter"; info_.num_row_ = offset_vec.size() - 1; } diff --git a/src/objective/lambdarank_obj.cc b/src/objective/lambdarank_obj.cc index 36495d0caa88..d99307d107d4 100644 --- a/src/objective/lambdarank_obj.cc +++ b/src/objective/lambdarank_obj.cc @@ -344,7 +344,7 @@ class LambdaRankNDCG : public LambdaRankObj { common::Span discount, bst_group_t g) { auto delta = [&](auto y_high, auto y_low, std::size_t rank_high, std::size_t rank_low, bst_group_t g) { - static_assert(std::is_floating_point::value); + static_assert(std::is_floating_point_v); return DeltaNDCG(y_high, y_low, rank_high, rank_low, inv_IDCG(g), discount); }; this->CalcLambdaForGroup(iter, g_predt, g_label, w, g_rank, g, delta, g_gpair); diff --git a/src/tree/constraints.cu b/src/tree/constraints.cu index 121d800946b7..b222402fcfce 100644 --- a/src/tree/constraints.cu +++ b/src/tree/constraints.cu @@ -1,5 +1,5 @@ /** - * Copyright 2019-2023, XGBoost contributors + * Copyright 2019-2024, XGBoost contributors */ #include #include diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu index 0131f166fa18..387eeda91b28 100644 --- a/src/tree/gpu_hist/evaluate_splits.cu +++ b/src/tree/gpu_hist/evaluate_splits.cu @@ -41,7 +41,7 @@ XGBOOST_DEVICE float LossChangeMissing(const GradientPairInt64 &scan, template class 
EvaluateSplitAgent { public: - using ArgMaxT = cub::KeyValuePair; + using ArgMaxT = cub::KeyValuePair; using BlockScanT = cub::BlockScan; using MaxReduceT = cub::WarpReduce; using SumReduceT = cub::WarpReduce; diff --git a/src/tree/io_utils.h b/src/tree/io_utils.h index a0d31cc83bd3..7a8f055fe7c9 100644 --- a/src/tree/io_utils.h +++ b/src/tree/io_utils.h @@ -1,10 +1,10 @@ /** - * Copyright 2023 by XGBoost Contributors + * Copyright 2023-2024, XGBoost Contributors */ #ifndef XGBOOST_TREE_IO_UTILS_H_ #define XGBOOST_TREE_IO_UTILS_H_ #include // for string -#include // for enable_if_t, is_same, conditional_t +#include // for enable_if_t, is_same_v, conditional_t #include // for vector #include "xgboost/json.h" // for Json @@ -23,26 +23,24 @@ using IndexArrayT = std::conditional_t, I32Array // typed array, not boolean template -std::enable_if_t::value && !std::is_same::value, T> GetElem( +std::enable_if_t && !std::is_same_v, T> GetElem( std::vector const& arr, size_t i) { return arr[i]; } // typed array boolean template -std::enable_if_t::value && std::is_same::value && - std::is_same::value, - bool> +std::enable_if_t< + !std::is_same_v && std::is_same_v && std::is_same_v, bool> GetElem(std::vector const& arr, size_t i) { return arr[i] == 1; } // json array template -std::enable_if_t< - std::is_same::value, - std::conditional_t::value, int64_t, - std::conditional_t::value, bool, float>>> +std::enable_if_t, + std::conditional_t, int64_t, + std::conditional_t, bool, float>>> GetElem(std::vector const& arr, size_t i) { - if (std::is_same::value && !IsA(arr[i])) { + if (std::is_same_v && !IsA(arr[i])) { return get(arr[i]) == 1; } return get(arr[i]); diff --git a/src/tree/tree_model.cc b/src/tree/tree_model.cc index 9b28a08e594b..2a5a40b970a1 100644 --- a/src/tree/tree_model.cc +++ b/src/tree/tree_model.cc @@ -12,7 +12,7 @@ #include #include #include -#include +#include // for is_floating_point_v #include "../common/categorical.h" // for GetNodeCats #include "../common/common.h" // for EscapeU8 @@ -35,7 +35,7 @@ namespace { template std::enable_if_t, std::string> ToStr(Float value) { int32_t constexpr kFloatMaxPrecision = std::numeric_limits::max_digits10; - static_assert(std::is_floating_point::value, + static_assert(std::is_floating_point_v, "Use std::to_string instead for non-floating point values."); std::stringstream ss; ss << std::setprecision(kFloatMaxPrecision) << value; @@ -45,7 +45,7 @@ std::enable_if_t, std::string> ToStr(Float value template std::string ToStr(linalg::VectorView value, bst_target_t limit) { int32_t constexpr kFloatMaxPrecision = std::numeric_limits::max_digits10; - static_assert(std::is_floating_point::value, + static_assert(std::is_floating_point_v, "Use std::to_string instead for non-floating point values."); std::stringstream ss; ss << std::setprecision(kFloatMaxPrecision); @@ -1091,8 +1091,8 @@ void LoadModelImpl(Json const& in, TreeParam const& param, std::vector(n_nodes); nodes = std::remove_reference_t(n_nodes); - static_assert(std::is_integral(lefts, 0))>::value); - static_assert(std::is_floating_point(loss_changes, 0))>::value); + static_assert(std::is_integral_v(lefts, 0))>); + static_assert(std::is_floating_point_v(loss_changes, 0))>); // Set node for (int32_t i = 0; i < n_nodes; ++i) { diff --git a/tests/ci_build/Dockerfile.clang_tidy b/tests/ci_build/Dockerfile.clang_tidy index 941e2a852b92..2e7751a20185 100644 --- a/tests/ci_build/Dockerfile.clang_tidy +++ b/tests/ci_build/Dockerfile.clang_tidy @@ -11,20 +11,28 @@ RUN \ apt-get update && \ apt-get install 
-y wget git python3 python3-pip software-properties-common \ apt-transport-https ca-certificates gnupg-agent && \ - apt-get install -y llvm-15 clang-tidy-15 clang-15 libomp-15-dev && \ - apt-get install -y cmake + apt-get install -y ninja-build + +# Install clang-tidy: https://apt.llvm.org/ +RUN \ + apt-add-repository "deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-19 main" && \ + wget -O llvm-snapshot.gpg.key https://apt.llvm.org/llvm-snapshot.gpg.key && \ + apt-key add ./llvm-snapshot.gpg.key && \ + rm llvm-snapshot.gpg.key && \ + apt-get update && \ + apt-get install -y clang-tidy-19 clang-19 libomp-19-dev # Set default clang-tidy version RUN \ - update-alternatives --install /usr/bin/clang-tidy clang-tidy /usr/bin/clang-tidy-15 100 && \ - update-alternatives --install /usr/bin/clang clang /usr/bin/clang-15 100 + update-alternatives --install /usr/bin/clang-tidy clang-tidy /usr/bin/clang-tidy-19 100 && \ + update-alternatives --install /usr/bin/clang clang /usr/bin/clang-19 100 RUN \ apt-get install libgtest-dev libgmock-dev -y # Install Python packages RUN \ - pip3 install pyyaml + pip3 install cmake ENV GOSU_VERSION=1.10 diff --git a/tests/ci_build/lint_python.py b/tests/ci_build/lint_python.py index f8bbbc2848b0..d2573e6f4915 100644 --- a/tests/ci_build/lint_python.py +++ b/tests/ci_build/lint_python.py @@ -16,6 +16,8 @@ class LintersPaths: BLACK = ( # core "python-package/", + # CI + "tests/ci_build/tidy.py", # tests "tests/python/test_config.py", "tests/python/test_callback.py", @@ -119,6 +121,7 @@ class LintersPaths: "demo/guide-python/learning_to_rank.py", "demo/aft_survival/aft_survival_viz_demo.py", # CI + "tests/ci_build/tidy.py", "tests/ci_build/lint_python.py", "tests/ci_build/test_r_package.py", "tests/ci_build/test_utils.py", diff --git a/tests/ci_build/tidy.py b/tests/ci_build/tidy.py index 7116eb78e039..13bbedc0b4b5 100755 --- a/tests/ci_build/tidy.py +++ b/tests/ci_build/tidy.py @@ -1,4 +1,6 @@ #!/usr/bin/env python +from __future__ import annotations + import argparse import json import os @@ -9,20 +11,17 @@ from multiprocessing import Pool, cpu_count from time import time -import yaml +def call(args: list[str]) -> tuple[int, int, str, list[str]]: + """Subprocess run wrapper.""" + completed = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + error_msg = completed.stdout.decode("utf-8") + # `workspace` is a name used in the CI container. Normally we should keep the dir + # as `xgboost`. + matched = re.search( + "(workspace|xgboost)/.*(src|tests|include)/.*warning:", error_msg, re.MULTILINE + ) -def call(args): - '''Subprocess run wrapper.''' - completed = subprocess.run(args, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - error_msg = completed.stdout.decode('utf-8') - # `workspace` is a name used in Jenkins CI. Normally we should keep the - # dir as `xgboost`. - matched = re.search('(workspace|xgboost)/.*(src|tests|include)/.*warning:', - error_msg, - re.MULTILINE) if matched is None: return_code = 0 else: @@ -30,195 +29,203 @@ def call(args): return (completed.returncode, return_code, error_msg, args) -class ClangTidy(object): - ''' clang tidy wrapper. +class ClangTidy: + """clang tidy wrapper. Args: args: Command line arguments. cpp_lint: Run linter on C++ source code. cuda_lint: Run linter on CUDA source code. use_dmlc_gtest: Whether to use gtest bundled in dmlc-core. 
- ''' - def __init__(self, args): + """ + + def __init__(self, args: argparse.Namespace) -> None: self.cpp_lint = args.cpp self.cuda_lint = args.cuda self.use_dmlc_gtest: bool = args.use_dmlc_gtest self.cuda_archs = args.cuda_archs.copy() if args.cuda_archs else [] if args.tidy_version: - self.exe = 'clang-tidy-' + str(args.tidy_version) + self.exe = "clang-tidy-" + str(args.tidy_version) else: - self.exe = 'clang-tidy' + self.exe = "clang-tidy" - print('Run linter on CUDA: ', self.cuda_lint) - print('Run linter on C++:', self.cpp_lint) - print('Use dmlc gtest:', self.use_dmlc_gtest) - print('CUDA archs:', ' '.join(self.cuda_archs)) + print("Run linter on CUDA: ", self.cuda_lint) + print("Run linter on C++:", self.cpp_lint) + print("Use dmlc gtest:", self.use_dmlc_gtest) + print("CUDA archs:", " ".join(self.cuda_archs)) if not self.cpp_lint and not self.cuda_lint: - raise ValueError('Both --cpp and --cuda are set to 0.') + raise ValueError("Both --cpp and --cuda are set to 0.") self.root_path = os.path.abspath(os.path.curdir) - print('Project root:', self.root_path) - self.cdb_path = os.path.join(self.root_path, 'cdb') + print("Project root:", self.root_path) + self.cdb_path = os.path.join(self.root_path, "cdb") - def __enter__(self): + def __enter__(self) -> "ClangTidy": self.start = time() if os.path.exists(self.cdb_path): shutil.rmtree(self.cdb_path) self._generate_cdb() return self - def __exit__(self, *args): + def __exit__(self, *args: list) -> None: if os.path.exists(self.cdb_path): shutil.rmtree(self.cdb_path) self.end = time() - print('Finish running clang-tidy:', self.end - self.start) + print("Finish running clang-tidy:", self.end - self.start) - def _generate_cdb(self): - '''Run CMake to generate compilation database.''' + def _generate_cdb(self) -> None: + """Run CMake to generate compilation database.""" os.mkdir(self.cdb_path) os.chdir(self.cdb_path) - cmake_args = ['cmake', '..', '-DCMAKE_EXPORT_COMPILE_COMMANDS=ON', - '-DGOOGLE_TEST=ON'] + cmake_args = [ + "cmake", + self.root_path, + "-GNinja", # prevents cmake from using --option-files for include path. 
+ "-DCMAKE_EXPORT_COMPILE_COMMANDS=ON", + "-DGOOGLE_TEST=ON", + "-DCMAKE_CXX_FLAGS='-Wno-clang-diagnostic-deprecated-declarations'", + ] if self.use_dmlc_gtest: - cmake_args.append('-DUSE_DMLC_GTEST=ON') + cmake_args.append("-DUSE_DMLC_GTEST=ON") else: - cmake_args.append('-DUSE_DMLC_GTEST=OFF') + cmake_args.append("-DUSE_DMLC_GTEST=OFF") if self.cuda_lint: - cmake_args.extend(['-DUSE_CUDA=ON', '-DUSE_NCCL=ON']) + cmake_args.extend(["-DUSE_CUDA=ON", "-DUSE_NCCL=ON"]) if self.cuda_archs: - arch_list = ';'.join(self.cuda_archs) - cmake_args.append(f'-DGPU_COMPUTE_VER={arch_list}') + arch_list = ";".join(self.cuda_archs) + cmake_args.append(f"-DCMAKE_CUDA_ARCHITECTURES={arch_list}") subprocess.run(cmake_args) os.chdir(self.root_path) - def convert_nvcc_command_to_clang(self, command): - '''Convert nvcc flags to corresponding clang flags.''' + def convert_nvcc_command_to_clang(self, command: str) -> str: + """Convert nvcc flags to corresponding clang flags.""" components = command.split() compiler: str = components[0] - if compiler.find('nvcc') != -1: - compiler = 'clang++' + if compiler.find("nvcc") != -1: + compiler = "clang++" components[0] = compiler # check each component in a command converted_components = [compiler] for i in range(1, len(components)): - if components[i] == '-lineinfo': + if components[i] == "-lineinfo": continue - elif components[i] == '-fuse-ld=gold': + elif components[i] == "-fuse-ld=gold": continue - elif components[i] == '-rdynamic': + elif components[i] == "-fuse-ld=lld": + continue + elif components[i].find("--default-stream") != -1: + continue + elif components[i] == "-rdynamic": continue elif components[i] == "-Xfatbin=-compress-all": continue elif components[i] == "-forward-unknown-to-host-compiler": continue - elif (components[i] == '-x' and - components[i+1] == 'cu'): + elif components[i] == "-x" and components[i + 1] == "cu": # -x cu -> -x cuda - converted_components.append('-x') - converted_components.append('cuda') - components[i+1] = '' + converted_components.append("-x") + converted_components.append("cuda") + components[i + 1] = "" continue - elif components[i].find('-Xcompiler') != -1: + elif components[i].find("-Xcompiler") != -1: continue - elif components[i].find('--expt') != -1: + elif components[i].find("--expt-") != -1: continue - elif components[i].find('-ccbin') != -1: + elif components[i].find("-ccbin") != -1: continue - elif components[i].find('--generate-code') != -1: - keyword = 'code=sm' + elif components[i].find("--generate-code") != -1: + keyword = "code=sm" pos = components[i].find(keyword) - capability = components[i][pos + len(keyword) + 1: - pos + len(keyword) + 3] + capability = components[i][ + pos + len(keyword) + 1 : pos + len(keyword) + 3 + ] if pos != -1: - converted_components.append( - '--cuda-gpu-arch=sm_' + capability) - elif components[i].find('--std=c++14') != -1: - converted_components.append('-std=c++14') - elif components[i].startswith('-isystem='): - converted_components.extend(components[i].split('=')) + converted_components.append("--cuda-gpu-arch=sm_" + capability) + elif components[i].find("--std=c++14") != -1: + converted_components.append("-std=c++14") + elif components[i].startswith("-isystem="): + converted_components.extend(components[i].split("=")) else: converted_components.append(components[i]) - converted_components.append('-isystem /usr/local/cuda/include/') + converted_components.append("-isystem /usr/local/cuda/include/") - command = '' + command = "" for c in converted_components: - command = command 
+ ' ' + c + command = command + " " + c command = command.strip() return command - def _configure_flags(self, path, command): - src = os.path.join(self.root_path, 'src') - src = src.replace('/', '\\/') - include = os.path.join(self.root_path, 'include') - include = include.replace('/', '\\/') - - header_filter = '(' + src + '|' + include + ')' - common_args = [self.exe, - "-header-filter=" + header_filter, - '-config='+self.clang_tidy] - common_args.append(path) - common_args.append('--') + def _configure_flags(self, path: str, command: str) -> list[list[str]]: + src = os.path.join(self.root_path, "src").replace("/", "\\/") + include = os.path.join(self.root_path, "include").replace("/", "\\/") + + header_filter = "(" + src + "|" + include + ")" + common_args = [ + self.exe, + path, + "--header-filter=" + header_filter, + "--config-file=" + self.tidy_file, + ] + common_args.append("--") command = self.convert_nvcc_command_to_clang(command) - command = command.split()[1:] # remove clang/c++/g++ - if '-c' in command: - index = command.index('-c') - del command[index+1] - command.remove('-c') - if '-o' in command: - index = command.index('-o') - del command[index+1] - command.remove('-o') + command_split = command.split()[1:] # remove clang/c++/g++ + if "-c" in command_split: + index = command_split.index("-c") + del command_split[index + 1] + command_split.remove("-c") + if "-o" in command_split: + index = command_split.index("-o") + del command_split[index + 1] + command_split.remove("-o") - common_args.extend(command) + common_args.extend(command_split) # Two passes, one for device code another for host code. - if path.endswith('cu'): + if path.endswith("cu"): args = [common_args.copy(), common_args.copy()] - args[0].append('--cuda-host-only') - args[1].append('--cuda-device-only') + args[0].append("--cuda-host-only") + args[1].append("--cuda-device-only") else: args = [common_args.copy()] for a in args: - a.append('-Wno-unused-command-line-argument') + a.append("-Wno-unused-command-line-argument") return args - def _configure(self): - '''Load and configure compile_commands and clang_tidy.''' + def _configure(self) -> list[list[str]]: + """Load and configure compile_commands and clang_tidy.""" - def should_lint(path): - if not self.cpp_lint and path.endswith('.cc'): + def should_lint(path: str) -> bool: + if not self.cpp_lint and path.endswith(".cc"): return False - isxgb = path.find('dmlc-core') == -1 + isxgb = path.find("dmlc-core") == -1 isxgb = isxgb and (not path.startswith(self.cdb_path)) if isxgb: print(path) return True + return False - cdb_file = os.path.join(self.cdb_path, 'compile_commands.json') - with open(cdb_file, 'r') as fd: + cdb_file = os.path.join(self.cdb_path, "compile_commands.json") + with open(cdb_file, "r") as fd: self.compile_commands = json.load(fd) - tidy_file = os.path.join(self.root_path, '.clang-tidy') - with open(tidy_file) as fd: - self.clang_tidy = yaml.safe_load(fd) - self.clang_tidy = str(self.clang_tidy) + self.tidy_file = os.path.join(self.root_path, ".clang-tidy") all_files = [] for entry in self.compile_commands: - path = entry['file'] + path = entry["file"] if should_lint(path): - args = self._configure_flags(path, entry['command']) + args = self._configure_flags(path, entry["command"]) all_files.extend(args) return all_files - def run(self): - '''Run clang-tidy.''' + def run(self) -> bool: + """Run clang-tidy.""" all_files = self._configure() passed = True - BAR = '-'*32 + BAR = "-" * 32 with Pool(cpu_count()) as pool: results = pool.map(call, 
all_files) for i, (process_status, tidy_status, msg, args) in enumerate(results): @@ -226,54 +233,50 @@ def run(self): # for cub in thrust is not correct. if tidy_status == 1: passed = False - print(BAR, '\n' - 'Command args:', ' '.join(args), ', ', - 'Process return code:', process_status, ', ', - 'Tidy result code:', tidy_status, ', ', - 'Message:\n', msg, - BAR, '\n') + print( + BAR, + "\n" "Command args:", + " ".join(args), + ", ", + "Process return code:", + process_status, + ", ", + "Tidy result code:", + tidy_status, + ", ", + "Message:\n", + msg, + BAR, + "\n", + ) if not passed: - print('Errors in `thrust` namespace can be safely ignored.', - 'Please address rest of the clang-tidy warnings.') + print( + "Errors in `thrust` namespace can be safely ignored.", + "Please address rest of the clang-tidy warnings.", + ) return passed -def test_tidy(args): - '''See if clang-tidy and our regex is working correctly. There are -many subtleties we need to be careful. For instances: - - * Is the string re-directed to pipe encoded as UTF-8? or is it -bytes? +def test_tidy(args: argparse.Namespace) -> None: + """See if clang-tidy and our regex is working correctly. There are many subtleties + we need to be careful. Tests here are not thorough, at least we want to guarantee + tidy is not missing anything on the CI. - * On Jenkins there's no 'xgboost' directory, are we catching the -right keywords? - - * Should we use re.DOTALL? - - * Should we use re.MULTILINE? - - Tests here are not thorough, at least we want to guarantee tidy is - not missing anything on Jenkins. - - ''' + """ root_path = os.path.abspath(os.path.curdir) - tidy_file = os.path.join(root_path, '.clang-tidy') - test_file_path = os.path.join(root_path, - 'tests', 'ci_build', 'test_tidy.cc') - - with open(tidy_file) as fd: - tidy_config = fd.read() - tidy_config = str(tidy_config) - tidy_config = '-config='+tidy_config + tidy_file = os.path.join(root_path, ".clang-tidy") + test_file_path = os.path.join(root_path, "tests", "ci_build", "test_tidy.cc") + + tidy_config = "--config-file=" + tidy_file if not args.tidy_version: - tidy = 'clang-tidy' + tidy = "clang-tidy" else: - tidy = 'clang-tidy-' + str(args.tidy_version) - args = [tidy, tidy_config, test_file_path] - (proc_code, tidy_status, error_msg, _) = call(args) + tidy = "clang-tidy-" + str(args.tidy_version) + cmd = [tidy, tidy_config, test_file_path] + (proc_code, tidy_status, error_msg, _) = call(cmd) assert proc_code == 0 assert tidy_status == 1 - print('clang-tidy is working.') + print("clang-tidy is working.") if __name__ == "__main__": diff --git a/tests/cpp/common/test_intrusive_ptr.cc b/tests/cpp/common/test_intrusive_ptr.cc index 5b07476255a4..3b6f4083d55e 100644 --- a/tests/cpp/common/test_intrusive_ptr.cc +++ b/tests/cpp/common/test_intrusive_ptr.cc @@ -1,3 +1,6 @@ +/** + * Copyright 2020-2024, XGBoost contributors + */ #include #include @@ -12,10 +15,8 @@ class NotCopyConstructible { NotCopyConstructible &operator=(NotCopyConstructible const &that) = delete; NotCopyConstructible(NotCopyConstructible&& that) = default; }; -static_assert( - !std::is_trivially_copy_constructible::value); -static_assert( - !std::is_trivially_copy_assignable::value); +static_assert(!std::is_trivially_copy_constructible_v); +static_assert(!std::is_trivially_copy_assignable_v); class ForIntrusivePtrTest { public: diff --git a/tests/cpp/common/test_span.cc b/tests/cpp/common/test_span.cc index b29c562bc3eb..486896c24891 100644 --- a/tests/cpp/common/test_span.cc +++ 
b/tests/cpp/common/test_span.cc @@ -1,5 +1,5 @@ /** - * Copyright 2018-2023, XGBoost contributors + * Copyright 2018-2024, XGBoost contributors */ #include "test_span.h" @@ -174,19 +174,11 @@ TEST(Span, FromFirstLast) { } } -struct BaseClass { - virtual void operator()() {} -}; -struct DerivedClass : public BaseClass { - void operator()() override {} -}; - TEST(Span, FromOther) { - // convert constructor { - Span derived; - Span base { derived }; + Span derived; + Span base{derived}; ASSERT_EQ(base.size(), derived.size()); ASSERT_EQ(base.data(), derived.data()); } diff --git a/tests/cpp/data/test_adapter.cc b/tests/cpp/data/test_adapter.cc index f34cfceed2f3..6833dc19e46d 100644 --- a/tests/cpp/data/test_adapter.cc +++ b/tests/cpp/data/test_adapter.cc @@ -93,11 +93,11 @@ TEST(Adapter, CSCAdapterColsMoreThanRows) { // A mock for JVM data iterator. class CSRIterForTest { - std::vector data_ {1, 2, 3, 4, 5}; - std::vector().index)>::type> - feature_idx_ {0, 1, 0, 1, 1}; - std::vector().offset)>::type> - row_ptr_ {0, 2, 4, 5, 5}; + std::vector data_{1, 2, 3, 4, 5}; + std::vector().index)>> feature_idx_{ + 0, 1, 0, 1, 1}; + std::vector().offset)>> row_ptr_{ + 0, 2, 4, 5, 5}; size_t iter_ {0}; public: diff --git a/tests/cpp/data/test_sparse_page_dmatrix.cc b/tests/cpp/data/test_sparse_page_dmatrix.cc index b52d49176ef6..a557b7f622b9 100644 --- a/tests/cpp/data/test_sparse_page_dmatrix.cc +++ b/tests/cpp/data/test_sparse_page_dmatrix.cc @@ -49,7 +49,7 @@ void TestSparseDMatrixLoadFile(Context const* ctx) { 1}; Page out; for (auto const &page : m.GetBatches(ctx)) { - if (std::is_same::value) { + if (std::is_same_v) { out.Push(page); } else { out.PushCSC(page); @@ -89,7 +89,7 @@ void TestRetainPage() { for (auto it = begin; it != end; ++it) { iterators.push_back(it.Page()); pages.emplace_back(Page{}); - if (std::is_same::value) { + if (std::is_same_v) { pages.back().Push(*it); } else { pages.back().PushCSC(*it); @@ -105,7 +105,7 @@ void TestRetainPage() { // make sure it's const and the caller can not modify the content of page. for (auto &page : p_fmat->GetBatches({&ctx})) { - static_assert(std::is_const>::value); + static_assert(std::is_const_v>); } } diff --git a/tests/cpp/data/test_sparse_page_dmatrix.cu b/tests/cpp/data/test_sparse_page_dmatrix.cu index f74ca28eb85e..81940c5a6867 100644 --- a/tests/cpp/data/test_sparse_page_dmatrix.cu +++ b/tests/cpp/data/test_sparse_page_dmatrix.cu @@ -166,7 +166,7 @@ TEST(SparsePageDMatrix, RetainEllpackPage) { // make sure it's const and the caller can not modify the content of page. for (auto& page : m->GetBatches(&ctx, param)) { - static_assert(std::is_const>::value); + static_assert(std::is_const_v>); } // The above iteration clears out all references inside DMatrix. 
diff --git a/tests/cpp/tree/test_quantile_hist.cc b/tests/cpp/tree/test_quantile_hist.cc index 74fd6ec5ff79..518e4d0248ea 100644 --- a/tests/cpp/tree/test_quantile_hist.cc +++ b/tests/cpp/tree/test_quantile_hist.cc @@ -62,7 +62,7 @@ void TestPartitioner(bst_target_t n_targets) { auto ptr = gmat.cut.Ptrs()[split_ind + 1]; float split_value = gmat.cut.Values().at(ptr / 2); RegTree tree{n_targets, n_features}; - if constexpr (std::is_same::value) { + if constexpr (std::is_same_v) { GetSplit(&tree, split_value, &candidates); } else { GetMultiSplitForTest(&tree, split_value, &candidates); @@ -119,7 +119,7 @@ void VerifyColumnSplitPartitioner(bst_target_t n_targets, size_t n_samples, { RegTree tree{n_targets, n_features}; CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid, true}; - if constexpr (std::is_same::value) { + if constexpr (std::is_same_v) { GetSplit(&tree, min_value, &candidates); } else { GetMultiSplitForTest(&tree, min_value, &candidates); @@ -132,7 +132,7 @@ void VerifyColumnSplitPartitioner(bst_target_t n_targets, size_t n_samples, { RegTree tree{n_targets, n_features}; CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid, true}; - if constexpr (std::is_same::value) { + if constexpr (std::is_same_v) { GetSplit(&tree, mid_value, &candidates); } else { GetMultiSplitForTest(&tree, mid_value, &candidates); @@ -187,7 +187,7 @@ void TestColumnSplitPartitioner(bst_target_t n_targets) { auto ptr = gmat.cut.Ptrs()[split_ind + 1]; mid_value = gmat.cut.Values().at(ptr / 2); RegTree tree{n_targets, n_features}; - if constexpr (std::is_same::value) { + if constexpr (std::is_same_v) { GetSplit(&tree, mid_value, &candidates); } else { GetMultiSplitForTest(&tree, mid_value, &candidates); From 142bdc73ece531d8681779763e473e8df27510a9 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 22 Aug 2024 05:25:10 +0800 Subject: [PATCH 19/19] [EM] Support SHAP contribution with QDM. (#10724) - Add GPU support. - Add external memory support. - Update the GPU tree shap. 
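A minimal usage sketch of what this change enables from the Python side (not part of the
patch itself; the dataset, hyperparameters, and max_bin value below are illustrative only,
and a CUDA-enabled build is assumed):

    import numpy as np
    import xgboost as xgb

    rng = np.random.default_rng(0)
    X = rng.normal(size=(2048, 16))
    y = rng.integers(0, 3, size=2048)

    # QuantileDMatrix keeps only the compressed (Ellpack) representation on the GPU,
    # so there is no SparsePage for the predictor to fall back on.
    Xy = xgb.QuantileDMatrix(X, y, max_bin=64)
    booster = xgb.train(
        {"device": "cuda", "tree_method": "hist",
         "objective": "multi:softprob", "num_class": 3},
        Xy,
        num_boost_round=4,
    )

    # Before this patch the GPU predictor raised
    # "SHAP value for QuantileDMatrix is not yet implemented for GPU.";
    # both calls now go through the Ellpack-based accessor instead.
    contribs = booster.predict(Xy, pred_contribs=True)
    interactions = booster.predict(Xy, pred_interactions=True)
    print(contribs.shape, interactions.shape)
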
--- gputreeshap | 2 +- src/predictor/gpu_predictor.cu | 137 +++++++++++---------- tests/cpp/data/test_simple_dmatrix.cc | 11 +- tests/cpp/gbm/test_gbtree.cc | 10 +- tests/cpp/helpers.cc | 63 +++++----- tests/cpp/helpers.h | 8 +- tests/cpp/objective/test_lambdarank_obj.cc | 3 +- tests/cpp/predictor/test_cpu_predictor.cc | 10 +- tests/cpp/predictor/test_gpu_predictor.cu | 14 ++- tests/cpp/predictor/test_predictor.cc | 133 ++++++++++++++------ tests/cpp/predictor/test_predictor.h | 8 +- tests/cpp/test_learner.cc | 2 +- tests/python-gpu/test_gpu_prediction.py | 32 ++++- 13 files changed, 274 insertions(+), 159 deletions(-) diff --git a/gputreeshap b/gputreeshap index 787259b412c1..40eae8c4c459 160000 --- a/gputreeshap +++ b/gputreeshap @@ -1 +1 @@ -Subproject commit 787259b412c18ab8d5f24bf2b8bd6a59ff8208f3 +Subproject commit 40eae8c4c45974705f8053e4d3d05b88e3cfaefd diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu index 570872aa52ad..38d6eca4d8b0 100644 --- a/src/predictor/gpu_predictor.cu +++ b/src/predictor/gpu_predictor.cu @@ -143,10 +143,9 @@ struct SparsePageLoader { }; struct EllpackLoader { - EllpackDeviceAccessor const& matrix; - XGBOOST_DEVICE EllpackLoader(EllpackDeviceAccessor const& m, bool, bst_feature_t, bst_idx_t, - float) - : matrix{m} {} + EllpackDeviceAccessor matrix; + XGBOOST_DEVICE EllpackLoader(EllpackDeviceAccessor m, bool, bst_feature_t, bst_idx_t, float) + : matrix{std::move(m)} {} [[nodiscard]] XGBOOST_DEV_INLINE float GetElement(size_t ridx, size_t fidx) const { auto gidx = matrix.GetBinIndex(ridx, fidx); if (gidx == -1) { @@ -162,6 +161,8 @@ struct EllpackLoader { } return matrix.gidx_fvalue_map[gidx - 1]; } + [[nodiscard]] XGBOOST_DEVICE bst_idx_t NumCols() const { return this->matrix.NumFeatures(); } + [[nodiscard]] XGBOOST_DEVICE bst_idx_t NumRows() const { return this->matrix.n_rows; } }; template @@ -1031,9 +1032,6 @@ class GPUPredictor : public xgboost::Predictor { if (tree_weights != nullptr) { LOG(FATAL) << "Dart booster feature " << not_implemented; } - if (!p_fmat->PageExists()) { - LOG(FATAL) << "SHAP value for QuantileDMatrix is not yet implemented for GPU."; - } CHECK(!p_fmat->Info().IsColumnSplit()) << "Predict contribution support for column-wise data split is not yet implemented."; dh::safe_cuda(cudaSetDevice(ctx_->Ordinal())); @@ -1047,8 +1045,8 @@ class GPUPredictor : public xgboost::Predictor { // allocate space for (number of features + bias) times the number of rows size_t contributions_columns = model.learner_model_param->num_feature + 1; // +1 for bias - out_contribs->Resize(p_fmat->Info().num_row_ * contributions_columns * - model.learner_model_param->num_output_group); + auto dim_size = contributions_columns * model.learner_model_param->num_output_group; + out_contribs->Resize(p_fmat->Info().num_row_ * dim_size); out_contribs->Fill(0.0f); auto phis = out_contribs->DeviceSpan(); @@ -1058,16 +1056,27 @@ class GPUPredictor : public xgboost::Predictor { d_model.Init(model, 0, tree_end, ctx_->Device()); dh::device_vector categories; ExtractPaths(&device_paths, &d_model, &categories, ctx_->Device()); - for (auto& batch : p_fmat->GetBatches()) { - batch.data.SetDevice(ctx_->Device()); - batch.offset.SetDevice(ctx_->Device()); - SparsePageView X(batch.data.DeviceSpan(), batch.offset.DeviceSpan(), - model.learner_model_param->num_feature); - auto begin = dh::tbegin(phis) + batch.base_rowid * contributions_columns; - gpu_treeshap::GPUTreeShap>( - X, device_paths.begin(), device_paths.end(), ngroup, begin, - dh::tend(phis)); 
+ if (p_fmat->PageExists()) { + for (auto& batch : p_fmat->GetBatches()) { + batch.data.SetDevice(ctx_->Device()); + batch.offset.SetDevice(ctx_->Device()); + SparsePageView X(batch.data.DeviceSpan(), batch.offset.DeviceSpan(), + model.learner_model_param->num_feature); + auto begin = dh::tbegin(phis) + batch.base_rowid * dim_size; + gpu_treeshap::GPUTreeShap>( + X, device_paths.begin(), device_paths.end(), ngroup, begin, dh::tend(phis)); + } + } else { + for (auto& batch : p_fmat->GetBatches(ctx_, {})) { + EllpackDeviceAccessor acc{batch.Impl()->GetDeviceAccessor(ctx_->Device())}; + auto X = EllpackLoader{acc, true, model.learner_model_param->num_feature, batch.Size(), + std::numeric_limits::quiet_NaN()}; + auto begin = dh::tbegin(phis) + batch.BaseRowId() * dim_size; + gpu_treeshap::GPUTreeShap>( + X, device_paths.begin(), device_paths.end(), ngroup, begin, dh::tend(phis)); + } } + // Add the base margin term to last column p_fmat->Info().base_margin_.SetDevice(ctx_->Device()); const auto margin = p_fmat->Info().base_margin_.Data()->ConstDeviceSpan(); @@ -1094,9 +1103,6 @@ class GPUPredictor : public xgboost::Predictor { if (tree_weights != nullptr) { LOG(FATAL) << "Dart booster feature " << not_implemented; } - if (!p_fmat->PageExists()) { - LOG(FATAL) << "SHAP value for QuantileDMatrix is not yet implemented for GPU."; - } dh::safe_cuda(cudaSetDevice(ctx_->Ordinal())); out_contribs->SetDevice(ctx_->Device()); if (tree_end == 0 || tree_end > model.trees.size()) { @@ -1108,9 +1114,9 @@ class GPUPredictor : public xgboost::Predictor { // allocate space for (number of features + bias) times the number of rows size_t contributions_columns = model.learner_model_param->num_feature + 1; // +1 for bias - out_contribs->Resize(p_fmat->Info().num_row_ * contributions_columns * - contributions_columns * - model.learner_model_param->num_output_group); + auto dim_size = + contributions_columns * contributions_columns * model.learner_model_param->num_output_group; + out_contribs->Resize(p_fmat->Info().num_row_ * dim_size); out_contribs->Fill(0.0f); auto phis = out_contribs->DeviceSpan(); @@ -1120,16 +1126,29 @@ class GPUPredictor : public xgboost::Predictor { d_model.Init(model, 0, tree_end, ctx_->Device()); dh::device_vector categories; ExtractPaths(&device_paths, &d_model, &categories, ctx_->Device()); - for (auto& batch : p_fmat->GetBatches()) { - batch.data.SetDevice(ctx_->Device()); - batch.offset.SetDevice(ctx_->Device()); - SparsePageView X(batch.data.DeviceSpan(), batch.offset.DeviceSpan(), - model.learner_model_param->num_feature); - auto begin = dh::tbegin(phis) + batch.base_rowid * contributions_columns; - gpu_treeshap::GPUTreeShapInteractions>( - X, device_paths.begin(), device_paths.end(), ngroup, begin, - dh::tend(phis)); + if (p_fmat->PageExists()) { + for (auto const& batch : p_fmat->GetBatches()) { + batch.data.SetDevice(ctx_->Device()); + batch.offset.SetDevice(ctx_->Device()); + SparsePageView X(batch.data.DeviceSpan(), batch.offset.DeviceSpan(), + model.learner_model_param->num_feature); + auto begin = dh::tbegin(phis) + batch.base_rowid * dim_size; + gpu_treeshap::GPUTreeShapInteractions>( + X, device_paths.begin(), device_paths.end(), ngroup, begin, dh::tend(phis)); + } + } else { + for (auto const& batch : p_fmat->GetBatches(ctx_, {})) { + auto impl = batch.Impl(); + auto acc = + impl->GetDeviceAccessor(ctx_->Device(), p_fmat->Info().feature_types.ConstDeviceSpan()); + auto begin = dh::tbegin(phis) + batch.BaseRowId() * dim_size; + auto X = EllpackLoader{acc, true, 
model.learner_model_param->num_feature, batch.Size(), + std::numeric_limits::quiet_NaN()}; + gpu_treeshap::GPUTreeShapInteractions>( + X, device_paths.begin(), device_paths.end(), ngroup, begin, dh::tend(phis)); + } } + // Add the base margin term to last column p_fmat->Info().base_margin_.SetDevice(ctx_->Device()); const auto margin = p_fmat->Info().base_margin_.Data()->ConstDeviceSpan(); @@ -1180,51 +1199,35 @@ class GPUPredictor : public xgboost::Predictor { bool use_shared = shared_memory_bytes != 0; bst_feature_t num_features = info.num_col_; + auto launch = [&](auto fn, std::uint32_t grid, auto data, bst_idx_t batch_offset) { + dh::LaunchKernel {grid, kBlockThreads, shared_memory_bytes}( + fn, data, d_model.nodes.ConstDeviceSpan(), + predictions->DeviceSpan().subspan(batch_offset), d_model.tree_segments.ConstDeviceSpan(), + + d_model.split_types.ConstDeviceSpan(), d_model.categories_tree_segments.ConstDeviceSpan(), + d_model.categories_node_segments.ConstDeviceSpan(), d_model.categories.ConstDeviceSpan(), + + d_model.tree_beg_, d_model.tree_end_, num_features, num_rows, use_shared, + std::numeric_limits::quiet_NaN()); + }; + if (p_fmat->PageExists()) { + bst_idx_t batch_offset = 0; for (auto const& batch : p_fmat->GetBatches()) { batch.data.SetDevice(ctx_->Device()); batch.offset.SetDevice(ctx_->Device()); - bst_idx_t batch_offset = 0; SparsePageView data{batch.data.DeviceSpan(), batch.offset.DeviceSpan(), model.learner_model_param->num_feature}; - size_t num_rows = batch.Size(); - auto grid = - static_cast(common::DivRoundUp(num_rows, kBlockThreads)); - dh::LaunchKernel {grid, kBlockThreads, shared_memory_bytes} ( - PredictLeafKernel, data, - d_model.nodes.ConstDeviceSpan(), - predictions->DeviceSpan().subspan(batch_offset), - d_model.tree_segments.ConstDeviceSpan(), - - d_model.split_types.ConstDeviceSpan(), - d_model.categories_tree_segments.ConstDeviceSpan(), - d_model.categories_node_segments.ConstDeviceSpan(), - d_model.categories.ConstDeviceSpan(), - - d_model.tree_beg_, d_model.tree_end_, num_features, num_rows, - use_shared, std::numeric_limits::quiet_NaN()); + auto grid = static_cast(common::DivRoundUp(batch.Size(), kBlockThreads)); + launch(PredictLeafKernel, grid, data, batch_offset); batch_offset += batch.Size(); } } else { + bst_idx_t batch_offset = 0; for (auto const& batch : p_fmat->GetBatches(ctx_, BatchParam{})) { - bst_idx_t batch_offset = 0; EllpackDeviceAccessor data{batch.Impl()->GetDeviceAccessor(ctx_->Device())}; - size_t num_rows = batch.Size(); - auto grid = - static_cast(common::DivRoundUp(num_rows, kBlockThreads)); - dh::LaunchKernel {grid, kBlockThreads, shared_memory_bytes} ( - PredictLeafKernel, data, - d_model.nodes.ConstDeviceSpan(), - predictions->DeviceSpan().subspan(batch_offset), - d_model.tree_segments.ConstDeviceSpan(), - - d_model.split_types.ConstDeviceSpan(), - d_model.categories_tree_segments.ConstDeviceSpan(), - d_model.categories_node_segments.ConstDeviceSpan(), - d_model.categories.ConstDeviceSpan(), - - d_model.tree_beg_, d_model.tree_end_, num_features, num_rows, - use_shared, std::numeric_limits::quiet_NaN()); + auto grid = static_cast(common::DivRoundUp(batch.Size(), kBlockThreads)); + launch(PredictLeafKernel, grid, data, batch_offset); batch_offset += batch.Size(); } } diff --git a/tests/cpp/data/test_simple_dmatrix.cc b/tests/cpp/data/test_simple_dmatrix.cc index ea6eedbb2e7b..16448c2e197f 100644 --- a/tests/cpp/data/test_simple_dmatrix.cc +++ b/tests/cpp/data/test_simple_dmatrix.cc @@ -1,5 +1,5 @@ /** - * Copyright 2016-2023 by 
XGBoost Contributors + * Copyright 2016-2024, XGBoost Contributors */ #include @@ -434,12 +434,11 @@ namespace { void VerifyColumnSplit() { size_t constexpr kRows {16}; size_t constexpr kCols {8}; - auto dmat = - RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(false, false, 1, DataSplitMode::kCol); + auto p_fmat = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(false, DataSplitMode::kCol); - ASSERT_EQ(dmat->Info().num_col_, kCols * collective::GetWorldSize()); - ASSERT_EQ(dmat->Info().num_row_, kRows); - ASSERT_EQ(dmat->Info().data_split_mode, DataSplitMode::kCol); + ASSERT_EQ(p_fmat->Info().num_col_, kCols * collective::GetWorldSize()); + ASSERT_EQ(p_fmat->Info().num_row_, kRows); + ASSERT_EQ(p_fmat->Info().data_split_mode, DataSplitMode::kCol); } } // anonymous namespace diff --git a/tests/cpp/gbm/test_gbtree.cc b/tests/cpp/gbm/test_gbtree.cc index 8a5383ad4d34..79e236f11a53 100644 --- a/tests/cpp/gbm/test_gbtree.cc +++ b/tests/cpp/gbm/test_gbtree.cc @@ -1,5 +1,5 @@ /** - * Copyright 2019-2023, XGBoost contributors + * Copyright 2019-2024, XGBoost contributors */ #include #include @@ -463,7 +463,7 @@ INSTANTIATE_TEST_SUITE_P(PredictorTypes, Dart, testing::Values("CPU")); std::pair TestModelSlice(std::string booster) { size_t constexpr kRows = 1000, kCols = 100, kForest = 2, kClasses = 3; - auto m = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(true, false, kClasses); + auto m = RandomDataGenerator{kRows, kCols, 0}.Classes(kClasses).GenerateDMatrix(true); int32_t kIters = 10; std::unique_ptr learner { @@ -592,7 +592,7 @@ TEST(Dart, Slice) { TEST(GBTree, FeatureScore) { size_t n_samples = 1000, n_features = 10, n_classes = 4; - auto m = RandomDataGenerator{n_samples, n_features, 0.5}.GenerateDMatrix(true, false, n_classes); + auto m = RandomDataGenerator{n_samples, n_features, 0.5}.Classes(n_classes).GenerateDMatrix(true); std::unique_ptr learner{ Learner::Create({m}) }; learner->SetParam("num_class", std::to_string(n_classes)); @@ -629,7 +629,7 @@ TEST(GBTree, FeatureScore) { TEST(GBTree, PredictRange) { size_t n_samples = 1000, n_features = 10, n_classes = 4; - auto m = RandomDataGenerator{n_samples, n_features, 0.5}.GenerateDMatrix(true, false, n_classes); + auto m = RandomDataGenerator{n_samples, n_features, 0.5}.Classes(n_classes).GenerateDMatrix(true); std::unique_ptr learner{Learner::Create({m})}; learner->SetParam("num_class", std::to_string(n_classes)); @@ -642,7 +642,7 @@ TEST(GBTree, PredictRange) { ASSERT_THROW(learner->Predict(m, false, &out_predt, 0, 3), dmlc::Error); auto m_1 = - RandomDataGenerator{n_samples, n_features, 0.5}.GenerateDMatrix(true, false, n_classes); + RandomDataGenerator{n_samples, n_features, 0.5}.Classes(n_classes).GenerateDMatrix(true); HostDeviceVector out_predt_full; learner->Predict(m_1, false, &out_predt_full, 0, 0); ASSERT_TRUE(std::equal(out_predt.HostVector().begin(), out_predt.HostVector().end(), diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc index ae5698d2cc6e..3dbf18970be2 100644 --- a/tests/cpp/helpers.cc +++ b/tests/cpp/helpers.cc @@ -376,8 +376,33 @@ void RandomDataGenerator::GenerateCSR( CHECK_EQ(columns->Size(), value->Size()); } +namespace { +void MakeLabels(DeviceOrd device, bst_idx_t n_samples, bst_target_t n_classes, + bst_target_t n_targets, std::shared_ptr out) { + RandomDataGenerator gen{n_samples, n_targets, 0.0f}; + if (n_classes != 0) { + gen.Lower(0).Upper(n_classes).GenerateDense(out->Info().labels.Data()); + out->Info().labels.Reshape(n_samples, n_targets); + auto& h_labels = 
out->Info().labels.Data()->HostVector(); + for (auto& v : h_labels) { + v = static_cast(static_cast(v)); + } + } else { + gen.GenerateDense(out->Info().labels.Data()); + CHECK_EQ(out->Info().labels.Size(), n_samples * n_targets); + out->Info().labels.Reshape(n_samples, n_targets); + } + if (device.IsCUDA()) { + out->Info().labels.Data()->SetDevice(device); + out->Info().labels.Data()->ConstDevicePointer(); + out->Info().feature_types.SetDevice(device); + out->Info().feature_types.ConstDevicePointer(); + } +} +} // namespace + [[nodiscard]] std::shared_ptr RandomDataGenerator::GenerateDMatrix( - bool with_label, bool float_label, size_t classes, DataSplitMode data_split_mode) const { + bool with_label, DataSplitMode data_split_mode) const { HostDeviceVector data; HostDeviceVector rptrs; HostDeviceVector columns; @@ -388,19 +413,7 @@ void RandomDataGenerator::GenerateCSR( DMatrix::Create(&adapter, std::numeric_limits::quiet_NaN(), 1, "", data_split_mode)}; if (with_label) { - RandomDataGenerator gen{rows_, n_targets_, 0.0f}; - if (!float_label) { - gen.Lower(0).Upper(classes).GenerateDense(out->Info().labels.Data()); - out->Info().labels.Reshape(this->rows_, this->n_targets_); - auto& h_labels = out->Info().labels.Data()->HostVector(); - for (auto& v : h_labels) { - v = static_cast(static_cast(v)); - } - } else { - gen.GenerateDense(out->Info().labels.Data()); - CHECK_EQ(out->Info().labels.Size(), this->rows_ * this->n_targets_); - out->Info().labels.Reshape(this->rows_, this->n_targets_); - } + MakeLabels(this->device_, this->rows_, this->n_classes_, this->n_targets_, out); } if (device_.IsCUDA()) { out->Info().labels.SetDevice(device_); @@ -435,34 +448,31 @@ void RandomDataGenerator::GenerateCSR( #endif // defined(XGBOOST_USE_CUDA) } - std::unique_ptr dmat{DMatrix::Create( + std::shared_ptr p_fmat{DMatrix::Create( static_cast(iter.get()), iter->Proxy(), Reset, Next, std::numeric_limits::quiet_NaN(), Context{}.Threads(), prefix, on_host_)}; auto row_page_path = - data::MakeId(prefix, dynamic_cast(dmat.get())) + ".row.page"; + data::MakeId(prefix, dynamic_cast(p_fmat.get())) + ".row.page"; EXPECT_TRUE(FileExists(row_page_path)) << row_page_path; // Loop over the batches and count the number of pages std::size_t batch_count = 0; bst_idx_t row_count = 0; - for (const auto& batch : dmat->GetBatches()) { + for (const auto& batch : p_fmat->GetBatches()) { batch_count++; row_count += batch.Size(); CHECK_NE(batch.data.Size(), 0); } EXPECT_EQ(batch_count, n_batches_); - EXPECT_EQ(dmat->NumBatches(), n_batches_); - EXPECT_EQ(row_count, dmat->Info().num_row_); + EXPECT_EQ(p_fmat->NumBatches(), n_batches_); + EXPECT_EQ(row_count, p_fmat->Info().num_row_); if (with_label) { - RandomDataGenerator{static_cast(dmat->Info().num_row_), this->n_targets_, 0.0f}.GenerateDense( - dmat->Info().labels.Data()); - CHECK_EQ(dmat->Info().labels.Size(), this->rows_ * this->n_targets_); - dmat->Info().labels.Reshape(this->rows_, this->n_targets_); + MakeLabels(this->device_, this->rows_, this->n_classes_, this->n_targets_, p_fmat); } - return dmat; + return p_fmat; } [[nodiscard]] std::shared_ptr RandomDataGenerator::GenerateExtMemQuantileDMatrix( @@ -492,10 +502,7 @@ void RandomDataGenerator::GenerateCSR( } if (with_label) { - RandomDataGenerator{static_cast(p_fmat->Info().num_row_), this->n_targets_, 0.0f} - .GenerateDense(p_fmat->Info().labels.Data()); - CHECK_EQ(p_fmat->Info().labels.Size(), this->rows_ * this->n_targets_); - p_fmat->Info().labels.Reshape(this->rows_, this->n_targets_); + MakeLabels(this->device_, 
this->rows_, this->n_classes_, this->n_targets_, p_fmat); } return p_fmat; } diff --git a/tests/cpp/helpers.h b/tests/cpp/helpers.h index a8d5f370f3a2..8e4e82a91dc0 100644 --- a/tests/cpp/helpers.h +++ b/tests/cpp/helpers.h @@ -229,6 +229,7 @@ class RandomDataGenerator { float upper_{1.0f}; bst_target_t n_targets_{1}; + bst_target_t n_classes_{0}; DeviceOrd device_{DeviceOrd::CPU()}; std::size_t n_batches_{0}; @@ -291,6 +292,10 @@ class RandomDataGenerator { n_targets_ = n_targets; return *this; } + RandomDataGenerator& Classes(bst_target_t n_classes) { + n_classes_ = n_classes; + return *this; + } void GenerateDense(HostDeviceVector* out) const; @@ -315,8 +320,7 @@ class RandomDataGenerator { HostDeviceVector* columns) const; [[nodiscard]] std::shared_ptr GenerateDMatrix( - bool with_label = false, bool float_label = true, size_t classes = 1, - DataSplitMode data_split_mode = DataSplitMode::kRow) const; + bool with_label = false, DataSplitMode data_split_mode = DataSplitMode::kRow) const; [[nodiscard]] std::shared_ptr GenerateSparsePageDMatrix(std::string prefix, bool with_label) const; diff --git a/tests/cpp/objective/test_lambdarank_obj.cc b/tests/cpp/objective/test_lambdarank_obj.cc index 2b34cfa3810a..a9249fc284c4 100644 --- a/tests/cpp/objective/test_lambdarank_obj.cc +++ b/tests/cpp/objective/test_lambdarank_obj.cc @@ -119,7 +119,8 @@ void TestUnbiasedNDCG(Context const* ctx) { obj->Configure(Args{{"lambdarank_pair_method", "topk"}, {"lambdarank_unbiased", "true"}, {"lambdarank_bias_norm", "0"}}); - std::shared_ptr p_fmat{RandomDataGenerator{10, 1, 0.0f}.GenerateDMatrix(true, false, 2)}; + std::shared_ptr p_fmat{ + RandomDataGenerator{10, 1, 0.0f}.Classes(2).GenerateDMatrix(true)}; auto h_label = p_fmat->Info().labels.HostView().Values(); // Move clicked samples to the beginning. 
std::sort(h_label.begin(), h_label.end(), std::greater<>{}); diff --git a/tests/cpp/predictor/test_cpu_predictor.cc b/tests/cpp/predictor/test_cpu_predictor.cc index ee28adb155c9..2a1b43bf730f 100644 --- a/tests/cpp/predictor/test_cpu_predictor.cc +++ b/tests/cpp/predictor/test_cpu_predictor.cc @@ -61,6 +61,12 @@ TEST(CpuPredictor, ExternalMemory) { TestBasic(dmat.get(), &ctx); } +TEST_P(ShapExternalMemoryTest, CPUPredictor) { + Context ctx; + auto [is_qdm, is_interaction] = this->GetParam(); + this->Run(&ctx, is_qdm, is_interaction); +} + TEST(CpuPredictor, InplacePredict) { bst_idx_t constexpr kRows{128}; bst_feature_t constexpr kCols{64}; @@ -110,7 +116,7 @@ void TestUpdatePredictionCache(bool use_subsampling) { } gbm->Configure(args); - auto dmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix(true, true, kClasses); + auto dmat = RandomDataGenerator(kRows, kCols, 0).Classes(kClasses).GenerateDMatrix(true); linalg::Matrix gpair({kRows, kClasses}, ctx.Device()); auto h_gpair = gpair.HostView(); @@ -145,7 +151,7 @@ TEST(CPUPredictor, GHistIndexTraining) { auto adapter = data::ArrayAdapter(columnar.c_str()); std::shared_ptr p_full{ DMatrix::Create(&adapter, std::numeric_limits::quiet_NaN(), 1)}; - TestTrainingPrediction(&ctx, kRows, kBins, p_full, p_hist, true); + TestTrainingPrediction(&ctx, kRows, kBins, p_full, p_hist); } TEST(CPUPredictor, CategoricalPrediction) { diff --git a/tests/cpp/predictor/test_gpu_predictor.cu b/tests/cpp/predictor/test_gpu_predictor.cu index 5e3021fd71e1..366d0ab6ad39 100644 --- a/tests/cpp/predictor/test_gpu_predictor.cu +++ b/tests/cpp/predictor/test_gpu_predictor.cu @@ -1,5 +1,5 @@ /** - * Copyright 2017-2023, XGBoost contributors + * Copyright 2017-2024, XGBoost contributors */ #include #include @@ -17,7 +17,6 @@ #include "test_predictor.h" namespace xgboost::predictor { - TEST(GPUPredictor, Basic) { auto cpu_lparam = MakeCUDACtx(-1); auto gpu_lparam = MakeCUDACtx(0); @@ -269,10 +268,9 @@ TEST(GPUPredictor, Shap) { trees[0]->ExpandNode(0, 0, 0.5, true, 1.0, -1.0, 1.0, 0.0, 5.0, 2.0, 3.0); model.CommitModelGroup(std::move(trees), 0); - auto gpu_lparam = MakeCUDACtx(0); auto cpu_lparam = MakeCUDACtx(-1); - std::unique_ptr gpu_predictor = std::unique_ptr( - Predictor::Create("gpu_predictor", &gpu_lparam)); + std::unique_ptr gpu_predictor = + std::unique_ptr(Predictor::Create("gpu_predictor", &ctx)); std::unique_ptr cpu_predictor = std::unique_ptr( Predictor::Create("cpu_predictor", &cpu_lparam)); gpu_predictor->Configure({}); @@ -289,6 +287,12 @@ TEST(GPUPredictor, Shap) { } } +TEST_P(ShapExternalMemoryTest, GPUPredictor) { + auto ctx = MakeCUDACtx(0); + auto [is_qdm, is_interaction] = this->GetParam(); + this->Run(&ctx, is_qdm, is_interaction); +} + TEST(GPUPredictor, IterationRange) { auto ctx = MakeCUDACtx(0); TestIterationRange(&ctx); diff --git a/tests/cpp/predictor/test_predictor.cc b/tests/cpp/predictor/test_predictor.cc index b79b75012267..1af873f58697 100644 --- a/tests/cpp/predictor/test_predictor.cc +++ b/tests/cpp/predictor/test_predictor.cc @@ -4,15 +4,16 @@ #include "test_predictor.h" #include -#include // for Context -#include // for DMatrix, BatchIterator, BatchSet, MetaInfo -#include // for HostDeviceVector -#include // for PredictionCacheEntry, Predictor, Predic... -#include // for StringView +#include // for Context +#include // for DMatrix, BatchIterator, BatchSet, MetaInfo +#include // for HostDeviceVector +#include // for Json +#include // for PredictionCacheEntry, Predictor, Predic... 
+#include // for StringView -#include // for numeric_limits -#include // for shared_ptr -#include // for unordered_map +#include // for numeric_limits +#include // for shared_ptr +#include // for unordered_map #include "../../../src/common/bitfield.h" // for LBitField32 #include "../../../src/data/iterative_dmatrix.h" // for IterativeDMatrix @@ -26,7 +27,6 @@ #include "xgboost/tree_model.h" // for RegTree namespace xgboost { - void TestBasic(DMatrix* dmat, Context const *ctx) { auto predictor = std::unique_ptr(CreatePredictorForTest(ctx)); @@ -118,8 +118,7 @@ TEST(Predictor, PredictionCache) { } void TestTrainingPrediction(Context const *ctx, size_t rows, size_t bins, - std::shared_ptr p_full, std::shared_ptr p_hist, - bool check_contribs) { + std::shared_ptr p_full, std::shared_ptr p_hist) { size_t constexpr kCols = 16; size_t constexpr kClasses = 3; size_t constexpr kIters = 3; @@ -163,34 +162,32 @@ void TestTrainingPrediction(Context const *ctx, size_t rows, size_t bins, EXPECT_NEAR(from_hist.ConstHostVector()[i], from_full.ConstHostVector()[i], kRtEps); } - if (check_contribs) { - // Contributions - HostDeviceVector from_full_contribs; - learner->Predict(p_full, false, &from_full_contribs, 0, 0, false, false, true); - HostDeviceVector from_hist_contribs; - learner->Predict(p_hist, false, &from_hist_contribs, 0, 0, false, false, true); - for (size_t i = 0; i < from_full_contribs.ConstHostVector().size(); ++i) { - EXPECT_NEAR(from_hist_contribs.ConstHostVector()[i], - from_full_contribs.ConstHostVector()[i], kRtEps); - } + // Contributions + HostDeviceVector from_full_contribs; + learner->Predict(p_full, false, &from_full_contribs, 0, 0, false, false, true); + HostDeviceVector from_hist_contribs; + learner->Predict(p_hist, false, &from_hist_contribs, 0, 0, false, false, true); + for (size_t i = 0; i < from_full_contribs.ConstHostVector().size(); ++i) { + EXPECT_NEAR(from_hist_contribs.ConstHostVector()[i], from_full_contribs.ConstHostVector()[i], + kRtEps); + } - // Contributions (approximate method) - HostDeviceVector from_full_approx_contribs; - learner->Predict(p_full, false, &from_full_approx_contribs, 0, 0, false, false, false, true); - HostDeviceVector from_hist_approx_contribs; - learner->Predict(p_hist, false, &from_hist_approx_contribs, 0, 0, false, false, false, true); - for (size_t i = 0; i < from_full_approx_contribs.ConstHostVector().size(); ++i) { - EXPECT_NEAR(from_hist_approx_contribs.ConstHostVector()[i], - from_full_approx_contribs.ConstHostVector()[i], kRtEps); - } + // Contributions (approximate method) + HostDeviceVector from_full_approx_contribs; + learner->Predict(p_full, false, &from_full_approx_contribs, 0, 0, false, false, false, true); + HostDeviceVector from_hist_approx_contribs; + learner->Predict(p_hist, false, &from_hist_approx_contribs, 0, 0, false, false, false, true); + for (size_t i = 0; i < from_full_approx_contribs.ConstHostVector().size(); ++i) { + EXPECT_NEAR(from_hist_approx_contribs.ConstHostVector()[i], + from_full_approx_contribs.ConstHostVector()[i], kRtEps); } } void TestInplacePrediction(Context const *ctx, std::shared_ptr x, bst_idx_t rows, bst_feature_t cols) { std::size_t constexpr kClasses { 4 }; - auto gen = RandomDataGenerator{rows, cols, 0.5}.Device(ctx->Device()); - std::shared_ptr m = gen.GenerateDMatrix(true, false, kClasses); + auto gen = RandomDataGenerator{rows, cols, 0.5}.Device(ctx->Device()).Classes(kClasses); + std::shared_ptr m = gen.GenerateDMatrix(true); std::unique_ptr learner { Learner::Create({m}) @@ -444,7 +441,8 @@ 
void TestIterationRange(Context const* ctx) { size_t constexpr kRows = 1000, kCols = 20, kClasses = 4, kForest = 3, kIters = 10; auto dmat = RandomDataGenerator(kRows, kCols, 0) .Device(ctx->Device()) - .GenerateDMatrix(true, true, kClasses); + .Classes(kClasses) + .GenerateDMatrix(true); auto learner = LearnerForTest(ctx, dmat, kIters, kForest); bool bound = false; @@ -515,7 +513,7 @@ void VerifyIterationRangeColumnSplit(bool use_gpu, Json const &ranged_model, ctx.UpdateAllowUnknown( Args{{"nthread", std::to_string(n_threads)}, {"device", ctx.DeviceName()}}); - auto dmat = RandomDataGenerator(rows, cols, 0).GenerateDMatrix(true, true, classes); + auto dmat = RandomDataGenerator(rows, cols, 0).Classes(classes).GenerateDMatrix(true); std::shared_ptr Xy{dmat->SliceCol(world_size, rank)}; std::unique_ptr learner{Learner::Create({Xy})}; @@ -566,7 +564,7 @@ void VerifyIterationRangeColumnSplit(bool use_gpu, Json const &ranged_model, void TestIterationRangeColumnSplit(int world_size, bool use_gpu) { std::size_t constexpr kRows = 1000, kCols = 20, kClasses = 4, kForest = 3, kIters = 10; - auto dmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix(true, true, kClasses); + auto dmat = RandomDataGenerator(kRows, kCols, 0).Classes(kClasses).GenerateDMatrix(true); Context ctx; if (use_gpu) { ctx = MakeCUDACtx(0); @@ -835,4 +833,69 @@ void TestVectorLeafPrediction(Context const *ctx) { data.HostVector().assign(data.Size(), model.trees.front()->SplitCond(RegTree::kRoot) - 1.0); run_test(1.5, &data); } + +void ShapExternalMemoryTest::Run(Context const *ctx, bool is_qdm, bool is_interaction) { + bst_idx_t n_samples{2048}; + bst_feature_t n_features{16}; + bst_target_t n_classes{3}; + bst_bin_t max_bin{64}; + auto create_pfmat = [&](RandomDataGenerator &rng) { + if (is_qdm) { + return rng.Bins(max_bin).GenerateExtMemQuantileDMatrix("temp", true); + } + return rng.GenerateSparsePageDMatrix("temp", true); + }; + auto p_fmat = create_pfmat(RandomDataGenerator(n_samples, n_features, 0) + .Batches(1) + .Device(ctx->Device()) + .Classes(n_classes)); + std::unique_ptr learner{Learner::Create({p_fmat})}; + learner->SetParam("device", ctx->DeviceName()); + learner->SetParam("base_score", "0.5"); + learner->SetParam("num_parallel_tree", "3"); + learner->SetParam("max_bin", std::to_string(max_bin)); + for (std::int32_t i = 0; i < 4; ++i) { + learner->UpdateOneIter(i, p_fmat); + } + Json model{Object{}}; + learner->SaveModel(&model); + auto j_booster = model["learner"]["gradient_booster"]["model"]; + auto model_param = MakeMP(n_features, 0.0, n_classes, ctx->Device()); + + gbm::GBTreeModel gbtree{&model_param, ctx}; + gbtree.LoadModel(j_booster); + + std::unique_ptr predictor{ + Predictor::Create(ctx->IsCPU() ? 
"cpu_predictor" : "gpu_predictor", ctx)}; + predictor->Configure({}); + HostDeviceVector contrib; + if (is_interaction) { + predictor->PredictInteractionContributions(p_fmat.get(), &contrib, gbtree); + } else { + predictor->PredictContribution(p_fmat.get(), &contrib, gbtree); + } + + auto p_fmat_ext = create_pfmat(RandomDataGenerator(n_samples, n_features, 0) + .Batches(4) + .Device(ctx->Device()) + .Classes(n_classes)); + + HostDeviceVector contrib_ext; + if (is_interaction) { + predictor->PredictInteractionContributions(p_fmat_ext.get(), &contrib_ext, gbtree); + } else { + predictor->PredictContribution(p_fmat_ext.get(), &contrib_ext, gbtree); + } + + ASSERT_EQ(contrib_ext.Size(), contrib.Size()); + + auto h_contrib = contrib.ConstHostSpan(); + auto h_contrib_ext = contrib_ext.ConstHostSpan(); + for (std::size_t i = 0; i < h_contrib.size(); ++i) { + ASSERT_EQ(h_contrib[i], h_contrib_ext[i]); + } +} + +INSTANTIATE_TEST_SUITE_P(Predictor, ShapExternalMemoryTest, + ::testing::Combine(::testing::Bool(), ::testing::Bool())); } // namespace xgboost diff --git a/tests/cpp/predictor/test_predictor.h b/tests/cpp/predictor/test_predictor.h index 1ccd35102b2d..8f110efe06e8 100644 --- a/tests/cpp/predictor/test_predictor.h +++ b/tests/cpp/predictor/test_predictor.h @@ -89,8 +89,7 @@ void TestBasic(DMatrix* dmat, Context const * ctx); // p_full and p_hist should come from the same data set. void TestTrainingPrediction(Context const* ctx, size_t rows, size_t bins, - std::shared_ptr p_full, std::shared_ptr p_hist, - bool check_contribs = false); + std::shared_ptr p_full, std::shared_ptr p_hist); void TestInplacePrediction(Context const* ctx, std::shared_ptr x, bst_idx_t rows, bst_feature_t cols); @@ -114,6 +113,11 @@ void TestSparsePrediction(Context const* ctx, float sparsity); void TestSparsePredictionColumnSplit(int world_size, bool use_gpu, float sparsity); void TestVectorLeafPrediction(Context const* ctx); + +class ShapExternalMemoryTest : public ::testing::TestWithParam> { + public: + void Run(Context const* ctx, bool is_qdm, bool is_interaction); +}; } // namespace xgboost #endif // XGBOOST_TEST_PREDICTOR_H_ diff --git a/tests/cpp/test_learner.cc b/tests/cpp/test_learner.cc index a6f3eacecbc5..d53a568d4ed5 100644 --- a/tests/cpp/test_learner.cc +++ b/tests/cpp/test_learner.cc @@ -209,7 +209,7 @@ TEST(Learner, ConfigIO) { bst_idx_t n_samples = 128; bst_feature_t n_features = 12; std::shared_ptr p_fmat{ - RandomDataGenerator{n_samples, n_features, 0}.GenerateDMatrix(true, false, 2)}; + RandomDataGenerator{n_samples, n_features, 0}.Classes(2).GenerateDMatrix(true)}; auto serialised_model_tmp = std::string{}; std::string eval_res_0; diff --git a/tests/python-gpu/test_gpu_prediction.py b/tests/python-gpu/test_gpu_prediction.py index 98b60aecfe18..b3ccf4ae5e98 100644 --- a/tests/python-gpu/test_gpu_prediction.py +++ b/tests/python-gpu/test_gpu_prediction.py @@ -343,32 +343,45 @@ def predict_df(x): strategies.integers(1, 10), tm.make_dataset_strategy(), shap_parameter_strategy ) @settings(deadline=None, max_examples=20, print_blob=True) - def test_shap(self, num_rounds, dataset, param): + def test_shap(self, num_rounds: int, dataset: tm.TestDataset, param: dict) -> None: if dataset.name.endswith("-l1"): # not supported by the exact tree method return param.update({"tree_method": "hist", "device": "gpu:0"}) param = dataset.set_params(param) dmat = dataset.get_dmat() bst = xgb.train(param, dmat, num_rounds) - test_dmat = xgb.DMatrix(dataset.X, dataset.y, dataset.w, dataset.margin) + test_dmat = 
xgb.DMatrix( + dataset.X, dataset.y, weight=dataset.w, base_margin=dataset.margin + ) bst.set_param({"device": "gpu:0"}) shap = bst.predict(test_dmat, pred_contribs=True) margin = bst.predict(test_dmat, output_margin=True) assume(len(dataset.y) > 0) assert np.allclose(np.sum(shap, axis=len(shap.shape) - 1), margin, 1e-3, 1e-3) + dmat = dataset.get_external_dmat() + shap = bst.predict(dmat, pred_contribs=True) + margin = bst.predict(dmat, output_margin=True) + assume(len(dataset.y) > 0) + assert np.allclose(np.sum(shap, axis=len(shap.shape) - 1), margin, 1e-3, 1e-3) + @given( strategies.integers(1, 10), tm.make_dataset_strategy(), shap_parameter_strategy ) @settings(deadline=None, max_examples=10, print_blob=True) - def test_shap_interactions(self, num_rounds, dataset, param): + def test_shap_interactions( + self, num_rounds: int, dataset: tm.TestDataset, param: dict + ) -> None: if dataset.name.endswith("-l1"): # not supported by the exact tree method return param.update({"tree_method": "hist", "device": "cuda:0"}) param = dataset.set_params(param) dmat = dataset.get_dmat() bst = xgb.train(param, dmat, num_rounds) - test_dmat = xgb.DMatrix(dataset.X, dataset.y, dataset.w, dataset.margin) + + test_dmat = xgb.DMatrix( + dataset.X, dataset.y, weight=dataset.w, base_margin=dataset.margin + ) bst.set_param({"device": "cuda:0"}) shap = bst.predict(test_dmat, pred_interactions=True) margin = bst.predict(test_dmat, output_margin=True) @@ -380,6 +393,17 @@ def test_shap_interactions(self, num_rounds, dataset, param): 1e-3, ) + test_dmat = dataset.get_external_dmat() + shap = bst.predict(test_dmat, pred_interactions=True) + margin = bst.predict(test_dmat, output_margin=True) + assume(len(dataset.y) > 0) + assert np.allclose( + np.sum(shap, axis=(len(shap.shape) - 1, len(shap.shape) - 2)), + margin, + 1e-3, + 1e-3, + ) + def test_shap_categorical(self): X, y = tm.make_categorical(100, 20, 7, False) Xy = xgb.DMatrix(X, y, enable_categorical=True)