From d33043a3489f077f6d732b8ca287d38e38700d8c Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Tue, 2 Jul 2024 15:34:38 +0800
Subject: [PATCH 01/51] [coll] Allow using local host for testing. (#10526)

- Don't try to retrieve the IP address if a host is specified.
- Fix compiler deprecation warning.
---
 src/collective/tracker.cc          | 10 ++++++----
 src/common/cuda_pinned_allocator.h |  2 ++
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/collective/tracker.cc b/src/collective/tracker.cc
index 56ec5d546b72..6cb3601db7f4 100644
--- a/src/collective/tracker.cc
+++ b/src/collective/tracker.cc
@@ -111,12 +111,14 @@ RabitTracker::WorkerProxy::WorkerProxy(std::int32_t world, TCPSocket sock, SockA
 }
 
 RabitTracker::RabitTracker(Json const& config) : Tracker{config} {
-  std::string self;
   auto rc = Success() << [&] {
+    host_.clear();
+    host_ = OptionalArg<String>(config, "host", std::string{});
+    if (host_.empty()) {
+      return collective::GetHostAddress(&host_);
+    }
+    return Success();
-    return collective::GetHostAddress(&self);
   } << [&] {
-    host_ = OptionalArg<String>(config, "host", self);
-
     auto addr = MakeSockAddress(xgboost::StringView{host_}, 0);
     listener_ = TCPSocket::Create(addr.IsV4() ? SockDomain::kV4 : SockDomain::kV6);
     return listener_.Bind(host_, &this->port_);
diff --git a/src/common/cuda_pinned_allocator.h b/src/common/cuda_pinned_allocator.h
index d11851d99d37..6fe1757fd369 100644
--- a/src/common/cuda_pinned_allocator.h
+++ b/src/common/cuda_pinned_allocator.h
@@ -61,6 +61,8 @@ class pinned_allocator {
 
   XGBOOST_DEVICE inline ~pinned_allocator() {}  // NOLINT: host/device markup ignored on defaulted functions
   XGBOOST_DEVICE inline pinned_allocator(pinned_allocator const&) {}  // NOLINT: host/device markup ignored on defaulted functions
+  pinned_allocator& operator=(pinned_allocator const& that) = default;
+  pinned_allocator& operator=(pinned_allocator&& that) = default;
 
   template <typename U>
   XGBOOST_DEVICE inline pinned_allocator(pinned_allocator<U> const&) {}  // NOLINT

From e537b0969f2d51b5966f380cf908f2a874c76d5c Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Tue, 2 Jul 2024 17:02:54 +0800
Subject: [PATCH 02/51] Fix boolean array for arrow-backed DF. (#10527)

---
 python-package/xgboost/data.py         | 32 ++-----------------------
 python-package/xgboost/testing/data.py | 33 +++++++++++++++-----------
 2 files changed, 21 insertions(+), 44 deletions(-)

diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py
index 7e0ae793ba6e..bd196e2e59f9 100644
--- a/python-package/xgboost/data.py
+++ b/python-package/xgboost/data.py
@@ -458,7 +458,7 @@ def pandas_pa_type(ser: Any) -> np.ndarray:
     # combine_chunks takes the most significant amount of time
     chunk: pa.Array = aa.combine_chunks()
     # When there's null value, we have to use copy
-    zero_copy = chunk.null_count == 0
+    zero_copy = chunk.null_count == 0 and not pa.types.is_boolean(chunk.type)
     # Alternately, we can use chunk.buffers(), which returns a list of buffers and
     # we need to concatenate them ourselves.
    # FIXME(jiamingy): Is there a better way to access the arrow buffer along with
@@ -825,37 +825,9 @@ def _arrow_transform(data: DataType) -> Any:
 
     data = cast(pa.Table, data)
 
-    def type_mapper(dtype: pa.DataType) -> Optional[str]:
-        """Maps pyarrow type to pandas arrow extension type."""
-        if pa.types.is_int8(dtype):
-            return pd.ArrowDtype(pa.int8())
-        if pa.types.is_int16(dtype):
-            return pd.ArrowDtype(pa.int16())
-        if pa.types.is_int32(dtype):
-            return pd.ArrowDtype(pa.int32())
-        if pa.types.is_int64(dtype):
-            return pd.ArrowDtype(pa.int64())
-        if pa.types.is_uint8(dtype):
-            return pd.ArrowDtype(pa.uint8())
-        if pa.types.is_uint16(dtype):
-            return pd.ArrowDtype(pa.uint16())
-        if pa.types.is_uint32(dtype):
-            return pd.ArrowDtype(pa.uint32())
-        if pa.types.is_uint64(dtype):
-            return pd.ArrowDtype(pa.uint64())
-        if pa.types.is_float16(dtype):
-            return pd.ArrowDtype(pa.float16())
-        if pa.types.is_float32(dtype):
-            return pd.ArrowDtype(pa.float32())
-        if pa.types.is_float64(dtype):
-            return pd.ArrowDtype(pa.float64())
-        if pa.types.is_boolean(dtype):
-            return pd.ArrowDtype(pa.bool_())
-        return None
-
     # For common cases, this is zero-copy, can check with:
     # pa.total_allocated_bytes()
-    df = data.to_pandas(types_mapper=type_mapper)
+    df = data.to_pandas(types_mapper=pd.ArrowDtype)
 
     return df
 
diff --git a/python-package/xgboost/testing/data.py b/python-package/xgboost/testing/data.py
index 0c4f290086d1..f4e97e59d363 100644
--- a/python-package/xgboost/testing/data.py
+++ b/python-package/xgboost/testing/data.py
@@ -164,10 +164,6 @@ def pd_arrow_dtypes() -> Generator:
     # Integer
     dtypes = pandas_pyarrow_mapper
 
-    Null: Union[float, None, Any] = np.nan
-    orig = pd.DataFrame(
-        {"f0": [1, 2, Null, 3], "f1": [4, 3, Null, 1]}, dtype=np.float32
-    )
     # Create a dictionary-backed dataframe, enable this when the roundtrip is
     # implemented in pandas/pyarrow
     #
@@ -190,24 +186,33 @@ def pd_arrow_dtypes() -> Generator:
     #     pd_catcodes = pd_cat_df["f1"].cat.codes
     #     assert pd_catcodes.equals(pa_catcodes)
 
-    for Null in (None, pd.NA):
+    for Null in (None, pd.NA, 0):
         for dtype in dtypes:
             if dtype.startswith("float16") or dtype.startswith("bool"):
                 continue
 
+            # Use np.nan as a baseline
+            orig_null = Null if not pd.isna(Null) and Null == 0 else np.nan
+            orig = pd.DataFrame(
+                {"f0": [1, 2, orig_null, 3], "f1": [4, 3, orig_null, 1]},
+                dtype=np.float32,
+            )
+
             df = pd.DataFrame(
                 {"f0": [1, 2, Null, 3], "f1": [4, 3, Null, 1]}, dtype=dtype
             )
             yield orig, df
 
-    orig = pd.DataFrame(
-        {"f0": [True, False, pd.NA, True], "f1": [False, True, pd.NA, True]},
-        dtype=pd.BooleanDtype(),
-    )
-    df = pd.DataFrame(
-        {"f0": [True, False, pd.NA, True], "f1": [False, True, pd.NA, True]},
-        dtype=pd.ArrowDtype(pa.bool_()),
-    )
-    yield orig, df
+    # If Null is `False`, then there's no missing value.
+    for Null in (pd.NA, False):
+        orig = pd.DataFrame(
+            {"f0": [True, False, Null, True], "f1": [False, True, Null, True]},
+            dtype=pd.BooleanDtype(),
+        )
+        df = pd.DataFrame(
+            {"f0": [True, False, Null, True], "f1": [False, True, Null, True]},
+            dtype=pd.ArrowDtype(pa.bool_()),
+        )
+        yield orig, df
 
 
 def check_inf(rng: RNG) -> None:

From 9cb4c938da8a1e1bcb3794344838687302c21a4e Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Wed, 3 Jul 2024 03:48:18 +0800
Subject: [PATCH 03/51] [EM] Move prefetch in reset into the end of the iteration.
 (#10529)
---
 src/data/gradient_index_page_source.cc     |  3 +
 src/data/sparse_page_source.h              | 78 ++++++++++++++--------
 tests/cpp/data/test_sparse_page_dmatrix.cc | 29 ++++++--
 tests/cpp/data/test_sparse_page_dmatrix.cu | 71 +++++++++++++++-----
 4 files changed, 133 insertions(+), 48 deletions(-)

diff --git a/src/data/gradient_index_page_source.cc b/src/data/gradient_index_page_source.cc
index f1ceb282a109..0fee1c9fb4b0 100644
--- a/src/data/gradient_index_page_source.cc
+++ b/src/data/gradient_index_page_source.cc
@@ -9,6 +9,9 @@ void GradientIndexPageSource::Fetch() {
   if (count_ != 0 && !sync_) {
     // source is initialized to be the 0th page during construction, so when count_ is 0
     // there's no need to increment the source.
+    //
+    // The mixin doesn't sync the source if `sync_` is false, we need to sync it
+    // ourselves.
     ++(*source_);
   }
   // This is not read from cache so we still need it to be synced with sparse page source.
diff --git a/src/data/sparse_page_source.h b/src/data/sparse_page_source.h
index 89aa86ace614..18a149059c11 100644
--- a/src/data/sparse_page_source.h
+++ b/src/data/sparse_page_source.h
@@ -42,7 +42,7 @@ struct Cache {
   std::string name;
   std::string format;
   // offset into binary cache file.
-  std::vector<std::uint64_t> offset;
+  std::vector<bst_idx_t> offset;
 
   Cache(bool w, std::string n, std::string fmt, bool on_host)
       : written{w}, on_host{on_host}, name{std::move(n)}, format{std::move(fmt)} {
@@ -61,7 +61,7 @@ struct Cache {
   /**
   * @brief Record a page with size of n_bytes.
   */
-  void Push(std::size_t n_bytes) { offset.push_back(n_bytes); }
+  void Push(bst_idx_t n_bytes) { offset.push_back(n_bytes); }
  /**
   * @brief Returns the view start and length for the i^th page.
   */
@@ -73,7 +73,7 @@ struct Cache {
  /**
   * @brief Get the number of bytes for the i^th page.
   */
-  [[nodiscard]] std::uint64_t Bytes(std::size_t i) const { return offset.at(i + 1) - offset[i]; }
+  [[nodiscard]] bst_idx_t Bytes(std::size_t i) const { return offset.at(i + 1) - offset[i]; }
  /**
   * @brief Call this once the write for the cache is complete.
   */
@@ -218,7 +218,6 @@ class SparsePageSourceImpl : public BatchIteratorImpl, public FormatStreamPol
   common::Monitor monitor_;
 
   [[nodiscard]] bool ReadCache() {
-    CHECK(!at_end_);
     if (!cache_info_->written) {
       return false;
     }
@@ -259,11 +258,13 @@ class SparsePageSourceImpl : public BatchIteratorImpl, public FormatStreamPol
         return page;
       });
     }
+
     CHECK_EQ(std::count_if(ring_->cbegin(), ring_->cend(), [](auto const& f) { return f.valid(); }),
             n_prefetch_batches)
        << "Sparse DMatrix assumes forward iteration.";
 
     monitor_.Start("Wait");
+    CHECK((*ring_)[count_].valid());
    page_ = (*ring_)[count_].get();
     CHECK(!(*ring_)[count_].valid());
     monitor_.Stop("Wait");
@@ -331,12 +332,28 @@ class SparsePageSourceImpl : public BatchIteratorImpl, public FormatStreamPol
     return at_end_;
   }
 
+  // Call this at the last iteration.
+  void EndIter() {
+    CHECK_EQ(this->cache_info_->offset.size(), this->n_batches_ + 1);
+    this->cache_info_->Commit();
+    if (this->n_batches_ != 0) {
+      CHECK_EQ(this->count_, this->n_batches_);
+    }
+    CHECK_GE(this->count_, 1);
+    this->count_ = 0;
+  }
+
   virtual void Reset() {
     TryLockGuard guard{single_threaded_};
-    at_end_ = false;
-    count_ = 0;
-    // Pre-fetch for the next round of iterations.
-    this->Fetch();
+
+    this->at_end_ = false;
+    auto cnt = this->count_;
+    this->count_ = 0;
+    if (cnt != 0) {
+      // The last iteration did not get to the end, clear the ring to start from 0.
+      this->ring_ = std::make_unique<Ring>();
+      this->Fetch();
+    }
   }
 };
@@ -404,16 +421,11 @@ class SparsePageSource : public SparsePageSourceImpl {
     CHECK_LE(count_, n_batches_);
 
     if (at_end_) {
-      CHECK_EQ(cache_info_->offset.size(), n_batches_ + 1);
-      cache_info_->Commit();
-      if (n_batches_ != 0) {
-        CHECK_EQ(count_, n_batches_);
-      }
-      CHECK_GE(count_, 1);
-      proxy_ = nullptr;
-    } else {
-      this->Fetch();
+      this->EndIter();
+      this->proxy_ = nullptr;
     }
+
+    this->Fetch();
     return *this;
   }
 
@@ -446,36 +458,46 @@ class PageSourceIncMixIn : public SparsePageSourceImpl {
   PageSourceIncMixIn(float missing, std::int32_t nthreads, bst_feature_t n_features,
                      bst_idx_t n_batches, std::shared_ptr<Cache> cache, bool sync)
       : Super::SparsePageSourceImpl{missing, nthreads, n_features, n_batches, cache}, sync_{sync} {}
-
+  // This function always operates on the source first, then the downstream. The downstream
+  // can assume the source to be ready.
   [[nodiscard]] PageSourceIncMixIn& operator++() final {
     TryLockGuard guard{this->single_threaded_};
+    // Increment the source.
     if (sync_) {
       ++(*source_);
     }
-
+    // Increment self.
     ++this->count_;
+    // Set at end.
     this->at_end_ = this->count_ == this->n_batches_;
 
     if (this->at_end_) {
-      this->cache_info_->Commit();
-      if (this->n_batches_ != 0) {
-        CHECK_EQ(this->count_, this->n_batches_);
+      // If this is the first round of iterations, we have just built the binary cache
+      // from source. For a non-sync page type, the source hasn't been updated to the end
+      // iteration yet due to skipped increment. We increment the source here and it will
+      // call the `EndIter` method itself.
+      bool src_need_inc = !sync_ && this->source_->Iter() != 0;
+      if (src_need_inc) {
+        CHECK_EQ(this->source_->Iter(), this->count_ - 1);
+        ++(*source_);
+      }
+      this->EndIter();
+
+      if (src_need_inc) {
+        CHECK(this->cache_info_->written);
       }
-      CHECK_GE(this->count_, 1);
-    } else {
-      this->Fetch();
     }
+    this->Fetch();
 
     if (sync_) {
+      // Sanity check.
      CHECK_EQ(source_->Iter(), this->count_);
     }
     return *this;
   }
 
   void Reset() final {
-    if (sync_) {
-      this->source_->Reset();
-    }
+    this->source_->Reset();
 
     Super::Reset();
   }
 };
diff --git a/tests/cpp/data/test_sparse_page_dmatrix.cc b/tests/cpp/data/test_sparse_page_dmatrix.cc
index 33308be19385..3aeb42abce2b 100644
--- a/tests/cpp/data/test_sparse_page_dmatrix.cc
+++ b/tests/cpp/data/test_sparse_page_dmatrix.cc
@@ -118,7 +118,8 @@ TEST(SparsePageDMatrix, RetainSparsePage) {
 
 // Test GHistIndexMatrix can avoid loading sparse page after the initialization.
 TEST(SparsePageDMatrix, GHistIndexSkipSparsePage) {
   dmlc::TemporaryDirectory tmpdir;
-  auto Xy = RandomDataGenerator{180, 12, 0.0}.Batches(6).GenerateSparsePageDMatrix(
+  std::size_t n_batches = 6;
+  auto Xy = RandomDataGenerator{180, 12, 0.0}.Batches(n_batches).GenerateSparsePageDMatrix(
       tmpdir.path + "/", true);
   Context ctx;
   bst_bin_t n_bins{256};
@@ -171,12 +172,30 @@ TEST(SparsePageDMatrix, GHistIndexSkipSparsePage) {
     // Restore the batch parameter by passing it in again through check_ghist
     check_ghist();
   }
+
   // half the pages
-  auto it = Xy->GetBatches<SparsePage>(&ctx).begin();
-  for (std::int32_t i = 0; i < 3; ++i) {
-    ++it;
+  {
+    auto it = Xy->GetBatches<SparsePage>(&ctx).begin();
+    for (std::size_t i = 0; i < n_batches / 2; ++i) {
+      ++it;
+    }
+    check_ghist();
+  }
+  {
+    auto it = Xy->GetBatches<GHistIndexMatrix>(&ctx, batch_param).begin();
+    for (std::size_t i = 0; i < n_batches / 2; ++i) {
+      ++it;
+    }
+    check_ghist();
+  }
+  {
+    BatchParam regen{n_bins, common::Span<float>{hess.data(), hess.size()}, true};
+    auto it = Xy->GetBatches<GHistIndexMatrix>(&ctx, regen).begin();
+    for (std::size_t i = 0; i < n_batches / 2; ++i) {
+      ++it;
+    }
+    check_ghist();
   }
-  check_ghist();
 }
 
 TEST(SparsePageDMatrix, MetaInfo) {
diff --git a/tests/cpp/data/test_sparse_page_dmatrix.cu b/tests/cpp/data/test_sparse_page_dmatrix.cu
index 7200b96a919c..327f2ba635fd 100644
--- a/tests/cpp/data/test_sparse_page_dmatrix.cu
+++ b/tests/cpp/data/test_sparse_page_dmatrix.cu
@@ -41,31 +41,77 @@ TEST(SparsePageDMatrix, EllpackPage) {
 TEST(SparsePageDMatrix, EllpackSkipSparsePage) {
   // Test Ellpack can avoid loading sparse page after the initialization.
   dmlc::TemporaryDirectory tmpdir;
-  auto Xy = RandomDataGenerator{180, 12, 0.0}.Batches(6).GenerateSparsePageDMatrix(
+  std::size_t n_batches = 6;
+  auto Xy = RandomDataGenerator{180, 12, 0.0}.Batches(n_batches).GenerateSparsePageDMatrix(
       tmpdir.path + "/", true);
   auto ctx = MakeCUDACtx(0);
+  auto cpu = ctx.MakeCPU();
   bst_bin_t n_bins{256};
   double sparse_thresh{0.8};
   BatchParam batch_param{n_bins, sparse_thresh};
 
-  std::int32_t k = 0;
-  for (auto const& page : Xy->GetBatches<EllpackPage>(&ctx, batch_param)) {
-    auto impl = page.Impl();
-    ASSERT_EQ(page.Size(), 30);
-    ASSERT_EQ(k, impl->base_rowid);
-    k += page.Size();
-  }
+  auto check_ellpack = [&]() {
+    std::int32_t k = 0;
+    for (auto const& page : Xy->GetBatches<EllpackPage>(&ctx, batch_param)) {
+      auto impl = page.Impl();
+      ASSERT_EQ(page.Size(), 30);
+      ASSERT_EQ(k, impl->base_rowid);
+      k += page.Size();
+    }
+  };
 
   auto casted = std::dynamic_pointer_cast<SparsePageDMatrix>(Xy);
   CHECK(casted);
+  check_ellpack();
 
+  // Make sure the number of fetches doesn't change (no new fetch)
   auto n_fetches = casted->SparsePageFetchCount();
-  for (std::int32_t i = 0; i < 3; ++i) {
+  for (std::size_t i = 0; i < 3; ++i) {
     for ([[maybe_unused]] auto const& page : Xy->GetBatches<EllpackPage>(&ctx, batch_param)) {
     }
     auto casted = std::dynamic_pointer_cast<SparsePageDMatrix>(Xy);
     ASSERT_EQ(casted->SparsePageFetchCount(), n_fetches);
   }
+  check_ellpack();
+
+  dh::device_vector<float> hess(Xy->Info().num_row_, 1.0f);
+  for (std::size_t i = 0; i < 4; ++i) {
+    for ([[maybe_unused]] auto const& page : Xy->GetBatches<SparsePage>(&ctx)) {
+    }
+    for ([[maybe_unused]] auto const& page : Xy->GetBatches<SortedCSCPage>(&cpu)) {
+    }
+    for ([[maybe_unused]] auto const& page : Xy->GetBatches<EllpackPage>(&ctx, batch_param)) {
+    }
+    // Approx tree method pages
+    {
+      BatchParam regen{n_bins, dh::ToSpan(hess), false};
+      for ([[maybe_unused]] auto const& page : Xy->GetBatches<EllpackPage>(&ctx, regen)) {
+      }
+    }
+    {
+      BatchParam regen{n_bins, dh::ToSpan(hess), true};
+      for ([[maybe_unused]] auto const& page : Xy->GetBatches<EllpackPage>(&ctx, regen)) {
+      }
+    }
+
+    check_ellpack();
check_ellpack(); + } + + // half the pages + { + auto it = Xy->GetBatches(&ctx).begin(); + for (std::size_t i = 0; i < n_batches / 2; ++i) { + ++it; + } + check_ellpack(); + } + { + auto it = Xy->GetBatches(&ctx, batch_param).begin(); + for (std::size_t i = 0; i < n_batches / 2; ++i) { + ++it; + } + check_ellpack(); + } } TEST(SparsePageDMatrix, MultipleEllpackPages) { @@ -115,12 +161,7 @@ TEST(SparsePageDMatrix, RetainEllpackPage) { for (size_t i = 0; i < iterators.size(); ++i) { ASSERT_EQ((*iterators[i]).Impl()->gidx_buffer.HostVector(), gidx_buffers.at(i).HostVector()); - if (i != iterators.size() - 1) { - ASSERT_EQ(iterators[i].use_count(), 1); - } else { - // The last batch is still being held by sparse page DMatrix. - ASSERT_EQ(iterators[i].use_count(), 2); - } + ASSERT_EQ(iterators[i].use_count(), 1); } // make sure it's const and the caller can not modify the content of page. From 628411a654ee3015ad2ab0031a49dbed4a410f75 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Wed, 3 Jul 2024 12:13:27 +0800 Subject: [PATCH 04/51] Enhance the threadpool implementation. (#10531) - Accept an initialization function. - Support void return tasks. --- src/common/threadpool.h | 30 +++++++++++++++++++---------- src/data/sparse_page_source.h | 9 +++++---- tests/cpp/common/test_threadpool.cc | 26 ++++++++++++++++++++++++- 3 files changed, 50 insertions(+), 15 deletions(-) diff --git a/src/common/threadpool.h b/src/common/threadpool.h index 95d1deaaabc3..21e27aa760a1 100644 --- a/src/common/threadpool.h +++ b/src/common/threadpool.h @@ -26,20 +26,25 @@ class ThreadPool { bool stop_{false}; public: - explicit ThreadPool(std::int32_t n_threads) { + /** + * @param n_threads The number of threads this pool should hold. + * @param init_fn Function called once during thread creation. + */ + template + explicit ThreadPool(std::int32_t n_threads, InitFn&& init_fn) { for (std::int32_t i = 0; i < n_threads; ++i) { - pool_.emplace_back([&] { + pool_.emplace_back([&, init_fn = std::forward(init_fn)] { + init_fn(); + while (true) { std::unique_lock lock{mu_}; cv_.wait(lock, [this] { return !this->tasks_.empty() || stop_; }); if (this->stop_) { - if (!tasks_.empty()) { - while (!tasks_.empty()) { - auto fn = tasks_.front(); - tasks_.pop(); - fn(); - } + while (!tasks_.empty()) { + auto fn = tasks_.front(); + tasks_.pop(); + fn(); } return; } @@ -81,8 +86,13 @@ class ThreadPool { // Use shared ptr to make the task copy constructible. 
     auto p{std::make_shared<std::promise<R>>()};
     auto fut = p->get_future();
-    auto ffn = std::function<void()>{[task = std::move(p), fn = std::move(fn)]() mutable {
-      task->set_value(fn());
+    auto ffn = std::function<void()>{[task = std::move(p), fn = std::forward<Fn>(fn)]() mutable {
+      if constexpr (std::is_void_v<R>) {
+        fn();
+        task->set_value();
+      } else {
+        task->set_value(fn());
+      }
     }};
 
     std::unique_lock lock{mu_};
diff --git a/src/data/sparse_page_source.h b/src/data/sparse_page_source.h
index 18a149059c11..550631b72dc5 100644
--- a/src/data/sparse_page_source.h
+++ b/src/data/sparse_page_source.h
@@ -236,7 +236,6 @@ class SparsePageSourceImpl : public BatchIteratorImpl, public FormatStreamPol
 
     exce_.Rethrow();
 
-    auto const config = *GlobalConfigThreadLocalStore::Get();
     for (std::int32_t i = 0; i < n_prefetch_batches; ++i, ++fetch_it) {
       fetch_it %= n_batches_;  // ring
       if (ring_->at(fetch_it).valid()) {
@@ -244,8 +243,7 @@ class SparsePageSourceImpl : public BatchIteratorImpl, public FormatStreamPol
       }
       auto const* self = this;  // make sure it's const
       CHECK_LT(fetch_it, cache_info_->offset.size());
-      ring_->at(fetch_it) = this->workers_.Submit([fetch_it, self, config, this] {
-        *GlobalConfigThreadLocalStore::Get() = config;
+      ring_->at(fetch_it) = this->workers_.Submit([fetch_it, self, this] {
         auto page = std::make_shared<S>();
         this->exce_.Run([&] {
           std::unique_ptr fmt{this->CreatePageFormat()};
@@ -297,7 +295,10 @@ class SparsePageSourceImpl : public BatchIteratorImpl, public FormatStreamPol
  public:
   SparsePageSourceImpl(float missing, int nthreads, bst_feature_t n_features, bst_idx_t n_batches,
                        std::shared_ptr<Cache> cache)
-      : workers_{std::max(2, std::min(nthreads, 16))},  // Don't use too many threads.
+      : workers_{std::max(2, std::min(nthreads, 16)),
+                 [config = *GlobalConfigThreadLocalStore::Get()] {
+                   *GlobalConfigThreadLocalStore::Get() = config;
+                 }},
         missing_{missing},
         nthreads_{nthreads},
         n_features_{n_features},
diff --git a/tests/cpp/common/test_threadpool.cc b/tests/cpp/common/test_threadpool.cc
index bd54a9dedbe2..ca8a73b55ff6 100644
--- a/tests/cpp/common/test_threadpool.cc
+++ b/tests/cpp/common/test_threadpool.cc
@@ -2,6 +2,7 @@
  * Copyright 2024, XGBoost Contributors
  */
 #include <gtest/gtest.h>
+#include <xgboost/global_config.h>  // for GlobalConfigThreadLocalStore
 
 #include <cstddef>  // for size_t
 #include <cstdint>  // for int32_t
@@ -13,7 +14,23 @@
 namespace xgboost::common {
 TEST(ThreadPool, Basic) {
   std::int32_t n_threads = std::thread::hardware_concurrency();
-  ThreadPool pool{n_threads};
+
+  // Set verbosity to 4 for the thread-local variable.
+  auto orig = GlobalConfigThreadLocalStore::Get()->verbosity;
+  GlobalConfigThreadLocalStore::Get()->verbosity = 4;
+  // 4 is an invalid value, it's only possible to set it by bypassing the parameter
+  // validation.
+  ASSERT_NE(orig, GlobalConfigThreadLocalStore::Get()->verbosity);
+  ThreadPool pool{n_threads, [config = *GlobalConfigThreadLocalStore::Get()] {
+                    *GlobalConfigThreadLocalStore::Get() = config;
+                  }};
+  GlobalConfigThreadLocalStore::Get()->verbosity = orig;  // restore
+
+  {
+    auto fut = pool.Submit([] { return GlobalConfigThreadLocalStore::Get()->verbosity; });
+    ASSERT_EQ(fut.get(), 4);
+    ASSERT_EQ(GlobalConfigThreadLocalStore::Get()->verbosity, orig);
+  }
   {
     auto fut = pool.Submit([] { return 3; });
     ASSERT_EQ(fut.get(), 3);
@@ -45,5 +62,12 @@ TEST(ThreadPool, Basic) {
       ASSERT_EQ(futures[i].get(), i);
     }
   }
+  {
+    std::int32_t val{0};
+    auto fut = pool.Submit([&] { val = 3; });
+    static_assert(std::is_void_v<decltype(fut.get())>);
+    fut.get();
+    ASSERT_EQ(val, 3);
+  }
 }
 }  // namespace xgboost::common

From 6243e7c43d10041df150e9951256c66fb82e4a60 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Wed, 3 Jul 2024 12:16:53 +0800
Subject: [PATCH 05/51] [doc] Update link to release notes. [skip ci] (#10533)

---
 README.md             | 2 +-
 doc/changes/index.rst | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 220e94637fe1..b27cce673585 100644
--- a/README.md
+++ b/README.md
@@ -17,7 +17,7 @@
 [Documentation](https://xgboost.readthedocs.org) |
 [Resources](demo/README.md) |
 [Contributors](CONTRIBUTORS.md) |
-[Release Notes](NEWS.md)
+[Release Notes](https://xgboost.readthedocs.io/en/latest/changes/index.html)
 
 XGBoost is an optimized distributed gradient boosting library designed to be highly ***efficient***, ***flexible*** and ***portable***. It implements machine learning algorithms under the [Gradient Boosting](https://en.wikipedia.org/wiki/Gradient_boosting) framework.
diff --git a/doc/changes/index.rst b/doc/changes/index.rst
index 68eead924d71..09bc215075e4 100644
--- a/doc/changes/index.rst
+++ b/doc/changes/index.rst
@@ -2,6 +2,8 @@
 Release Notes
 #############
 
+For release notes prior to the 2.1 release, please see `news <https://github.com/dmlc/xgboost/blob/master/NEWS.md>`__.
+
 .. toctree::
   :maxdepth: 1
   :caption: Contents:

From cd1d108c7dfd9e4ff9a0843e5ff55d93a287dd6f Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Wed, 3 Jul 2024 22:52:26 +0800
Subject: [PATCH 06/51] [doc] Fix learning to rank tutorial. [skip ci] (#10539)

---
 doc/tutorials/learning_to_rank.rst | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/doc/tutorials/learning_to_rank.rst b/doc/tutorials/learning_to_rank.rst
index 74e52e1561aa..4d2cbad4aa47 100644
--- a/doc/tutorials/learning_to_rank.rst
+++ b/doc/tutorials/learning_to_rank.rst
@@ -72,8 +72,11 @@ Please note that, as of writing, there's no learning-to-rank interface in scikit
 .. code-block:: python
 
     import pandas as pd
+
+    # `X`, `qid`, and `y` are from the previous snippet, they are all sorted by the `sorted_idx`.
     df = pd.DataFrame(X, columns=[str(i) for i in range(X.shape[1])])
-    df["qid"] = qid[sorted_idx]
+    df["qid"] = qid
+
     ranker.fit(df, y)  # No need to pass qid as a separate argument
 
     from sklearn.model_selection import StratifiedGroupKFold, cross_val_score

From 620b2b155a1878d05960cac3e21e5a7885c97bf3 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Thu, 4 Jul 2024 15:38:59 +0800
Subject: [PATCH 07/51] Cache GPU histogram kernel configuration.
(#10538) --- src/tree/gpu_hist/feature_groups.cu | 13 +- src/tree/gpu_hist/histogram.cu | 155 ++++++++++++++-------- src/tree/gpu_hist/histogram.cuh | 47 ++++--- src/tree/updater_gpu_hist.cu | 10 +- tests/cpp/tree/gpu_hist/test_histogram.cu | 51 ++++--- tests/cpp/tree/test_gpu_hist.cu | 33 +++-- 6 files changed, 188 insertions(+), 121 deletions(-) diff --git a/src/tree/gpu_hist/feature_groups.cu b/src/tree/gpu_hist/feature_groups.cu index 27ed9bd919c8..52e58da7efbb 100644 --- a/src/tree/gpu_hist/feature_groups.cu +++ b/src/tree/gpu_hist/feature_groups.cu @@ -1,5 +1,5 @@ -/*! - * Copyright 2020 by XGBoost Contributors +/** + * Copyright 2020-2024, XGBoost Contributors */ #include @@ -8,12 +8,9 @@ #include "feature_groups.cuh" -#include "../../common/device_helpers.cuh" #include "../../common/hist_util.h" -namespace xgboost { -namespace tree { - +namespace xgboost::tree { FeatureGroups::FeatureGroups(const common::HistogramCuts& cuts, bool is_dense, size_t shm_size, size_t bin_size) { // Only use a single feature group for sparse matrices. @@ -59,6 +56,4 @@ void FeatureGroups::InitSingle(const common::HistogramCuts& cuts) { max_group_bins = cuts.TotalBins(); } - -} // namespace tree -} // namespace xgboost +} // namespace xgboost::tree diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu index 90c151556566..cd848c1c0cae 100644 --- a/src/tree/gpu_hist/histogram.cu +++ b/src/tree/gpu_hist/histogram.cu @@ -5,8 +5,7 @@ #include #include -#include // uint32_t -#include +#include // uint32_t, int32_t #include "../../collective/aggregator.h" #include "../../common/deterministic.cuh" @@ -128,7 +127,7 @@ XGBOOST_DEV_INLINE void AtomicAddGpairGlobal(xgboost::GradientPairInt64* dest, } template + int kItemsPerTile = kBlockThreads * kItemsPerThread> class HistogramAgent { GradientPairInt64* smem_arr_; GradientPairInt64* d_node_hist_; @@ -244,53 +243,82 @@ __global__ void __launch_bounds__(kBlockThreads) extern __shared__ char smem[]; const FeatureGroup group = feature_groups[blockIdx.y]; auto smem_arr = reinterpret_cast(smem); - auto agent = HistogramAgent( - smem_arr, d_node_hist, group, matrix, d_ridx, rounding, d_gpair); + auto agent = HistogramAgent(smem_arr, d_node_hist, group, matrix, + d_ridx, rounding, d_gpair); if (use_shared_memory_histograms) { agent.BuildHistogramWithShared(); } else { agent.BuildHistogramWithGlobal(); } } +namespace { +constexpr std::int32_t kBlockThreads = 1024; +constexpr std::int32_t kItemsPerThread = 8; +constexpr std::int32_t ItemsPerTile() { return kBlockThreads * kItemsPerThread; } +} // namespace + +// Use auto deduction guide to workaround compiler error. +template , + auto Shared = SharedMemHistKernel> +struct HistogramKernel { + decltype(Global) global_kernel{SharedMemHistKernel}; + decltype(Shared) shared_kernel{SharedMemHistKernel}; + bool shared{false}; + std::uint32_t grid_size{0}; + std::size_t smem_size{0}; + + HistogramKernel(Context const* ctx, FeatureGroupsAccessor const& feature_groups, + bool force_global_memory) { + // Decide whether to use shared memory + // Opt into maximum shared memory for the kernel if necessary + std::size_t max_shared_memory = dh::MaxSharedMemoryOptin(ctx->Ordinal()); + + this->smem_size = sizeof(GradientPairInt64) * feature_groups.max_group_bins; + this->shared = !force_global_memory && smem_size <= max_shared_memory; + this->smem_size = this->shared ? 
this->smem_size : 0; + + auto init = [&](auto& kernel) { + if (this->shared) { + dh::safe_cuda(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_memory)); + } -void BuildGradientHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const& matrix, - FeatureGroupsAccessor const& feature_groups, - common::Span gpair, - common::Span d_ridx, - common::Span histogram, GradientQuantiser rounding, - bool force_global_memory) { - // decide whether to use shared memory - int device = 0; - dh::safe_cuda(cudaGetDevice(&device)); - // opt into maximum shared memory for the kernel if necessary - size_t max_shared_memory = dh::MaxSharedMemoryOptin(device); - - size_t smem_size = - sizeof(GradientPairInt64) * feature_groups.max_group_bins; - bool shared = !force_global_memory && smem_size <= max_shared_memory; - smem_size = shared ? smem_size : 0; - - constexpr int kBlockThreads = 1024; - constexpr int kItemsPerThread = 8; - constexpr int kItemsPerTile = kBlockThreads * kItemsPerThread; - - auto runit = [&, kMinItemsPerBlock = kItemsPerTile](auto kernel) { - if (shared) { - dh::safe_cuda(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_memory)); - } + // determine the launch configuration + std::int32_t num_groups = feature_groups.NumGroups(); + std::int32_t n_mps = 0; + dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, ctx->Ordinal())); + + std::int32_t n_blocks_per_mp = 0; + dh::safe_cuda(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel, + kBlockThreads, this->smem_size)); + + // This gives the number of blocks to keep the device occupied Use this as the + // maximum number of blocks + this->grid_size = n_blocks_per_mp * n_mps; + }; + + init(this->global_kernel); + init(this->shared_kernel); + } +}; + +class DeviceHistogramBuilderImpl { + std::unique_ptr> kernel_{nullptr}; + bool force_global_memory_{false}; - // determine the launch configuration - int num_groups = feature_groups.NumGroups(); - int n_mps = 0; - dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device)); - int n_blocks_per_mp = 0; - dh::safe_cuda(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel, - kBlockThreads, smem_size)); - // This gives the number of blocks to keep the device occupied - // Use this as the maximum number of blocks - unsigned grid_size = n_blocks_per_mp * n_mps; + public: + void Reset(Context const* ctx, FeatureGroupsAccessor const& feature_groups, + bool force_global_memory) { + this->kernel_ = std::make_unique>(ctx, feature_groups, force_global_memory); + this->force_global_memory_ = force_global_memory; + } + void BuildHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const& matrix, + FeatureGroupsAccessor const& feature_groups, + common::Span gpair, + common::Span d_ridx, + common::Span histogram, GradientQuantiser rounding) { + CHECK(kernel_); // Otherwise launch blocks such that each block has a minimum amount of work to do // There are fixed costs to launching each block, e.g. 
zeroing shared memory // The below amount of minimum work was found by experimentation @@ -300,20 +328,41 @@ void BuildGradientHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const& // Allocate number of blocks such that each block has about kMinItemsPerBlock work // Up to a maximum where the device is saturated - grid_size = std::min(grid_size, static_cast( - common::DivRoundUp(items_per_group, kMinItemsPerBlock))); + auto constexpr kMinItemsPerBlock = ItemsPerTile(); + auto grid_size = std::min(kernel_->grid_size, static_cast(common::DivRoundUp( + items_per_group, kMinItemsPerBlock))); + + if (this->force_global_memory_ || !this->kernel_->shared) { + dh::LaunchKernel{dim3(grid_size, feature_groups.NumGroups()), // NOLINT + static_cast(kBlockThreads), kernel_->smem_size, + ctx->Stream()}(kernel_->global_kernel, matrix, feature_groups, d_ridx, + histogram.data(), gpair.data(), rounding); + } else { + dh::LaunchKernel{dim3(grid_size, feature_groups.NumGroups()), // NOLINT + static_cast(kBlockThreads), kernel_->smem_size, + ctx->Stream()}(kernel_->shared_kernel, matrix, feature_groups, d_ridx, + histogram.data(), gpair.data(), rounding); + } + } +}; - dh::LaunchKernel {dim3(grid_size, num_groups), static_cast(kBlockThreads), smem_size, - ctx->Stream()} (kernel, matrix, feature_groups, d_ridx, histogram.data(), - gpair.data(), rounding); - }; +DeviceHistogramBuilder::DeviceHistogramBuilder() + : p_impl_{std::make_unique()} {} - if (shared) { - runit(SharedMemHistKernel); - } else { - runit(SharedMemHistKernel); - } +DeviceHistogramBuilder::~DeviceHistogramBuilder() = default; + +void DeviceHistogramBuilder::Reset(Context const* ctx, FeatureGroupsAccessor const& feature_groups, + bool force_global_memory) { + this->p_impl_->Reset(ctx, feature_groups, force_global_memory); +} - dh::safe_cuda(cudaGetLastError()); +void DeviceHistogramBuilder::BuildHistogram(CUDAContext const* ctx, + EllpackDeviceAccessor const& matrix, + FeatureGroupsAccessor const& feature_groups, + common::Span gpair, + common::Span ridx, + common::Span histogram, + GradientQuantiser rounding) { + this->p_impl_->BuildHistogram(ctx, matrix, feature_groups, gpair, ridx, histogram, rounding); } } // namespace xgboost::tree diff --git a/src/tree/gpu_hist/histogram.cuh b/src/tree/gpu_hist/histogram.cuh index 925c548936f4..e30f682082b7 100644 --- a/src/tree/gpu_hist/histogram.cuh +++ b/src/tree/gpu_hist/histogram.cuh @@ -1,17 +1,18 @@ -/*! - * Copyright 2020-2021 by XGBoost Contributors +/** + * Copyright 2020-2024, XGBoost Contributors */ #ifndef HISTOGRAM_CUH_ #define HISTOGRAM_CUH_ -#include - -#include "../../common/cuda_context.cuh" -#include "../../data/ellpack_page.cuh" -#include "feature_groups.cuh" +#include // for unique_ptr -namespace xgboost { -namespace tree { +#include "../../common/cuda_context.cuh" // for CUDAContext +#include "../../data/ellpack_page.cuh" // for EllpackDeviceAccessor +#include "feature_groups.cuh" // for FeatureGroupsAccessor +#include "xgboost/base.h" // for GradientPair, GradientPairInt64 +#include "xgboost/context.h" // for Context +#include "xgboost/span.h" // for Span +namespace xgboost::tree { /** * \brief An atomicAdd designed for gradient pair with better performance. For general * int64_t atomicAdd, one can simply cast it to unsigned long long. Exposed for testing. @@ -32,7 +33,7 @@ XGBOOST_DEV_INLINE void AtomicAdd64As32(int64_t* dst, int64_t src) { } class GradientQuantiser { -private: + private: /* Convert gradient to fixed point representation. 
*/ GradientPairPrecise to_fixed_point_; /* Convert fixed point representation back to floating point. */ @@ -59,13 +60,23 @@ private: } }; -void BuildGradientHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const& matrix, - FeatureGroupsAccessor const& feature_groups, - common::Span gpair, - common::Span ridx, - common::Span histogram, GradientQuantiser rounding, - bool force_global_memory = false); -} // namespace tree -} // namespace xgboost +class DeviceHistogramBuilderImpl; + +class DeviceHistogramBuilder { + std::unique_ptr p_impl_; + + public: + DeviceHistogramBuilder(); + ~DeviceHistogramBuilder(); + + void Reset(Context const* ctx, FeatureGroupsAccessor const& feature_groups, + bool force_global_memory); + void BuildHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const& matrix, + FeatureGroupsAccessor const& feature_groups, + common::Span gpair, + common::Span ridx, + common::Span histogram, GradientQuantiser rounding); +}; +} // namespace xgboost::tree #endif // HISTOGRAM_CUH_ diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 958fa0331569..aa4f8fa27218 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -162,6 +162,8 @@ struct GPUHistMakerDevice { std::shared_ptr column_sampler_; MetaInfo const& info_; + DeviceHistogramBuilder histogram_; + public: EllpackPageImpl const* page{nullptr}; common::Span feature_types; @@ -256,6 +258,8 @@ struct GPUHistMakerDevice { hist.Reset(); this->InitFeatureGroupsOnce(); + + this->histogram_.Reset(ctx_, feature_groups->DeviceAccessor(ctx_->Device()), false); } GPUExpandEntry EvaluateRootSplit(GradientPairInt64 root_sum) { @@ -340,9 +344,9 @@ struct GPUHistMakerDevice { void BuildHist(int nidx) { auto d_node_hist = hist.GetNodeHistogram(nidx); auto d_ridx = row_partitioner->GetRows(nidx); - BuildGradientHistogram(ctx_->CUDACtx(), page->GetDeviceAccessor(ctx_->Device()), - feature_groups->DeviceAccessor(ctx_->Device()), gpair, d_ridx, - d_node_hist, *quantiser); + this->histogram_.BuildHistogram(ctx_->CUDACtx(), page->GetDeviceAccessor(ctx_->Device()), + feature_groups->DeviceAccessor(ctx_->Device()), gpair, d_ridx, + d_node_hist, *quantiser); } // Attempt to do subtraction trick diff --git a/tests/cpp/tree/gpu_hist/test_histogram.cu b/tests/cpp/tree/gpu_hist/test_histogram.cu index 84cd956db094..3b9e6103a57e 100644 --- a/tests/cpp/tree/gpu_hist/test_histogram.cu +++ b/tests/cpp/tree/gpu_hist/test_histogram.cu @@ -1,11 +1,10 @@ /** - * Copyright 2020-2023, XGBoost Contributors + * Copyright 2020-2024, XGBoost Contributors */ #include #include -#include "../../../../src/common/categorical.h" #include "../../../../src/tree/gpu_hist/histogram.cuh" #include "../../../../src/tree/gpu_hist/row_partitioner.cuh" #include "../../../../src/tree/param.h" // TrainParam @@ -13,7 +12,7 @@ #include "../../helpers.h" namespace xgboost::tree { -void TestDeterministicHistogram(bool is_dense, int shm_size) { +void TestDeterministicHistogram(bool is_dense, int shm_size, bool force_global) { Context ctx = MakeCUDACtx(0); size_t constexpr kBins = 256, kCols = 120, kRows = 16384, kRounds = 16; float constexpr kLower = -1e-2, kUpper = 1e2; @@ -25,35 +24,37 @@ void TestDeterministicHistogram(bool is_dense, int shm_size) { for (auto const& batch : matrix->GetBatches(&ctx, batch_param)) { auto* page = batch.Impl(); - tree::RowPartitioner row_partitioner(FstCU(), kRows); + tree::RowPartitioner row_partitioner(ctx.Device(), kRows); auto ridx = row_partitioner.GetRows(0); - int num_bins = kBins * kCols; + 
bst_bin_t num_bins = kBins * kCols; dh::device_vector histogram(num_bins); auto d_histogram = dh::ToSpan(histogram); auto gpair = GenerateRandomGradients(kRows, kLower, kUpper); - gpair.SetDevice(FstCU()); + gpair.SetDevice(ctx.Device()); - FeatureGroups feature_groups(page->Cuts(), page->is_dense, shm_size, - sizeof(GradientPairInt64)); + FeatureGroups feature_groups(page->Cuts(), page->is_dense, shm_size, sizeof(GradientPairInt64)); auto quantiser = GradientQuantiser(&ctx, gpair.DeviceSpan(), MetaInfo()); - BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(FstCU()), - feature_groups.DeviceAccessor(FstCU()), gpair.DeviceSpan(), ridx, + DeviceHistogramBuilder builder; + builder.Reset(&ctx, feature_groups.DeviceAccessor(ctx.Device()), force_global); + builder.BuildHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()), + feature_groups.DeviceAccessor(ctx.Device()), gpair.DeviceSpan(), ridx, d_histogram, quantiser); std::vector histogram_h(num_bins); dh::safe_cuda(cudaMemcpy(histogram_h.data(), d_histogram.data(), - num_bins * sizeof(GradientPairInt64), - cudaMemcpyDeviceToHost)); + num_bins * sizeof(GradientPairInt64), cudaMemcpyDeviceToHost)); - for (size_t i = 0; i < kRounds; ++i) { + for (std::size_t i = 0; i < kRounds; ++i) { dh::device_vector new_histogram(num_bins); auto d_new_histogram = dh::ToSpan(new_histogram); auto quantiser = GradientQuantiser(&ctx, gpair.DeviceSpan(), MetaInfo()); - BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(FstCU()), - feature_groups.DeviceAccessor(FstCU()), gpair.DeviceSpan(), ridx, + DeviceHistogramBuilder builder; + builder.Reset(&ctx, feature_groups.DeviceAccessor(ctx.Device()), force_global); + builder.BuildHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()), + feature_groups.DeviceAccessor(ctx.Device()), gpair.DeviceSpan(), ridx, d_new_histogram, quantiser); std::vector new_histogram_h(num_bins); @@ -68,14 +69,16 @@ void TestDeterministicHistogram(bool is_dense, int shm_size) { { auto gpair = GenerateRandomGradients(kRows, kLower, kUpper); - gpair.SetDevice(FstCU()); + gpair.SetDevice(ctx.Device()); // Use a single feature group to compute the baseline. 
FeatureGroups single_group(page->Cuts()); dh::device_vector baseline(num_bins); - BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(FstCU()), - single_group.DeviceAccessor(FstCU()), gpair.DeviceSpan(), ridx, + DeviceHistogramBuilder builder; + builder.Reset(&ctx, single_group.DeviceAccessor(ctx.Device()), force_global); + builder.BuildHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()), + single_group.DeviceAccessor(ctx.Device()), gpair.DeviceSpan(), ridx, dh::ToSpan(baseline), quantiser); std::vector baseline_h(num_bins); @@ -96,7 +99,9 @@ TEST(Histogram, GPUDeterministic) { std::vector shm_sizes{48 * 1024, 64 * 1024, 160 * 1024}; for (bool is_dense : is_dense_array) { for (int shm_size : shm_sizes) { - TestDeterministicHistogram(is_dense, shm_size); + for (bool force_global : {true, false}) { + TestDeterministicHistogram(is_dense, shm_size, force_global); + } } } } @@ -136,7 +141,9 @@ void TestGPUHistogramCategorical(size_t num_categories) { for (auto const &batch : cat_m->GetBatches(&ctx, batch_param)) { auto* page = batch.Impl(); FeatureGroups single_group(page->Cuts()); - BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()), + DeviceHistogramBuilder builder; + builder.Reset(&ctx, single_group.DeviceAccessor(ctx.Device()), false); + builder.BuildHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()), single_group.DeviceAccessor(ctx.Device()), gpair.DeviceSpan(), ridx, dh::ToSpan(cat_hist), quantiser); } @@ -150,7 +157,9 @@ void TestGPUHistogramCategorical(size_t num_categories) { for (auto const &batch : encode_m->GetBatches(&ctx, batch_param)) { auto* page = batch.Impl(); FeatureGroups single_group(page->Cuts()); - BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()), + DeviceHistogramBuilder builder; + builder.Reset(&ctx, single_group.DeviceAccessor(ctx.Device()), false); + builder.BuildHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()), single_group.DeviceAccessor(ctx.Device()), gpair.DeviceSpan(), ridx, dh::ToSpan(encode_hist), quantiser); } diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu index cc4d9fb7fdad..1c156563cda3 100644 --- a/tests/cpp/tree/test_gpu_hist.cu +++ b/tests/cpp/tree/test_gpu_hist.cu @@ -1,5 +1,5 @@ /** - * Copyright 2017-2023 by XGBoost contributors + * Copyright 2017-2024, XGBoost contributors */ #include #include @@ -22,12 +22,8 @@ #include "xgboost/context.h" #include "xgboost/json.h" -#if defined(XGBOOST_USE_FEDERATED) -#include "../plugin/federated/test_worker.h" // for TestFederatedGlobal -#endif // defined(XGBOOST_USE_FEDERATED) - namespace xgboost::tree { -TEST(GpuHist, DeviceHistogram) { +TEST(GpuHist, DeviceHistogramStorage) { // Ensures that node allocates correctly after reaching `kStopGrowingSize`. 
dh::safe_cuda(cudaSetDevice(0)); constexpr size_t kNBins = 128; @@ -102,17 +98,17 @@ void TestBuildHist(bool use_shared_memory_histograms) { xgboost::SimpleLCG gen; xgboost::SimpleRealUniformDistribution dist(0.0f, 1.0f); HostDeviceVector gpair(kNRows); - for (auto &gp : gpair.HostVector()) { - bst_float grad = dist(&gen); - bst_float hess = dist(&gen); - gp = GradientPair(grad, hess); + for (auto& gp : gpair.HostVector()) { + float grad = dist(&gen); + float hess = dist(&gen); + gp = GradientPair{grad, hess}; } - gpair.SetDevice(DeviceOrd::CUDA(0)); + gpair.SetDevice(ctx.Device()); - thrust::host_vector h_gidx_buffer (page->gidx_buffer.HostVector()); - maker.row_partitioner = std::make_unique(FstCU(), kNRows); + thrust::host_vector h_gidx_buffer(page->gidx_buffer.HostVector()); + maker.row_partitioner = std::make_unique(ctx.Device(), kNRows); - maker.hist.Init(FstCU(), page->Cuts().TotalBins()); + maker.hist.Init(ctx.Device(), page->Cuts().TotalBins()); maker.hist.AllocateHistograms({0}); maker.gpair = gpair.DeviceSpan(); @@ -121,10 +117,13 @@ void TestBuildHist(bool use_shared_memory_histograms) { maker.InitFeatureGroupsOnce(); - BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(DeviceOrd::CUDA(0)), - maker.feature_groups->DeviceAccessor(DeviceOrd::CUDA(0)), gpair.DeviceSpan(), + DeviceHistogramBuilder builder; + builder.Reset(&ctx, maker.feature_groups->DeviceAccessor(ctx.Device()), + !use_shared_memory_histograms); + builder.BuildHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()), + maker.feature_groups->DeviceAccessor(ctx.Device()), gpair.DeviceSpan(), maker.row_partitioner->GetRows(0), maker.hist.GetNodeHistogram(0), - *maker.quantiser, !use_shared_memory_histograms); + *maker.quantiser); DeviceHistogramStorage<>& d_hist = maker.hist; From 513d7a7d84678fc4e75c18899fe52263258f0290 Mon Sep 17 00:00:00 2001 From: Dmitry Razdoburdin Date: Fri, 5 Jul 2024 10:31:48 +0200 Subject: [PATCH 08/51] [sycl] Reorder if-else statements to allow using of cpu branches for sycl-devices (#10543) * reoder if-else statements for sycl compatibility * trigger check --------- Co-authored-by: Dmitry Razdoburdin <> --- include/xgboost/linalg.h | 6 ++--- src/common/random.h | 6 ++--- src/common/stats.cc | 2 +- src/data/ellpack_page.cuh | 10 ++++----- src/data/iterative_dmatrix.cc | 22 +++++++++--------- src/data/iterative_dmatrix.cu | 2 +- src/data/proxy_dmatrix.cc | 6 ++--- src/data/proxy_dmatrix.cu | 4 ++-- src/data/simple_dmatrix.cc | 4 ++-- src/data/simple_dmatrix.cu | 2 +- src/data/sparse_page_source.cu | 2 +- src/metric/auc.cc | 40 ++++++++++++++++----------------- src/metric/multiclass_metric.cu | 2 +- src/metric/survival_metric.cu | 2 +- 14 files changed, 55 insertions(+), 55 deletions(-) diff --git a/include/xgboost/linalg.h b/include/xgboost/linalg.h index cb7668f4cdd1..553486dac330 100644 --- a/include/xgboost/linalg.h +++ b/include/xgboost/linalg.h @@ -664,13 +664,13 @@ auto MakeVec(T *ptr, size_t s, DeviceOrd device = DeviceOrd::CPU()) { template auto MakeVec(HostDeviceVector *data) { - return MakeVec(data->Device().IsCPU() ? data->HostPointer() : data->DevicePointer(), data->Size(), - data->Device()); + return MakeVec(data->Device().IsCUDA() ? data->DevicePointer() : data->HostPointer(), + data->Size(), data->Device()); } template auto MakeVec(HostDeviceVector const *data) { - return MakeVec(data->Device().IsCPU() ? data->ConstHostPointer() : data->ConstDevicePointer(), + return MakeVec(data->Device().IsCUDA() ? 
data->ConstDevicePointer() : data->ConstHostPointer(), data->Size(), data->Device()); } diff --git a/src/common/random.h b/src/common/random.h index 6d7a1bb499c9..3aed3384a1f6 100644 --- a/src/common/random.h +++ b/src/common/random.h @@ -179,14 +179,14 @@ class ColumnSampler { feature_set_tree_->SetDevice(ctx->Device()); feature_set_tree_->Resize(num_col); - if (ctx->IsCPU()) { - std::iota(feature_set_tree_->HostVector().begin(), feature_set_tree_->HostVector().end(), 0); - } else { + if (ctx->IsCUDA()) { #if defined(XGBOOST_USE_CUDA) cuda_impl::InitFeatureSet(ctx, feature_set_tree_); #else AssertGPUSupport(); #endif + } else { + std::iota(feature_set_tree_->HostVector().begin(), feature_set_tree_->HostVector().end(), 0); } feature_set_tree_ = ColSample(feature_set_tree_, colsample_bytree_); diff --git a/src/common/stats.cc b/src/common/stats.cc index bbf969fcc4d8..72c917bedee1 100644 --- a/src/common/stats.cc +++ b/src/common/stats.cc @@ -18,7 +18,7 @@ namespace xgboost::common { void Median(Context const* ctx, linalg::Tensor const& t, HostDeviceVector const& weights, linalg::Tensor* out) { - if (!ctx->IsCPU()) { + if (ctx->IsCUDA()) { weights.SetDevice(ctx->Device()); auto opt_weights = OptionalWeights(weights.ConstDeviceSpan()); auto t_v = t.View(ctx->Device()); diff --git a/src/data/ellpack_page.cuh b/src/data/ellpack_page.cuh index 04960458f277..d1f9472df4c4 100644 --- a/src/data/ellpack_page.cuh +++ b/src/data/ellpack_page.cuh @@ -45,17 +45,17 @@ struct EllpackDeviceAccessor { n_rows(n_rows), gidx_iter(gidx_iter), feature_types{feature_types} { - if (device.IsCPU()) { - gidx_fvalue_map = cuts->cut_values_.ConstHostSpan(); - feature_segments = cuts->cut_ptrs_.ConstHostSpan(); - min_fvalue = cuts->min_vals_.ConstHostSpan(); - } else { + if (device.IsCUDA()) { cuts->cut_values_.SetDevice(device); cuts->cut_ptrs_.SetDevice(device); cuts->min_vals_.SetDevice(device); gidx_fvalue_map = cuts->cut_values_.ConstDeviceSpan(); feature_segments = cuts->cut_ptrs_.ConstDeviceSpan(); min_fvalue = cuts->min_vals_.ConstDeviceSpan(); + } else { + gidx_fvalue_map = cuts->cut_values_.ConstHostSpan(); + feature_segments = cuts->cut_ptrs_.ConstHostSpan(); + min_fvalue = cuts->min_vals_.ConstHostSpan(); } } // Get a matrix element, uses binary search for look up Return NaN if missing diff --git a/src/data/iterative_dmatrix.cc b/src/data/iterative_dmatrix.cc index e581e90ca40b..368aeb2ac2fb 100644 --- a/src/data/iterative_dmatrix.cc +++ b/src/data/iterative_dmatrix.cc @@ -41,10 +41,10 @@ IterativeDMatrix::IterativeDMatrix(DataIterHandle iter_handle, DMatrixHandle pro // hardcoded parameter. 
BatchParam p{max_bin, tree::TrainParam::DftSparseThreshold()}; - if (ctx.IsCPU()) { - this->InitFromCPU(&ctx, p, iter_handle, missing, ref); - } else { + if (ctx.IsCUDA()) { this->InitFromCUDA(&ctx, p, iter_handle, missing, ref); + } else { + this->InitFromCPU(&ctx, p, iter_handle, missing, ref); } this->fmat_ctx_ = ctx; @@ -73,10 +73,10 @@ void GetCutsFromRef(Context const* ctx, std::shared_ptr ref, bst_featur if (ref->PageExists() && ref->PageExists()) { // Both exists - if (ctx->IsCPU()) { - csr(); - } else { + if (ctx->IsCUDA()) { ellpack(); + } else { + csr(); } } else if (ref->PageExists()) { csr(); @@ -84,10 +84,10 @@ void GetCutsFromRef(Context const* ctx, std::shared_ptr ref, bst_featur ellpack(); } else { // None exist - if (ctx->IsCPU()) { - csr(); - } else { + if (ctx->IsCUDA()) { ellpack(); + } else { + csr(); } } CHECK_EQ(ref->Info().num_col_, n_features) @@ -297,9 +297,9 @@ BatchSet IterativeDMatrix::GetGradientIndex(Context const* ctx } if (!ghist_) { - if (ctx->IsCPU()) { + if (!ctx->IsCUDA()) { ghist_ = std::make_shared(ctx, Info(), *ellpack_, param); - } else if (fmat_ctx_.IsCPU()) { + } else if (!fmat_ctx_.IsCUDA()) { ghist_ = std::make_shared(&fmat_ctx_, Info(), *ellpack_, param); } else { // Can happen when QDM is initialized on GPU, but a CPU version is queried by a different QDM diff --git a/src/data/iterative_dmatrix.cu b/src/data/iterative_dmatrix.cu index 868875bf7d4a..2e8da2c7e7ed 100644 --- a/src/data/iterative_dmatrix.cu +++ b/src/data/iterative_dmatrix.cu @@ -46,7 +46,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p, int32_t current_device; dh::safe_cuda(cudaGetDevice(¤t_device)); auto get_device = [&]() { - auto d = (ctx->IsCPU()) ? DeviceOrd::CUDA(current_device) : ctx->Device(); + auto d = (ctx->IsCUDA()) ? 
ctx->Device() : DeviceOrd::CUDA(current_device); CHECK(!d.IsCPU()); return d; }; diff --git a/src/data/proxy_dmatrix.cc b/src/data/proxy_dmatrix.cc index a28448e3b045..bcefb4999c72 100644 --- a/src/data/proxy_dmatrix.cc +++ b/src/data/proxy_dmatrix.cc @@ -56,7 +56,9 @@ std::shared_ptr CreateDMatrixFromProxy(Context const *ctx, float missing) { bool type_error{false}; std::shared_ptr p_fmat{nullptr}; - if (proxy->Ctx()->IsCPU()) { + if (proxy->Ctx()->IsCUDA()) { + p_fmat = cuda_impl::CreateDMatrixFromProxy(ctx, proxy, missing); + } else { p_fmat = data::HostAdapterDispatch( proxy.get(), [&](auto const &adapter) { @@ -65,8 +67,6 @@ std::shared_ptr CreateDMatrixFromProxy(Context const *ctx, return p_fmat; }, &type_error); - } else { - p_fmat = cuda_impl::CreateDMatrixFromProxy(ctx, proxy, missing); } CHECK(p_fmat) << "Failed to fallback."; diff --git a/src/data/proxy_dmatrix.cu b/src/data/proxy_dmatrix.cu index cd76e49cf205..fb484f5e31d1 100644 --- a/src/data/proxy_dmatrix.cu +++ b/src/data/proxy_dmatrix.cu @@ -11,7 +11,7 @@ void DMatrixProxy::FromCudaColumnar(StringView interface_str) { this->batch_ = adapter; this->Info().num_col_ = adapter->NumColumns(); this->Info().num_row_ = adapter->NumRows(); - if (adapter->Device().IsCPU()) { + if (!adapter->Device().IsCUDA()) { // empty data CHECK_EQ(this->Info().num_row_, 0); ctx_ = ctx_.MakeCUDA(dh::CurrentDevice()); @@ -25,7 +25,7 @@ void DMatrixProxy::FromCudaArray(StringView interface_str) { this->batch_ = adapter; this->Info().num_col_ = adapter->NumColumns(); this->Info().num_row_ = adapter->NumRows(); - if (adapter->Device().IsCPU()) { + if (!adapter->Device().IsCUDA()) { // empty data CHECK_EQ(this->Info().num_row_, 0); ctx_ = ctx_.MakeCUDA(dh::CurrentDevice()); diff --git a/src/data/simple_dmatrix.cc b/src/data/simple_dmatrix.cc index f54d1c43eda4..e4b82b7de59f 100644 --- a/src/data/simple_dmatrix.cc +++ b/src/data/simple_dmatrix.cc @@ -185,12 +185,12 @@ BatchSet SimpleDMatrix::GetGradientIndex(Context const* ctx, CHECK_GE(param.max_bin, 2); // Used only by approx. auto sorted_sketch = param.regen; - if (ctx->IsCPU()) { + if (!ctx->IsCUDA()) { // The context passed in is on CPU, we pick it first since we prioritize the context // in Booster. gradient_index_.reset(new GHistIndexMatrix{ctx, this, param.max_bin, param.sparse_thresh, sorted_sketch, param.hess}); - } else if (fmat_ctx_.IsCPU()) { + } else if (!fmat_ctx_.IsCUDA()) { // DMatrix was initialized on CPU, we use the context from initialization. gradient_index_.reset(new GHistIndexMatrix{&fmat_ctx_, this, param.max_bin, param.sparse_thresh, sorted_sketch, param.hess}); diff --git a/src/data/simple_dmatrix.cu b/src/data/simple_dmatrix.cu index e5b4d18f77db..c177784a36a4 100644 --- a/src/data/simple_dmatrix.cu +++ b/src/data/simple_dmatrix.cu @@ -19,7 +19,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, std::int32_t nthr DataSplitMode data_split_mode) { CHECK(data_split_mode != DataSplitMode::kCol) << "Column-wise data split is currently not supported on the GPU."; - auto device = (adapter->Device().IsCPU() || adapter->NumRows() == 0) + auto device = (!adapter->Device().IsCUDA() || adapter->NumRows() == 0) ? 
DeviceOrd::CUDA(dh::CurrentDevice()) : adapter->Device(); CHECK(device.IsCUDA()); diff --git a/src/data/sparse_page_source.cu b/src/data/sparse_page_source.cu index 40037eedc0f5..99032aeaad7d 100644 --- a/src/data/sparse_page_source.cu +++ b/src/data/sparse_page_source.cu @@ -20,7 +20,7 @@ std::size_t NFeaturesDevice(DMatrixProxy *proxy) { void DevicePush(DMatrixProxy *proxy, float missing, SparsePage *page) { auto device = proxy->Device(); - if (device.IsCPU()) { + if (!device.IsCUDA()) { device = DeviceOrd::CUDA(dh::CurrentDevice()); } CHECK(device.IsCUDA()); diff --git a/src/metric/auc.cc b/src/metric/auc.cc index 6de0d1f129cb..fcb774a4aa70 100644 --- a/src/metric/auc.cc +++ b/src/metric/auc.cc @@ -336,12 +336,12 @@ class EvalROCAUC : public EvalAUC { double auc{0}; uint32_t valid_groups = 0; auto n_threads = ctx_->Threads(); - if (ctx_->IsCPU()) { + if (ctx_->IsCUDA()) { std::tie(auc, valid_groups) = - RankingAUC(ctx_, predts.ConstHostVector(), info, n_threads); + GPURankingAUC(ctx_, predts.ConstDeviceSpan(), info, &this->d_cache_); } else { std::tie(auc, valid_groups) = - GPURankingAUC(ctx_, predts.ConstDeviceSpan(), info, &this->d_cache_); + RankingAUC(ctx_, predts.ConstHostVector(), info, n_threads); } return std::make_pair(auc, valid_groups); } @@ -351,10 +351,10 @@ class EvalROCAUC : public EvalAUC { double auc{0}; auto n_threads = ctx_->Threads(); CHECK_NE(n_classes, 0); - if (ctx_->IsCPU()) { - auc = MultiClassOVR(ctx_, predts.ConstHostVector(), info, n_classes, n_threads, BinaryROCAUC); - } else { + if (ctx_->IsCUDA()) { auc = GPUMultiClassROCAUC(ctx_, predts.ConstDeviceSpan(), info, &this->d_cache_, n_classes); + } else { + auc = MultiClassOVR(ctx_, predts.ConstHostVector(), info, n_classes, n_threads, BinaryROCAUC); } return auc; } @@ -362,13 +362,13 @@ class EvalROCAUC : public EvalAUC { std::tuple EvalBinary(HostDeviceVector const &predts, MetaInfo const &info) { double fp, tp, auc; - if (ctx_->IsCPU()) { + if (ctx_->IsCUDA()) { + std::tie(fp, tp, auc) = + GPUBinaryROCAUC(ctx_, predts.ConstDeviceSpan(), info, &this->d_cache_); + } else { std::tie(fp, tp, auc) = BinaryROCAUC(ctx_, predts.ConstHostVector(), info.labels.HostView().Slice(linalg::All(), 0), common::OptionalWeights{info.weights_.ConstHostSpan()}); - } else { - std::tie(fp, tp, auc) = - GPUBinaryROCAUC(ctx_, predts.ConstDeviceSpan(), info, &this->d_cache_); } return std::make_tuple(fp, tp, auc); } @@ -413,23 +413,23 @@ class EvalPRAUC : public EvalAUC { std::tuple EvalBinary(HostDeviceVector const &predts, MetaInfo const &info) { double pr, re, auc; - if (ctx_->IsCPU()) { + if (ctx_->IsCUDA()) { + std::tie(pr, re, auc) = GPUBinaryPRAUC(ctx_, predts.ConstDeviceSpan(), info, &this->d_cache_); + } else { std::tie(pr, re, auc) = BinaryPRAUC(ctx_, predts.ConstHostSpan(), info.labels.HostView().Slice(linalg::All(), 0), common::OptionalWeights{info.weights_.ConstHostSpan()}); - } else { - std::tie(pr, re, auc) = GPUBinaryPRAUC(ctx_, predts.ConstDeviceSpan(), info, &this->d_cache_); } return std::make_tuple(pr, re, auc); } double EvalMultiClass(HostDeviceVector const &predts, MetaInfo const &info, size_t n_classes) { - if (ctx_->IsCPU()) { + if (ctx_->IsCUDA()) { + return GPUMultiClassPRAUC(ctx_, predts.ConstDeviceSpan(), info, &d_cache_, n_classes); + } else { auto n_threads = this->ctx_->Threads(); return MultiClassOVR(ctx_, predts.ConstHostSpan(), info, n_classes, n_threads, BinaryPRAUC); - } else { - return GPUMultiClassPRAUC(ctx_, predts.ConstDeviceSpan(), info, &d_cache_, n_classes); } } @@ -438,16 +438,16 @@ class 
EvalPRAUC : public EvalAUC { double auc{0}; uint32_t valid_groups = 0; auto n_threads = ctx_->Threads(); - if (ctx_->IsCPU()) { + if (ctx_->IsCUDA()) { + std::tie(auc, valid_groups) = + GPURankingPRAUC(ctx_, predts.ConstDeviceSpan(), info, &d_cache_); + } else { auto labels = info.labels.Data()->ConstHostSpan(); if (std::any_of(labels.cbegin(), labels.cend(), PRAUCLabelInvalid{})) { InvalidLabels(); } std::tie(auc, valid_groups) = RankingAUC(ctx_, predts.ConstHostVector(), info, n_threads); - } else { - std::tie(auc, valid_groups) = - GPURankingPRAUC(ctx_, predts.ConstDeviceSpan(), info, &d_cache_); } return std::make_pair(auc, valid_groups); } diff --git a/src/metric/multiclass_metric.cu b/src/metric/multiclass_metric.cu index e51509fc7339..70738fdf04e9 100644 --- a/src/metric/multiclass_metric.cu +++ b/src/metric/multiclass_metric.cu @@ -131,7 +131,7 @@ class MultiClassMetricsReduction { const HostDeviceVector& preds) { PackedReduceResult result; - if (device.IsCPU()) { + if (!device.IsCUDA()) { result = CpuReduceMetrics(weights, labels, preds, n_class, ctx.Threads()); } diff --git a/src/metric/survival_metric.cu b/src/metric/survival_metric.cu index 9c57be3ab2b5..d8ef7eb95b5d 100644 --- a/src/metric/survival_metric.cu +++ b/src/metric/survival_metric.cu @@ -127,7 +127,7 @@ class ElementWiseSurvivalMetricsReduction { const HostDeviceVector& preds) { PackedReduceResult result; - if (ctx.IsCPU()) { + if (!ctx.IsCUDA()) { result = CpuReduceMetrics(weights, labels_lower_bound, labels_upper_bound, preds, ctx.Threads()); } From 00264eb72bfe1d5dfc8df00563eaa1c08e6dc15d Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sat, 6 Jul 2024 01:15:20 +0800 Subject: [PATCH 09/51] [EM] Basic distributed test for external memory. (#10492) --- python-package/xgboost/testing/__init__.py | 7 +- tests/ci_build/lint_python.py | 1 + .../test_with_dask/test_external_memory.py | 88 +++++++++++++++++++ 3 files changed, 93 insertions(+), 3 deletions(-) create mode 100644 tests/test_distributed/test_with_dask/test_external_memory.py diff --git a/python-package/xgboost/testing/__init__.py b/python-package/xgboost/testing/__init__.py index 482da68c9fc9..e0096c89c9a8 100644 --- a/python-package/xgboost/testing/__init__.py +++ b/python-package/xgboost/testing/__init__.py @@ -248,13 +248,14 @@ def as_arrays( return X, y, w -def make_batches( +def make_batches( # pylint: disable=too-many-arguments,too-many-locals n_samples_per_batch: int, n_features: int, n_batches: int, use_cupy: bool = False, *, vary_size: bool = False, + random_state: int = 1994, ) -> Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray]]: X = [] y = [] @@ -262,9 +263,9 @@ def make_batches( if use_cupy: import cupy - rng = cupy.random.RandomState(1994) + rng = cupy.random.RandomState(random_state) else: - rng = np.random.RandomState(1994) + rng = np.random.RandomState(random_state) for i in range(n_batches): n_samples = n_samples_per_batch + i * 10 if vary_size else n_samples_per_batch _X = rng.randn(n_samples, n_features) diff --git a/tests/ci_build/lint_python.py b/tests/ci_build/lint_python.py index 079996de66fb..f8bbbc2848b0 100644 --- a/tests/ci_build/lint_python.py +++ b/tests/ci_build/lint_python.py @@ -98,6 +98,7 @@ class LintersPaths: "tests/python/test_model_io.py", "tests/test_distributed/test_federated/", "tests/test_distributed/test_gpu_federated/", + "tests/test_distributed/test_with_dask/test_external_memory.py", "tests/test_distributed/test_with_spark/test_data.py", "tests/test_distributed/test_gpu_with_spark/test_data.py", 
"tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py", diff --git a/tests/test_distributed/test_with_dask/test_external_memory.py b/tests/test_distributed/test_with_dask/test_external_memory.py new file mode 100644 index 000000000000..cf475d90f294 --- /dev/null +++ b/tests/test_distributed/test_with_dask/test_external_memory.py @@ -0,0 +1,88 @@ +from typing import List, cast + +import numpy as np +from distributed import Client, Scheduler, Worker, get_worker +from distributed.utils_test import gen_cluster + +import xgboost as xgb +from xgboost import testing as tm +from xgboost.compat import concat + + +def run_external_memory(worker_id: int, n_workers: int, comm_args: dict) -> None: + n_samples_per_batch = 32 + n_features = 4 + n_batches = 16 + use_cupy = False + + n_threads = get_worker().state.nthreads + with xgb.collective.CommunicatorContext(dmlc_communicator="rabit", **comm_args): + it = tm.IteratorForTest( + *tm.make_batches( + n_samples_per_batch, + n_features, + n_batches, + use_cupy, + random_state=worker_id, + ), + cache="cache", + ) + Xy = xgb.DMatrix(it, nthread=n_threads) + results: xgb.callback.TrainingCallback.EvalsLog = {} + booster = xgb.train( + {"tree_method": "hist", "nthread": n_threads}, + Xy, + evals=[(Xy, "Train")], + num_boost_round=32, + evals_result=results, + ) + assert tm.non_increasing(cast(List[float], results["Train"]["rmse"])) + + lx, ly, lw = [], [], [] + for i in range(n_workers): + x, y, w = tm.make_batches( + n_samples_per_batch, + n_features, + n_batches, + use_cupy, + random_state=i, + ) + lx.extend(x) + ly.extend(y) + lw.extend(w) + + X = concat(lx) + yconcat = concat(ly) + wconcat = concat(lw) + Xy = xgb.DMatrix(X, yconcat, wconcat, nthread=n_threads) + + results_local: xgb.callback.TrainingCallback.EvalsLog = {} + booster = xgb.train( + {"tree_method": "hist", "nthread": n_threads}, + Xy, + evals=[(Xy, "Train")], + num_boost_round=32, + evals_result=results_local, + ) + np.testing.assert_allclose( + results["Train"]["rmse"], results_local["Train"]["rmse"], rtol=1e-4 + ) + + +@gen_cluster(client=True) +async def test_external_memory( + client: Client, s: Scheduler, a: Worker, b: Worker +) -> None: + workers = tm.get_client_workers(client) + args = await client.sync( + xgb.dask._get_rabit_args, + len(workers), + None, + client, + ) + n_workers = len(workers) + + futs = client.map( + run_external_memory, range(n_workers), n_workers=n_workers, comm_args=args + ) + await client.gather(futs) From 0a3941be6d927f0177a4bfec37524280e32d8aef Mon Sep 17 00:00:00 2001 From: Dmitry Razdoburdin Date: Sat, 6 Jul 2024 20:10:54 +0200 Subject: [PATCH 10/51] [sycl] Improve build configuration. (#10548) Co-authored-by: Dmitry Razdoburdin <> --- .github/workflows/main.yml | 2 +- .github/workflows/python_tests.yml | 2 +- CMakeLists.txt | 2 -- plugin/CMakeLists.txt | 4 ++-- 4 files changed, 4 insertions(+), 6 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index f5ecb94f68b1..001e17b510a3 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -95,7 +95,7 @@ jobs: run: | mkdir build cd build - cmake .. -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DPLUGIN_SYCL=ON -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX + cmake .. 
-DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DPLUGIN_SYCL=ON -DCMAKE_CXX_COMPILER=g++ -DCMAKE_C_COMPILER=gcc -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX make -j$(nproc) - name: Run gtest binary for SYCL run: | diff --git a/.github/workflows/python_tests.yml b/.github/workflows/python_tests.yml index e6eec86c8606..83f0ad495fc3 100644 --- a/.github/workflows/python_tests.yml +++ b/.github/workflows/python_tests.yml @@ -294,7 +294,7 @@ jobs: run: | mkdir build cd build - cmake .. -DPLUGIN_SYCL=ON -DCMAKE_PREFIX_PATH=$CONDA_PREFIX + cmake .. -DPLUGIN_SYCL=ON -DCMAKE_CXX_COMPILER=g++ -DCMAKE_C_COMPILER=gcc -DCMAKE_PREFIX_PATH=$CONDA_PREFIX make -j$(nproc) - name: Install Python package run: | diff --git a/CMakeLists.txt b/CMakeLists.txt index bb526ad02911..f7cf8a6cfa87 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,8 +1,6 @@ cmake_minimum_required(VERSION 3.18 FATAL_ERROR) if(PLUGIN_SYCL) - set(CMAKE_CXX_COMPILER "g++") - set(CMAKE_C_COMPILER "gcc") string(REPLACE " -isystem ${CONDA_PREFIX}/include" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") endif() diff --git a/plugin/CMakeLists.txt b/plugin/CMakeLists.txt index 5d20e120e902..c0c31f3a100d 100644 --- a/plugin/CMakeLists.txt +++ b/plugin/CMakeLists.txt @@ -10,14 +10,14 @@ if(PLUGIN_SYCL) target_compile_definitions(plugin_sycl PUBLIC -DXGBOOST_USE_SYCL=1) target_link_libraries(plugin_sycl PUBLIC -fsycl) set_target_properties(plugin_sycl PROPERTIES - COMPILE_FLAGS -fsycl + COMPILE_FLAGS "-fsycl -fno-sycl-id-queries-fit-in-int" CXX_STANDARD 17 CXX_STANDARD_REQUIRED ON POSITION_INDEPENDENT_CODE ON) if(USE_OPENMP) find_package(OpenMP REQUIRED) set_target_properties(plugin_sycl PROPERTIES - COMPILE_FLAGS "-fsycl -qopenmp") + COMPILE_FLAGS "-fsycl -fno-sycl-id-queries-fit-in-int -qopenmp") endif() # Get compilation and link flags of plugin_sycl and propagate to objxgboost target_link_libraries(objxgboost PUBLIC plugin_sycl) From 2266db17d15359c3a19d873a9cfa0e794c50d7d9 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Mon, 8 Jul 2024 17:02:46 +0800 Subject: [PATCH 11/51] [R] Update roxygen. 
 (#10556)

---
 R-package/DESCRIPTION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION
index 82d7011de3a4..98d31acf8c6c 100644
--- a/R-package/DESCRIPTION
+++ b/R-package/DESCRIPTION
@@ -66,6 +66,6 @@ Imports:
     data.table (>= 1.9.6),
     jsonlite (>= 1.0)
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.3.1
+RoxygenNote: 7.3.2
 Encoding: UTF-8
 SystemRequirements: GNU make, C++17

From 8d0f2bfbaabe85a74dc375fc3e3d5c4d71906ed3 Mon Sep 17 00:00:00 2001
From: david-cortes
Date: Mon, 8 Jul 2024 13:17:31 +0200
Subject: [PATCH 12/51] [doc] Add more detailed explanations for advanced
 objectives (#10283)

---------

Co-authored-by: Jiaming Yuan
---
 R-package/R/xgb.train.R               |  12 +
 R-package/man/xgb.train.Rd            |  12 +
 demo/guide-python/custom_softmax.py   |   9 +-
 doc/tutorials/advanced_custom_obj.rst | 720 ++++++++++++++++++++++++++
 doc/tutorials/custom_metric_obj.rst   |   5 +-
 doc/tutorials/index.rst               |   1 +
 python-package/xgboost/sklearn.py     |   5 +
 7 files changed, 760 insertions(+), 4 deletions(-)
 create mode 100644 doc/tutorials/advanced_custom_obj.rst

diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R
index 0aa3cdcf1df0..30bf1f1ea149 100644
--- a/R-package/R/xgb.train.R
+++ b/R-package/R/xgb.train.R
@@ -102,6 +102,18 @@
 #'   It might be useful, e.g., for modeling total loss in insurance, or for any outcome that might be
 #'   \href{https://en.wikipedia.org/wiki/Tweedie_distribution#Applications}{Tweedie-distributed}.}
 #' }
+#'
+#' For custom objectives, one should pass a function taking as input the current predictions (as a numeric
+#' vector or matrix) and the training data (as an `xgb.DMatrix` object) that will return a list with elements
+#' `grad` and `hess`, which should be numeric vectors or matrices whose number of rows matches the number
+#' of rows in the training data (same shape as the predictions that are passed as input to the function).
+#' For multi-valued custom objectives, these should have shape `[nrows, ntargets]`. Note that negative values of
+#' the Hessian will be clipped, so one might consider using the expected Hessian (Fisher information) if the
+#' objective is non-convex.
+#'
+#' See the tutorials \href{https://xgboost.readthedocs.io/en/stable/tutorials/custom_metric_obj.html}{
+#' Custom Objective and Evaluation Metric} and \href{https://xgboost.readthedocs.io/en/stable/tutorials/advanced_custom_obj}{
+#' Advanced Usage of Custom Objectives} for more information about custom objectives.
 #' }
 #' \item \code{base_score} the initial prediction score of all instances, global bias. Default: 0.5
 #' \item{ \code{eval_metric} evaluation metrics for validation data.
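For reference, the interface described above amounts to a function of the current predictions and the training data. The following minimal sketch uses plain squared error; the function and variable names are placeholders for illustration and are not part of this patch:

.. code-block:: r

    library(xgboost)

    # Hypothetical example of the documented (preds, dtrain) interface,
    # using squared error, whose derivatives are easy to write down.
    squared_error_obj <- function(preds, dtrain) {
      labels <- getinfo(dtrain, "label")
      grad <- preds - labels         # derivative of 0.5 * (preds - labels)^2
      hess <- rep(1, length(preds))  # second derivative is constant
      list(grad = grad, hess = hess)
    }

A function of this shape is then passed as the ``obj`` argument to ``xgb.train()``.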
diff --git a/R-package/man/xgb.train.Rd b/R-package/man/xgb.train.Rd
index 937020e0dd38..f641b1374420 100644
--- a/R-package/man/xgb.train.Rd
+++ b/R-package/man/xgb.train.Rd
@@ -144,6 +144,18 @@ It might be useful, e.g., for modeling insurance claims severity, or for any out
 It might be useful, e.g., for modeling total loss in insurance, or for any outcome that might be
 \href{https://en.wikipedia.org/wiki/Tweedie_distribution#Applications}{Tweedie-distributed}.}
 }
+
+For custom objectives, one should pass a function taking as input the current predictions (as a numeric
+vector or matrix) and the training data (as an \code{xgb.DMatrix} object) that will return a list with elements
+\code{grad} and \code{hess}, which should be numeric vectors or matrices whose number of rows matches the number
+of rows in the training data (same shape as the predictions that are passed as input to the function).
+For multi-valued custom objectives, these should have shape \verb{[nrows, ntargets]}. Note that negative values of
+the Hessian will be clipped, so one might consider using the expected Hessian (Fisher information) if the
+objective is non-convex.
+
+See the tutorials \href{https://xgboost.readthedocs.io/en/stable/tutorials/custom_metric_obj.html}{
+Custom Objective and Evaluation Metric} and \href{https://xgboost.readthedocs.io/en/stable/tutorials/advanced_custom_obj}{
+Advanced Usage of Custom Objectives} for more information about custom objectives.
 }
 \item \code{base_score} the initial prediction score of all instances, global bias. Default: 0.5
 \item{ \code{eval_metric} evaluation metrics for validation data.

diff --git a/demo/guide-python/custom_softmax.py b/demo/guide-python/custom_softmax.py
index 36265cf4d6c5..2d2ebae2041b 100644
--- a/demo/guide-python/custom_softmax.py
+++ b/demo/guide-python/custom_softmax.py
@@ -6,7 +6,8 @@
 XGBoost returns transformed prediction for multi-class objective function. More details
 in comments.
 
-See :doc:`/tutorials/custom_metric_obj` for detailed tutorial and notes.
+See :doc:`/tutorials/custom_metric_obj` and :doc:`/tutorials/advanced_custom_obj` for
+detailed tutorials and notes.
 
 '''
 
@@ -39,7 +40,9 @@ def softmax(x):
 
 
 def softprob_obj(predt: np.ndarray, data: xgb.DMatrix):
-    '''Loss function. Computing the gradient and approximated hessian (diagonal).
+    '''Loss function. Computing the gradient and upper bound on the
+    Hessian with a diagonal structure for XGBoost (note that this is
+    not the true Hessian).
 
     Reimplements the `multi:softprob` inside XGBoost.
 
@@ -61,7 +64,7 @@ def softprob_obj(predt: np.ndarray, data: xgb.DMatrix):
 
     eps = 1e-6
 
-    # compute the gradient and hessian, slow iterations in Python, only
+    # compute the gradient and hessian upper bound, slow iterations in Python, only
     # suitable for demo.  Also the one in native XGBoost core is more robust to
     # numeric overflow as we don't do anything to mitigate the `exp` in
     # `softmax` here.
diff --git a/doc/tutorials/advanced_custom_obj.rst b/doc/tutorials/advanced_custom_obj.rst
new file mode 100644
index 000000000000..b78cdc292eff
--- /dev/null
+++ b/doc/tutorials/advanced_custom_obj.rst
@@ -0,0 +1,720 @@
+###################################
+Advanced Usage of Custom Objectives
+###################################
+
+**Contents**
+
+.. contents::
+  :backlinks: none
+  :local:
+
+********
+Overview
+********
+
+XGBoost allows optimizing custom user-defined functions based on
+gradients and Hessians provided by the user for the desired objective function.
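+
+As a minimal sketch of what such a user-supplied function looks like (written
+here in R, with a multi-output squared error and placeholder names used purely
+for illustration; the actual objectives developed later in this tutorial follow
+the same structure), the function receives the current predictions together
+with the training data, and returns the gradient and Hessian evaluated at
+those predictions:
+
+.. code-block:: r
+
+    multi_target_sketch <- function(preds, dtrain) {
+      # Assumes the labels have the same [nrows, ntargets] shape as 'preds',
+      # as in the Dirichlet example developed later in this tutorial.
+      y <- getinfo(dtrain, "label")
+      list(
+        grad = preds - y,                           # one column per target
+        hess = matrix(1, nrow(preds), ncol(preds))  # diagonal Hessian entries
+      )
+    }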
+
+In order for a custom objective to work as intended:
+
+- The function to optimize must be smooth and twice differentiable.
+- The function must be additive with respect to rows / observations,
+  such as a likelihood function with i.i.d. assumptions.
+- The range of the scores for the function must be unbounded
+  (i.e. it should not work exclusively with positive numbers, for example).
+- The function must be convex. Note that, if the Hessian has negative
+  values, they will be clipped, which will likely result in a model
+  that does not fit the function well.
+- For multi-output objectives, there should not be dependencies between
+  different targets (i.e. Hessian should be diagonal for each row).
+
+
+Some of these limitations can nevertheless be worked around by forgoing
+the true Hessian of the function, using something else instead such as an
+approximation with better properties - convergence might be slower when
+not using the true Hessian of a function, but many theoretical guarantees
+should still hold and result in usable models. For example, XGBoost's
+internal implementation of multinomial logistic regression uses an upper
+bound on the Hessian with diagonal structure instead of the true Hessian,
+which is a full square matrix for each row in the data.
+
+This tutorial provides some suggestions for use-cases that do not perfectly
+fit the criteria outlined above, by showing how to solve a Dirichlet regression
+parameterized by concentrations.
+
+A Dirichlet regression model poses certain challenges for XGBoost:
+
+- Concentration parameters must be positive. An easy way to achieve this is
+  by applying an 'exp' transform on raw unbounded values, but in such case
+  the objective becomes non-convex. Furthermore, note that this function is
+  not in the exponential family, unlike typical distributions used for GLM
+  models.
+- The Hessian has dependencies between targets - that is, for a Dirichlet
+  distribution with 'k' parameters, each row will have a full Hessian matrix
+  of dimensions ``[k, k]``.
+- An optimal intercept for this type of model would involve a vector of
+  values rather than the same value for every target.
+
+In order to use this type of model as a custom objective:
+
+- It's possible to use the expected Hessian (a.k.a. the Fisher information
+  matrix or expected information) instead of the true Hessian. The expected
+  Hessian is always positive semi-definite for an additive likelihood, even
+  if the true Hessian isn't.
+- It's possible to use an upper bound on the expected Hessian with a diagonal
+  structure, such that a second-order approximation under this diagonal
+  bound would always yield greater or equal function values than under the
+  non-diagonal expected Hessian.
+- Since the ``base_score`` parameter that XGBoost uses for an intercept is
+  limited to a scalar, one can use the ``base_margin`` functionality instead,
+  but note that using it requires a bit more effort.
+
+*****************************
+Dirichlet Regression Formulae
+*****************************
+
+The Dirichlet distribution is a generalization of the Beta distribution to
+multiple dimensions. It models proportions data in which the values sum to
+1, and is typically used as part of composite models (e.g. Dirichlet-multinomial)
+or as a prior in Bayesian models, but it also can be used on its own for
+proportions data for example.
+
+Its likelihood for a given observation with values ``y`` and a given prediction ``x``
+is given as follows:
+
+..
math:: + L(\mathbf{y} | \mathbf{x}) = \frac{1}{\beta(\mathbf{x})} \prod_{i=1}^k y_i^{x_i - 1} + +Where: + +.. math:: + \beta(\mathbf{x}) = \frac{ \prod_{i=1}^k \Gamma(x_i) }{\Gamma( \sum_{i=1}^k x_i )} + + +In this case, we want to optimize the negative of the log-likelihood summed across rows. +The resulting function, gradient and Hessian could be implemented as follows: + +.. code-block:: python + :caption: Python + + import numpy as np + from scipy.special import loggamma, psi as digamma, polygamma + trigamma = lambda x: polygamma(1, x) + + def dirichlet_fun(pred: np.ndarray, Y: np.ndarray) -> float: + epred = np.exp(pred) + sum_epred = np.sum(epred, axis=1, keepdims=True) + return ( + loggamma(epred).sum() + - loggamma(sum_epred).sum() + - np.sum(np.log(Y) * (epred - 1)) + ) + def dirichlet_grad(pred: np.ndarray, Y: np.ndarray) -> np.ndarray: + epred = np.exp(pred) + return epred * ( + digamma(epred) + - digamma(np.sum(epred, axis=1, keepdims=True)) + - np.log(Y) + ) + def dirichlet_hess(pred: np.ndarray, Y: np.ndarray) -> np.ndarray: + epred = np.exp(pred) + grad = dirichlet_grad(pred, Y) + k = Y.shape[1] + H = np.empty((pred.shape[0], k, k)) + for row in range(pred.shape[0]): + H[row, :, :] = ( + - trigamma(epred[row].sum()) * np.outer(epred[row], epred[row]) + + np.diag(grad[row] + trigamma(epred[row]) * epred[row] ** 2) + ) + return H + +.. code-block:: r + :caption: R + + softmax <- function(x) { + max.x <- max(x) + e <- exp(x - max.x) + return(e / sum(e)) + } + + dirichlet.fun <- function(pred, y) { + epred <- exp(pred) + sum_epred <- rowSums(epred) + return( + sum(lgamma(epred)) + - sum(lgamma(sum_epred)) + - sum(log(y) * (epred - 1)) + ) + } + + dirichlet.grad <- function(pred, y) { + epred <- exp(pred) + return( + epred * ( + digamma(epred) + - digamma(rowSums(epred)) + - log(y) + ) + ) + } + + dirichlet.hess <- function(pred, y) { + epred <- exp(pred) + grad <- dirichlet.grad(pred, y) + k <- ncol(y) + H <- array(dim = c(nrow(y), k, k)) + for (row in seq_len(nrow(y))) { + H[row, , ] <- ( + - trigamma(sum(epred[row,])) * tcrossprod(epred[row,]) + + diag(grad[row,] + trigamma(epred[row,]) * epred[row,]^2) + ) + } + return(H) + } + + +Convince yourself that the implementation is correct: + +.. 
code-block:: python
+    :caption: Python
+
+    from math import isclose
+    from scipy import stats
+    from scipy.optimize import check_grad
+    from scipy.special import softmax
+
+    def gen_random_dirichlet(rng: np.random.Generator, m: int, k: int):
+        alpha = np.exp(rng.standard_normal(size=k))
+        return rng.dirichlet(alpha, size=m)
+
+    def test_dirichlet_fun_grad_hess():
+        k = 3
+        m = 10
+        rng = np.random.default_rng(seed=123)
+        Y = gen_random_dirichlet(rng, m, k)
+        x0 = rng.standard_normal(size=k)
+        for row in range(Y.shape[0]):
+            fun_row = dirichlet_fun(x0.reshape((1,-1)), Y[[row]])
+            ref_logpdf = stats.dirichlet.logpdf(
+                Y[row] / Y[row].sum(),  # <- avoid roundoff error
+                np.exp(x0),
+            )
+            assert isclose(fun_row, -ref_logpdf)
+
+            gdiff = check_grad(
+                lambda pred: dirichlet_fun(pred.reshape((1,-1)), Y[[row]]),
+                lambda pred: dirichlet_grad(pred.reshape((1,-1)), Y[[row]]),
+                x0
+            )
+            assert gdiff <= 1e-6
+
+            H_numeric = np.empty((k,k))
+            eps = 1e-7
+            for ii in range(k):
+                x0_plus_eps = x0.reshape((1,-1)).copy()
+                x0_plus_eps[0,ii] += eps
+                for jj in range(k):
+                    H_numeric[ii, jj] = (
+                        dirichlet_grad(x0_plus_eps, Y[[row]])[0][jj]
+                        - dirichlet_grad(x0.reshape((1,-1)), Y[[row]])[0][jj]
+                    ) / eps
+            H = dirichlet_hess(x0.reshape((1,-1)), Y[[row]])[0]
+            np.testing.assert_almost_equal(H, H_numeric, decimal=6)
+    test_dirichlet_fun_grad_hess()
+
+
+.. code-block:: r
+    :caption: R
+
+    library(DirichletReg)
+    library(testthat)
+
+    test_that("dirichlet formulae", {
+      k <- 3L
+      m <- 10L
+      set.seed(123)
+      alpha <- exp(rnorm(k))
+      y <- rdirichlet(m, alpha)
+      x0 <- rnorm(k)
+
+      for (row in seq_len(m)) {
+        logpdf <- dirichlet.fun(matrix(x0, nrow=1), y[row,,drop=F])
+        ref_logpdf <- ddirichlet(y[row,,drop=F], exp(x0), log = T)
+        expect_equal(logpdf, -ref_logpdf)
+
+        eps <- 1e-7
+        grad_num <- numeric(k)
+        for (col in seq_len(k)) {
+          xplus <- x0
+          xplus[col] <- x0[col] + eps
+          grad_num[col] <- (
+            dirichlet.fun(matrix(xplus, nrow=1), y[row,,drop=F])
+            - dirichlet.fun(matrix(x0, nrow=1), y[row,,drop=F])
+          ) / eps
+        }
+
+        grad <- dirichlet.grad(matrix(x0, nrow=1), y[row,,drop=F])
+        expect_equal(grad |> as.vector(), grad_num, tolerance=1e-6)
+
+        H_numeric <- array(dim=c(k, k))
+        for (ii in seq_len(k)) {
+          xplus <- x0
+          xplus[ii] <- x0[ii] + eps
+          for (jj in seq_len(k)) {
+            H_numeric[ii, jj] <- (
+              dirichlet.grad(matrix(xplus, nrow=1), y[row,,drop=F])[1, jj]
+              - grad[1L, jj]
+            ) / eps
+          }
+        }
+
+        # evaluate the analytic Hessian at x0, matching the numeric Hessian
+        H <- dirichlet.hess(matrix(x0, nrow=1), y[row,,drop=F])
+        expect_equal(H[1,,], H_numeric, tolerance=1e-6)
+      }
+    })
+
+******************************************
+Dirichlet Regression as Objective Function
+******************************************
+
+As mentioned earlier, the Hessian of this function is problematic for
+XGBoost: it can have a negative determinant, and might even have negative
+values in the diagonal, which is problematic for optimization methods - in
+XGBoost, those values would be clipped and the resulting model might not
+end up producing sensible predictions.
+
+A potential workaround is to use the expected Hessian instead - that is,
+the expected outer product of the gradient if the response variable were
+distributed according to what is predicted. See the Wikipedia article
+for more information:
+
+`<https://en.wikipedia.org/wiki/Fisher_information>`_
+
+In general, for objective functions in the exponential family, this is easy
+to obtain from the gradient of the link function and the variance of the
+probability distribution, but for other functions in general, it might
+involve other types of calculations (e.g.
covariances and covariances of
+logarithms for Dirichlet).
+
+It nevertheless results in a form very similar to the Hessian. One can also
+see from the differences here that, at an optimal point (gradient being zero),
+the expected and true Hessian for Dirichlet will match, which is a nice
+property for optimization (i.e. the Hessian will be positive at a stationary
+point, which means it will be a minimum rather than a maximum or saddle point).
+
+.. code-block:: python
+    :caption: Python
+
+    def dirichlet_expected_hess(pred: np.ndarray) -> np.ndarray:
+        epred = np.exp(pred)
+        k = pred.shape[1]
+        Ehess = np.empty((pred.shape[0], k, k))
+        for row in range(pred.shape[0]):
+            Ehess[row, :, :] = (
+                - trigamma(epred[row].sum()) * np.outer(epred[row], epred[row])
+                + np.diag(trigamma(epred[row]) * epred[row] ** 2)
+            )
+        return Ehess
+    def test_dirichlet_expected_hess():
+        k = 3
+        rng = np.random.default_rng(seed=123)
+        x0 = rng.standard_normal(size=k)
+        y_sample = rng.dirichlet(np.exp(x0), size=5_000_000)
+        x_broadcast = np.broadcast_to(x0, (y_sample.shape[0], k))
+        g_sample = dirichlet_grad(x_broadcast, y_sample)
+        ref = (g_sample.T @ g_sample) / y_sample.shape[0]
+        Ehess = dirichlet_expected_hess(x0.reshape((1,-1)))[0]
+        np.testing.assert_almost_equal(Ehess, ref, decimal=2)
+    test_dirichlet_expected_hess()
+
+.. code-block:: r
+    :caption: R
+
+    dirichlet.expected.hess <- function(pred) {
+      epred <- exp(pred)
+      k <- ncol(pred)
+      H <- array(dim = c(nrow(pred), k, k))
+      for (row in seq_len(nrow(pred))) {
+        H[row, , ] <- (
+          - trigamma(sum(epred[row,])) * tcrossprod(epred[row,])
+          + diag(trigamma(epred[row,]) * epred[row,]^2)
+        )
+      }
+      return(H)
+    }
+
+    test_that("expected hess", {
+      k <- 3L
+      set.seed(123)
+      x0 <- rnorm(k)
+      alpha <- exp(x0)
+      n.samples <- 5e6
+      y.samples <- rdirichlet(n.samples, alpha)
+
+      x.broadcast <- rep(x0, n.samples) |> matrix(ncol=k, byrow=T)
+      grad.samples <- dirichlet.grad(x.broadcast, y.samples)
+      ref <- crossprod(grad.samples) / n.samples
+      Ehess <- dirichlet.expected.hess(matrix(x0, nrow=1))
+      expect_equal(Ehess[1,,], ref, tolerance=1e-2)
+    })
+
+But note that this is still not usable for XGBoost, since the expected
+Hessian, just like the true Hessian, has shape ``[nrows, k, k]``, while
+XGBoost requires something with shape ``[nrows, k]``.
+
+One may use the diagonal of the expected Hessian for each row, but it's
+possible to do better: one can use instead an upper bound with diagonal
+structure, since it should lead to better convergence properties, just like
+for other Hessian-based optimization methods.
+
+In the absence of any obvious way of obtaining an upper bound, a possibility
+here is to construct such a bound numerically based directly on the definition
+of a diagonally dominant matrix:
+
+`<https://en.wikipedia.org/wiki/Diagonally_dominant_matrix>`_
+
+That is: take the absolute value of the expected Hessian for each row of the data,
+and sum by rows of the ``[k, k]``-shaped Hessian for that row in the data:
+
+.. code-block:: python
+    :caption: Python
+
+    def dirichlet_diag_upper_bound_expected_hess(
+        pred: np.ndarray, Y: np.ndarray
+    ) -> np.ndarray:
+        Ehess = dirichlet_expected_hess(pred)
+        diag_bound_Ehess = np.empty((pred.shape[0], Y.shape[1]))
+        for row in range(pred.shape[0]):
+            diag_bound_Ehess[row, :] = np.abs(Ehess[row, :, :]).sum(axis=1)
+        return diag_bound_Ehess
+
+..
code-block:: r
+    :caption: R
+
+    dirichlet.diag.upper.bound.expected.hess <- function(pred, y) {
+      Ehess <- dirichlet.expected.hess(pred)
+      diag.bound.Ehess <- array(dim=dim(pred))
+      for (row in seq_len(nrow(pred))) {
+        diag.bound.Ehess[row,] <- abs(Ehess[row,,]) |> rowSums()
+      }
+      return(diag.bound.Ehess)
+    }
+
+(*note: the calculation can be made more efficient than what is shown here
+by not calculating the full matrix, and in R, by making the rows be the last
+dimension and transposing after the fact*)
+
+With all these pieces in place, one can now frame this model in the format
+required for XGBoost's custom objectives:
+
+.. code-block:: python
+    :caption: Python
+
+    import xgboost as xgb
+    from typing import Tuple
+
+    def dirichlet_xgb_objective(
+        pred: np.ndarray, dtrain: xgb.DMatrix
+    ) -> Tuple[np.ndarray, np.ndarray]:
+        Y = dtrain.get_label().reshape(pred.shape)
+        return (
+            dirichlet_grad(pred, Y),
+            dirichlet_diag_upper_bound_expected_hess(pred, Y),
+        )
+
+.. code-block:: r
+    :caption: R
+
+    library(xgboost)
+
+    dirichlet.xgb.objective <- function(pred, dtrain) {
+      y <- getinfo(dtrain, "label")
+      return(
+        list(
+          grad = dirichlet.grad(pred, y),
+          hess = dirichlet.diag.upper.bound.expected.hess(pred, y)
+        )
+      )
+    }
+
+And for monitoring, an evaluation metric based on the Dirichlet log-likelihood:
+
+.. code-block:: python
+    :caption: Python
+
+    def dirichlet_eval_metric(
+        pred: np.ndarray, dtrain: xgb.DMatrix
+    ) -> Tuple[str, float]:
+        Y = dtrain.get_label().reshape(pred.shape)
+        return "dirichlet_ll", dirichlet_fun(pred, Y)
+
+.. code-block:: r
+    :caption: R
+
+    dirichlet.eval.metric <- function(pred, dtrain) {
+      y <- getinfo(dtrain, "label")
+      ll <- dirichlet.fun(pred, y)
+      return(
+        list(
+          metric = "dirichlet_ll",
+          value = ll
+        )
+      )
+    }
+
+*****************
+Practical Example
+*****************
+
+A good source for test datasets for proportions data is the R package ``DirichletReg``:
+
+`<https://cran.r-project.org/package=DirichletReg>`_
+
+For this example, we'll now use the Arctic Lake dataset
+(Aitchison, J. (2003). The Statistical Analysis of Compositional Data. The Blackburn Press, Caldwell, NJ.),
+taken from the ``DirichletReg`` R package, which consists of 39 rows with one predictor variable 'depth'
+and a three-valued response variable denoting the sediment composition of the measurements in this arctic
+lake (sand, silt, clay).
+
+The data:
+
+.. code-block:: python
+    :caption: Python
+
+    # depth
+    X = np.array([
+        10.4,11.7,12.8,13,15.7,16.3,18,18.7,20.7,22.1,
+        22.4,24.4,25.8,32.5,33.6,36.8,37.8,36.9,42.2,47,
+        47.1,48.4,49.4,49.5,59.2,60.1,61.7,62.4,69.3,73.6,
+        74.4,78.5,82.9,87.7,88.1,90.4,90.6,97.7,103.7,
+    ]).reshape((-1,1))
+    # sand, silt, clay
+    Y = np.array([
+        [0.775,0.195,0.03], [0.719,0.249,0.032], [0.507,0.361,0.132],
+        [0.522,0.409,0.066], [0.7,0.265,0.035], [0.665,0.322,0.013],
+        [0.431,0.553,0.016], [0.534,0.368,0.098], [0.155,0.544,0.301],
+        [0.317,0.415,0.268], [0.657,0.278,0.065], [0.704,0.29,0.006],
+        [0.174,0.536,0.29], [0.106,0.698,0.196], [0.382,0.431,0.187],
+        [0.108,0.527,0.365], [0.184,0.507,0.309], [0.046,0.474,0.48],
+        [0.156,0.504,0.34], [0.319,0.451,0.23], [0.095,0.535,0.37],
+        [0.171,0.48,0.349], [0.105,0.554,0.341], [0.048,0.547,0.41],
+        [0.026,0.452,0.522], [0.114,0.527,0.359], [0.067,0.469,0.464],
+        [0.069,0.497,0.434], [0.04,0.449,0.511], [0.074,0.516,0.409],
+        [0.048,0.495,0.457], [0.045,0.485,0.47], [0.066,0.521,0.413],
+        [0.067,0.473,0.459], [0.074,0.456,0.469], [0.06,0.489,0.451],
+        [0.063,0.538,0.399], [0.025,0.48,0.495], [0.02,0.478,0.502],
+    ])
+
+..
code-block:: r
+    :caption: R
+
+    data("ArcticLake", package="DirichletReg")
+    x <- ArcticLake[, c("depth"), drop=F]
+    y <- ArcticLake[, c("sand", "silt", "clay")] |> as.matrix()
+
+Fitting an XGBoost model and making predictions:
+
+.. code-block:: python
+    :caption: Python
+
+    from typing import Dict, List
+
+    dtrain = xgb.DMatrix(X, label=Y)
+    results: Dict[str, Dict[str, List[float]]] = {}
+    booster = xgb.train(
+        params={
+            "tree_method": "hist",
+            "num_target": Y.shape[1],
+            "base_score": 0,
+            "disable_default_eval_metric": True,
+            "max_depth": 3,
+            "seed": 123,
+        },
+        dtrain=dtrain,
+        num_boost_round=10,
+        obj=dirichlet_xgb_objective,
+        evals=[(dtrain, "Train")],
+        evals_result=results,
+        custom_metric=dirichlet_eval_metric,
+    )
+    yhat = softmax(booster.inplace_predict(X), axis=1)
+
+.. code-block:: r
+    :caption: R
+
+    dtrain <- xgb.DMatrix(x, y)
+    booster <- xgb.train(
+      params = list(
+        tree_method="hist",
+        num_target=ncol(y),
+        base_score=0,
+        disable_default_eval_metric=TRUE,
+        max_depth=3,
+        seed=123
+      ),
+      data = dtrain,
+      nrounds = 10,
+      obj = dirichlet.xgb.objective,
+      evals = list(Train=dtrain),
+      eval_metric = dirichlet.eval.metric
+    )
+    raw.pred <- predict(booster, x, reshape=TRUE)
+    yhat <- apply(raw.pred, 1, softmax) |> t()
+
+
+Should produce an evaluation log as follows (note: the function is decreasing as
+expected - but unlike other objectives, the minimum value here can reach below zero):
+
+.. code-block:: none
+
+    [0] Train-dirichlet_ll:-40.25009
+    [1] Train-dirichlet_ll:-47.69122
+    [2] Train-dirichlet_ll:-52.64620
+    [3] Train-dirichlet_ll:-56.36977
+    [4] Train-dirichlet_ll:-59.33048
+    [5] Train-dirichlet_ll:-61.93359
+    [6] Train-dirichlet_ll:-64.17280
+    [7] Train-dirichlet_ll:-66.29709
+    [8] Train-dirichlet_ll:-68.21001
+    [9] Train-dirichlet_ll:-70.03442
+
+One can confirm that the obtained ``yhat`` resembles the actual concentrations
+to a large degree, beyond what would be expected from random predictions, by a
+simple look at both ``yhat`` and ``Y``.
+
+For better results, one might want to add an intercept. XGBoost only
+allows using scalars for intercepts, but for a vector-valued model,
+the optimal intercept should also have vector form.
+
+This can be done by supplying ``base_margin`` instead - unlike the
+intercept, one must specifically supply values for every row here,
+and said ``base_margin`` must be supplied again at the moment of making
+predictions (i.e. does not get added automatically like ``base_score``
+does).
+
+For the case of a Dirichlet model, the optimal intercept can be obtained
+efficiently using a general solver (e.g. SciPy's Newton solver) with
+dedicated likelihood, gradient and Hessian functions for just the intercept part.
+Further, note that if one frames it instead as bounded optimization without
+applying 'exp' transform to the concentrations, it becomes a convex
+problem, for which the true Hessian can be used without issues in other
+classes of solvers.
+
+For simplicity, this example will nevertheless reuse the same likelihood
+and gradient functions that were defined earlier along with SciPy's / R's
+L-BFGS solver to obtain the optimal vector-valued intercept:
+
+..
code-block:: python + :caption: Python + + from scipy.optimize import minimize + + def get_optimal_intercepts(Y: np.ndarray) -> np.ndarray: + k = Y.shape[1] + res = minimize( + fun=lambda pred: dirichlet_fun( + np.broadcast_to(pred, (Y.shape[0], k)), + Y + ), + x0=np.zeros(k), + jac=lambda pred: dirichlet_grad( + np.broadcast_to(pred, (Y.shape[0], k)), + Y + ).sum(axis=0) + ) + return res["x"] + intercepts = get_optimal_intercepts(Y) + +.. code-block:: r + :caption: R + + get.optimal.intercepts <- function(y) { + k <- ncol(y) + broadcast.vec <- function(x) rep(x, nrow(y)) |> matrix(ncol=k, byrow=T) + res <- optim( + par = numeric(k), + fn = function(x) dirichlet.fun(broadcast.vec(x), y), + gr = function(x) dirichlet.grad(broadcast.vec(x), y) |> colSums(), + method = "L-BFGS-B" + ) + return(res$par) + } + intercepts <- get.optimal.intercepts(y) + + +Now fitting a model again, this time with the intercept: + +.. code-block:: python + :caption: Python + + base_margin = np.broadcast_to(intercepts, Y.shape) + dtrain_w_intercept = xgb.DMatrix(X, label=Y, base_margin=base_margin) + results: Dict[str, Dict[str, List[float]]] = {} + booster = xgb.train( + params={ + "tree_method": "hist", + "num_target": Y.shape[1], + "base_score": 0, + "disable_default_eval_metric": True, + "max_depth": 3, + "seed": 123, + }, + dtrain=dtrain_w_intercept, + num_boost_round=10, + obj=dirichlet_xgb_objective, + evals=[(dtrain, "Train")], + evals_result=results, + custom_metric=dirichlet_eval_metric, + ) + yhat = softmax( + booster.predict( + xgb.DMatrix(X, base_margin=base_margin) + ), + axis=1 + ) + +.. code-block:: r + :caption: R + + base.margin <- rep(intercepts, nrow(y)) |> matrix(nrow=nrow(y), byrow=T) + dtrain <- xgb.DMatrix(x, y, base_margin=base.margin) + booster <- xgb.train( + params = list( + tree_method="hist", + num_target=ncol(y), + base_score=0, + disable_default_eval_metric=TRUE, + max_depth=3, + seed=123 + ), + data = dtrain, + nrounds = 10, + obj = dirichlet.xgb.objective, + evals = list(Train=dtrain), + eval_metric = dirichlet.eval.metric + ) + raw.pred <- predict( + booster, + x, + base_margin=base.margin, + reshape=TRUE + ) + yhat <- apply(raw.pred, 1, softmax) |> t() + +.. code-block:: none + + [0] Train-dirichlet_ll:-37.01861 + [1] Train-dirichlet_ll:-42.86120 + [2] Train-dirichlet_ll:-46.55133 + [3] Train-dirichlet_ll:-49.15111 + [4] Train-dirichlet_ll:-51.02638 + [5] Train-dirichlet_ll:-52.53880 + [6] Train-dirichlet_ll:-53.77409 + [7] Train-dirichlet_ll:-54.88851 + [8] Train-dirichlet_ll:-55.95961 + [9] Train-dirichlet_ll:-56.95497 + +For this small example problem, predictions should be very similar between the +two and the version without intercepts achieved a lower objective function in the +training data (for the Python version at least), but for more serious usage with +real-world data, one is likely to observe better results when adding the intercepts. diff --git a/doc/tutorials/custom_metric_obj.rst b/doc/tutorials/custom_metric_obj.rst index 36bd0c8d65d5..51491e85c656 100644 --- a/doc/tutorials/custom_metric_obj.rst +++ b/doc/tutorials/custom_metric_obj.rst @@ -15,7 +15,7 @@ Overview XGBoost is designed to be an extensible library. One way to extend it is by providing our own objective function for training and corresponding metric for performance monitoring. This document introduces implementing a customized elementwise evaluation metric and -objective for XGBoost. Although the introduction uses Python for demonstration, the +objective for XGBoost. 
Although the introduction uses Python for demonstration, the
 concepts should be readily applicable to other language bindings.
 
 .. note::
@@ -23,6 +23,9 @@ concepts should be readily applicable to other language bindings.
   * The ranking task does not support customized functions.
   * Breaking change was made in XGBoost 1.6.
 
+See also the advanced usage example for more information about limitations and
+workarounds for more complex objectives: :doc:`/tutorials/advanced_custom_obj`
+
 In the following two sections, we will provide a step by step walk through of
 implementing the ``Squared Log Error (SLE)`` objective function:
 
diff --git a/doc/tutorials/index.rst b/doc/tutorials/index.rst
index c82abf43f452..eca01e1ddeb4 100644
--- a/doc/tutorials/index.rst
+++ b/doc/tutorials/index.rst
@@ -30,5 +30,6 @@ See `Awesome XGBoost `_ for mo
   input_format
   param_tuning
   custom_metric_obj
+  advanced_custom_obj
   intercept
   privacy_preserving
\ No newline at end of file
diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py
index 560a3a8ed285..6c19a6205e7d 100644
--- a/python-package/xgboost/sklearn.py
+++ b/python-package/xgboost/sklearn.py
@@ -517,6 +517,11 @@ def task(i: int) -> float:
             The value of the gradient for each sample point.
         hess: array_like of shape [n_samples]
             The value of the second derivative for each sample point
+
+            Note that, if the custom objective produces negative values for
+            the Hessian, these will be clipped. If the objective is non-convex,
+            one might also consider using the expected Hessian (Fisher
+            information) instead.
 """

From 3ec74a1ba920c5307c2b597891badf81d2be04b4 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Wed, 10 Jul 2024 04:05:20 +0800
Subject: [PATCH 13/51] [doc] Add `build_info` to autodoc. [skip ci] (#10551)

---
 doc/python/python_api.rst | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/doc/python/python_api.rst b/doc/python/python_api.rst
index 4ba520fe46eb..86da4fda0cfc 100644
--- a/doc/python/python_api.rst
+++ b/doc/python/python_api.rst
@@ -14,6 +14,8 @@ Global Configuration
 
 .. autofunction:: xgboost.get_config
 
+.. autofunction:: xgboost.build_info
+
 Core Data Structure
 -------------------
 .. automodule:: xgboost.core

From 8e2b874b4cab23db4f290863b252ec4ede504527 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Wed, 10 Jul 2024 13:00:57 +0800
Subject: [PATCH 14/51] [doc] Add notes about RMM and device ordinal. [skip
 ci] (#10562)

- Remove the experimental tag; we have been running it for a long time now.
- Add notes about avoiding setting the CUDA device.
- Add a link in the parameter document.

---
 demo/rmm_plugin/README.rst | 18 ++++++++++++++----
 doc/parameter.rst          |  6 +++++-
 2 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/demo/rmm_plugin/README.rst b/demo/rmm_plugin/README.rst
index 4742507d240d..28b816eb2574 100644
--- a/demo/rmm_plugin/README.rst
+++ b/demo/rmm_plugin/README.rst
@@ -1,5 +1,5 @@
-Using XGBoost with RAPIDS Memory Manager (RMM) plugin (EXPERIMENTAL)
-====================================================================
+Using XGBoost with RAPIDS Memory Manager (RMM) plugin
+=====================================================
 
 `RAPIDS Memory Manager (RMM) <https://github.com/rapidsai/rmm>`__ library provides a collection of
 efficient memory allocators for NVIDIA GPUs. It is now possible to use
@@ -47,5 +47,15 @@ the global configuration ``use_rmm``:
 
    with xgb.config_context(use_rmm=True):
        clf = xgb.XGBClassifier(tree_method="hist", device="cuda")
 
-Depending on the choice of memory pool size or type of allocator, this may have negative
-performance impact.
+Depending on the choice of memory pool size and the type of the allocator, this can make
+memory usage more consistent, at the cost of a slight performance overhead.
+
+*******************************
+No Device Ordinal for Multi-GPU
+*******************************
+
+Since with RMM the memory pool is pre-allocated on a specific device, changing the CUDA
+device ordinal in XGBoost can result in a memory error ``cudaErrorIllegalAddress``. Use the
+``CUDA_VISIBLE_DEVICES`` environment variable instead of the ``device="cuda:1"`` parameter
+for selecting the device. For distributed training, distributed computing frameworks like
+``dask-cuda`` are responsible for device management.
\ No newline at end of file
diff --git a/doc/parameter.rst b/doc/parameter.rst
index 00f0eaea6193..a776559223f4 100644
--- a/doc/parameter.rst
+++ b/doc/parameter.rst
@@ -25,7 +25,11 @@ Global Configuration
 The following parameters can be set in the global scope, using :py:func:`xgboost.config_context()` (Python) or ``xgb.set.config()`` (R).
 
 * ``verbosity``: Verbosity of printing messages.  Valid values of 0 (silent), 1 (warning), 2 (info), and 3 (debug).
-* ``use_rmm``: Whether to use RAPIDS Memory Manager (RMM) to allocate GPU memory. This option is only applicable when XGBoost is built (compiled) with the RMM plugin enabled. Valid values are ``true`` and ``false``.
+
+* ``use_rmm``: Whether to use RAPIDS Memory Manager (RMM) to allocate cache GPU
+  memory. The primary memory is always allocated on the RMM pool when XGBoost is built
+  (compiled) with the RMM plugin enabled. Valid values are ``true`` and ``false``. See
+  :doc:`/python/rmm-examples/index` for details.
 
 ******************
 General Parameters

From baba3e9eb045bdcdddd2ce46951a2e3831239d95 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Wed, 10 Jul 2024 13:01:47 +0800
Subject: [PATCH 15/51] Fix empty partition. (#10559)

---
 src/common/hist_util.cc | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc
index 9b703a3fa13a..7107cb2dee7f 100644
--- a/src/common/hist_util.cc
+++ b/src/common/hist_util.cc
@@ -201,7 +201,7 @@ void RowsWiseBuildHistKernel(Span gpair,
   auto const &row_ptr = gmat.row_ptr.data();
   auto base_rowid = gmat.base_rowid;
-  uint32_t const *offsets = gmat.index.Offset();
+  std::uint32_t const *offsets = gmat.index.Offset();
   // There's no feature-based compression if missing value is present.
   if (kAnyMissing) {
     CHECK(!offsets);
@@ -212,8 +212,11 @@
   auto get_row_ptr = [&](bst_idx_t ridx) {
     return kFirstPage ? row_ptr[ridx] : row_ptr[ridx - base_rowid];
   };
-  auto get_rid = [&](bst_idx_t ridx) { return kFirstPage ? ridx : (ridx - base_rowid); };
+  auto get_rid = [&](bst_idx_t ridx) {
+    return kFirstPage ?
ridx : (ridx - base_rowid); + }; + CHECK_NE(row_indices.Size(), 0); const size_t n_features = get_row_ptr(row_indices.begin[0] + 1) - get_row_ptr(row_indices.begin[0]); auto hist_data = reinterpret_cast(hist.data()); @@ -325,16 +328,20 @@ void BuildHistDispatch(Span gpair, const RowSetCollection::E if (contiguousBlock) { // contiguous memory access, built-in HW prefetching is enough + if (row_indices.Size() == 0) { + return; + } RowsWiseBuildHistKernel(gpair, row_indices, gmat, hist); } else { - const RowSetCollection::Elem span1(row_indices.begin, - row_indices.end - no_prefetch_size); - const RowSetCollection::Elem span2(row_indices.end - no_prefetch_size, - row_indices.end); - - RowsWiseBuildHistKernel(gpair, span1, gmat, hist); + const RowSetCollection::Elem span1(row_indices.begin, row_indices.end - no_prefetch_size); + if (span1.Size() != 0) { + RowsWiseBuildHistKernel(gpair, span1, gmat, hist); + } // no prefetching to avoid loading extra memory - RowsWiseBuildHistKernel(gpair, span2, gmat, hist); + const RowSetCollection::Elem span2(row_indices.end - no_prefetch_size, row_indices.end); + if (span2.Size() != 0) { + RowsWiseBuildHistKernel(gpair, span2, gmat, hist); + } } } } From 34b154c28460836013da822ddcbafaace04eaf27 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 11 Jul 2024 00:43:08 +0800 Subject: [PATCH 16/51] Avoid the use of size_t in the partitioner. (#10541) - Avoid the use of size_t in the partitioner. - Use `Span` instead of `Elem` where `node_id` is not needed. - Remove the `const_cast`. - Make sure the constness is not removed in the `Elem` by making it reference only. size_t is implementation-defined, which causes issue when we want to pass pointer or span. --- src/common/hist_util.cc | 48 +++++------ src/common/hist_util.h | 2 +- src/common/partition_builder.h | 92 ++++++++++----------- src/common/row_set.h | 60 +++++++++----- src/tree/common_row_partitioner.h | 27 +++--- src/tree/hist/evaluate_splits.h | 5 +- src/tree/hist/histogram.h | 8 +- tests/cpp/common/test_partition_builder.cc | 4 +- tests/cpp/tree/hist/test_evaluate_splits.cc | 6 +- tests/cpp/tree/hist/test_histogram.cc | 7 +- tests/cpp/tree/test_approx.cc | 64 +++++++------- tests/cpp/tree/test_quantile_hist.cc | 61 +++++++------- 12 files changed, 203 insertions(+), 181 deletions(-) diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc index 7107cb2dee7f..dfd80cb68c13 100644 --- a/src/common/hist_util.cc +++ b/src/common/hist_util.cc @@ -187,15 +187,14 @@ class GHistBuildingManager { }; template -void RowsWiseBuildHistKernel(Span gpair, - const RowSetCollection::Elem row_indices, const GHistIndexMatrix &gmat, - GHistRow hist) { +void RowsWiseBuildHistKernel(Span gpair, Span row_indices, + const GHistIndexMatrix &gmat, GHistRow hist) { constexpr bool kAnyMissing = BuildingManager::kAnyMissing; constexpr bool kFirstPage = BuildingManager::kFirstPage; using BinIdxType = typename BuildingManager::BinIdxType; - const size_t size = row_indices.Size(); - const size_t *rid = row_indices.begin; + const size_t size = row_indices.size(); + bst_idx_t const *rid = row_indices.data(); auto const *p_gpair = reinterpret_cast(gpair.data()); const BinIdxType *gradient_index = gmat.index.data(); @@ -216,9 +215,9 @@ void RowsWiseBuildHistKernel(Span gpair, return kFirstPage ? 
ridx : (ridx - base_rowid); }; - CHECK_NE(row_indices.Size(), 0); + CHECK_NE(row_indices.size(), 0); const size_t n_features = - get_row_ptr(row_indices.begin[0] + 1) - get_row_ptr(row_indices.begin[0]); + get_row_ptr(row_indices.data()[0] + 1) - get_row_ptr(row_indices.data()[0]); auto hist_data = reinterpret_cast(hist.data()); const uint32_t two{2}; // Each element from 'gpair' and 'hist' contains // 2 FP values: gradient and hessian. @@ -264,14 +263,13 @@ void RowsWiseBuildHistKernel(Span gpair, } template -void ColsWiseBuildHistKernel(Span gpair, - const RowSetCollection::Elem row_indices, const GHistIndexMatrix &gmat, - GHistRow hist) { +void ColsWiseBuildHistKernel(Span gpair, Span row_indices, + const GHistIndexMatrix &gmat, GHistRow hist) { constexpr bool kAnyMissing = BuildingManager::kAnyMissing; constexpr bool kFirstPage = BuildingManager::kFirstPage; using BinIdxType = typename BuildingManager::BinIdxType; - const size_t size = row_indices.Size(); - const size_t *rid = row_indices.begin; + const size_t size = row_indices.size(); + bst_idx_t const *rid = row_indices.data(); auto const *pgh = reinterpret_cast(gpair.data()); const BinIdxType *gradient_index = gmat.index.data(); @@ -315,31 +313,31 @@ void ColsWiseBuildHistKernel(Span gpair, } template -void BuildHistDispatch(Span gpair, const RowSetCollection::Elem row_indices, +void BuildHistDispatch(Span gpair, Span row_indices, const GHistIndexMatrix &gmat, GHistRow hist) { if (BuildingManager::kReadByColumn) { ColsWiseBuildHistKernel(gpair, row_indices, gmat, hist); } else { - const size_t nrows = row_indices.Size(); + const size_t nrows = row_indices.size(); const size_t no_prefetch_size = Prefetch::NoPrefetchSize(nrows); // if need to work with all rows from bin-matrix (e.g. root node) const bool contiguousBlock = - (row_indices.begin[nrows - 1] - row_indices.begin[0]) == (nrows - 1); + (row_indices.begin()[nrows - 1] - row_indices.begin()[0]) == (nrows - 1); if (contiguousBlock) { - // contiguous memory access, built-in HW prefetching is enough - if (row_indices.Size() == 0) { + if (row_indices.empty()) { return; } + // contiguous memory access, built-in HW prefetching is enough RowsWiseBuildHistKernel(gpair, row_indices, gmat, hist); } else { - const RowSetCollection::Elem span1(row_indices.begin, row_indices.end - no_prefetch_size); - if (span1.Size() != 0) { + auto span1 = row_indices.subspan(0, row_indices.size() - no_prefetch_size); + if (!span1.empty()) { RowsWiseBuildHistKernel(gpair, span1, gmat, hist); } // no prefetching to avoid loading extra memory - const RowSetCollection::Elem span2(row_indices.end - no_prefetch_size, row_indices.end); - if (span2.Size() != 0) { + auto span2 = row_indices.subspan(row_indices.size() - no_prefetch_size); + if (!span2.empty()) { RowsWiseBuildHistKernel(gpair, span2, gmat, hist); } } @@ -347,7 +345,7 @@ void BuildHistDispatch(Span gpair, const RowSetCollection::E } template -void BuildHist(Span gpair, const RowSetCollection::Elem row_indices, +void BuildHist(Span gpair, Span row_indices, const GHistIndexMatrix &gmat, GHistRow hist, bool force_read_by_column) { /* force_read_by_column is used for testing the columnwise building of histograms. 
* default force_read_by_column = false @@ -365,13 +363,11 @@ void BuildHist(Span gpair, const RowSetCollection::Elem row_ }); } -template void BuildHist(Span gpair, - const RowSetCollection::Elem row_indices, +template void BuildHist(Span gpair, Span row_indices, const GHistIndexMatrix &gmat, GHistRow hist, bool force_read_by_column); -template void BuildHist(Span gpair, - const RowSetCollection::Elem row_indices, +template void BuildHist(Span gpair, Span row_indices, const GHistIndexMatrix &gmat, GHistRow hist, bool force_read_by_column); } // namespace xgboost::common diff --git a/src/common/hist_util.h b/src/common/hist_util.h index 2e24f68ffb0a..559093bb5a3f 100644 --- a/src/common/hist_util.h +++ b/src/common/hist_util.h @@ -635,7 +635,7 @@ class ParallelGHistBuilder { // construct a histogram via histogram aggregation template -void BuildHist(Span gpair, const RowSetCollection::Elem row_indices, +void BuildHist(Span gpair, Span row_indices, const GHistIndexMatrix& gmat, GHistRow hist, bool force_read_by_column = false); } // namespace common } // namespace xgboost diff --git a/src/common/partition_builder.h b/src/common/partition_builder.h index 53e11bd91d1a..98c876e849a0 100644 --- a/src/common/partition_builder.h +++ b/src/common/partition_builder.h @@ -1,5 +1,5 @@ /** - * Copyright 2021-2023 by Contributors + * Copyright 2021-2024, XGBoost Contributors * \file row_set.h * \brief Quick Utility to compute subset of rows * \author Philip Cho, Tianqi Chen @@ -16,7 +16,6 @@ #include #include -#include "../tree/hist/expand_entry.h" #include "categorical.h" #include "column_matrix.h" #include "xgboost/context.h" @@ -54,23 +53,23 @@ class PartitionBuilder { // Handle dense columns // Analog of std::stable_partition, but in no-inplace manner template - inline std::pair PartitionKernel(ColumnType* p_column, - common::Span row_indices, - common::Span left_part, - common::Span right_part, - size_t base_rowid, Predicate&& pred) { + std::pair PartitionKernel(ColumnType* p_column, + common::Span row_indices, + common::Span left_part, + common::Span right_part, + bst_idx_t base_rowid, Predicate&& pred) { auto& column = *p_column; - size_t* p_left_part = left_part.data(); - size_t* p_right_part = right_part.data(); - size_t nleft_elems = 0; - size_t nright_elems = 0; + bst_idx_t* p_left_part = left_part.data(); + bst_idx_t* p_right_part = right_part.data(); + bst_idx_t nleft_elems = 0; + bst_idx_t nright_elems = 0; auto p_row_indices = row_indices.data(); auto n_samples = row_indices.size(); for (size_t i = 0; i < n_samples; ++i) { auto rid = p_row_indices[i]; - const int32_t bin_id = column[rid - base_rowid]; + bst_bin_t const bin_id = column[rid - base_rowid]; if (any_missing && bin_id == ColumnType::kMissingId) { if (default_left) { p_left_part[nleft_elems++] = rid; @@ -90,14 +89,14 @@ class PartitionBuilder { } template - inline std::pair PartitionRangeKernel(common::Span ridx, - common::Span left_part, - common::Span right_part, + inline std::pair PartitionRangeKernel(common::Span ridx, + common::Span left_part, + common::Span right_part, Pred pred) { - size_t* p_left_part = left_part.data(); - size_t* p_right_part = right_part.data(); - size_t nleft_elems = 0; - size_t nright_elems = 0; + bst_idx_t* p_left_part = left_part.data(); + bst_idx_t* p_right_part = right_part.data(); + bst_idx_t nleft_elems = 0; + bst_idx_t nright_elems = 0; for (auto row_id : ridx) { if (pred(row_id)) { p_left_part[nleft_elems++] = row_id; @@ -112,10 +111,10 @@ class PartitionBuilder { void Partition(const size_t 
node_in_set, std::vector const& nodes, const common::Range1d range, const bst_bin_t split_cond, GHistIndexMatrix const& gmat, const common::ColumnMatrix& column_matrix, - const RegTree& tree, const size_t* rid) { - common::Span rid_span(rid + range.begin(), rid + range.end()); - common::Span left = GetLeftBuffer(node_in_set, range.begin(), range.end()); - common::Span right = GetRightBuffer(node_in_set, range.begin(), range.end()); + const RegTree& tree, bst_idx_t const* rid) { + common::Span rid_span{rid + range.begin(), rid + range.end()}; + common::Span left = GetLeftBuffer(node_in_set, range.begin(), range.end()); + common::Span right = GetRightBuffer(node_in_set, range.begin(), range.end()); std::size_t nid = nodes[node_in_set].nid; bst_feature_t fid = tree.SplitIndex(nid); bool default_left = tree.DefaultLeft(nid); @@ -184,8 +183,9 @@ class PartitionBuilder { } template - void MaskKernel(ColumnType* p_column, common::Span row_indices, size_t base_rowid, - BitVector* decision_bits, BitVector* missing_bits, Predicate&& pred) { + void MaskKernel(ColumnType* p_column, common::Span row_indices, + bst_idx_t base_rowid, BitVector* decision_bits, BitVector* missing_bits, + Predicate&& pred) { auto& column = *p_column; for (auto const row_id : row_indices) { auto const bin_id = column[row_id - base_rowid]; @@ -205,9 +205,9 @@ class PartitionBuilder { template void MaskRows(const size_t node_in_set, std::vector const& nodes, const common::Range1d range, bst_bin_t split_cond, GHistIndexMatrix const& gmat, - const common::ColumnMatrix& column_matrix, const RegTree& tree, const size_t* rid, - BitVector* decision_bits, BitVector* missing_bits) { - common::Span rid_span(rid + range.begin(), rid + range.end()); + const common::ColumnMatrix& column_matrix, const RegTree& tree, + bst_idx_t const* rid, BitVector* decision_bits, BitVector* missing_bits) { + common::Span rid_span{rid + range.begin(), rid + range.end()}; std::size_t nid = nodes[node_in_set].nid; bst_feature_t fid = tree.SplitIndex(nid); bool is_cat = tree.GetSplitTypes()[nid] == FeatureType::kCategorical; @@ -263,11 +263,11 @@ class PartitionBuilder { template void PartitionByMask(const size_t node_in_set, std::vector const& nodes, const common::Range1d range, GHistIndexMatrix const& gmat, - const RegTree& tree, const size_t* rid, BitVector const& decision_bits, + const RegTree& tree, bst_idx_t const* rid, BitVector const& decision_bits, BitVector const& missing_bits) { - common::Span rid_span(rid + range.begin(), rid + range.end()); - common::Span left = GetLeftBuffer(node_in_set, range.begin(), range.end()); - common::Span right = GetRightBuffer(node_in_set, range.begin(), range.end()); + common::Span rid_span(rid + range.begin(), rid + range.end()); + common::Span left = GetLeftBuffer(node_in_set, range.begin(), range.end()); + common::Span right = GetRightBuffer(node_in_set, range.begin(), range.end()); std::size_t nid = nodes[node_in_set].nid; bool default_left = tree.DefaultLeft(nid); @@ -299,12 +299,12 @@ class PartitionBuilder { } } - common::Span GetLeftBuffer(int nid, size_t begin, size_t end) { + common::Span GetLeftBuffer(int nid, size_t begin, size_t end) { const size_t task_idx = GetTaskIdx(nid, begin); return { mem_blocks_.at(task_idx)->Left(), end - begin }; } - common::Span GetRightBuffer(int nid, size_t begin, size_t end) { + common::Span GetRightBuffer(int nid, size_t begin, size_t end) { const size_t task_idx = GetTaskIdx(nid, begin); return { mem_blocks_.at(task_idx)->Right(), end - begin }; } @@ -346,14 +346,14 @@ 
class PartitionBuilder { } } - void MergeToArray(int nid, size_t begin, size_t* rows_indexes) { + void MergeToArray(bst_node_t nid, size_t begin, bst_idx_t* rows_indexes) { size_t task_idx = GetTaskIdx(nid, begin); - size_t* left_result = rows_indexes + mem_blocks_[task_idx]->n_offset_left; - size_t* right_result = rows_indexes + mem_blocks_[task_idx]->n_offset_right; + bst_idx_t* left_result = rows_indexes + mem_blocks_[task_idx]->n_offset_left; + bst_idx_t* right_result = rows_indexes + mem_blocks_[task_idx]->n_offset_right; - const size_t* left = mem_blocks_[task_idx]->Left(); - const size_t* right = mem_blocks_[task_idx]->Right(); + bst_idx_t const* left = mem_blocks_[task_idx]->Left(); + bst_idx_t const* right = mem_blocks_[task_idx]->Right(); std::copy_n(left, mem_blocks_[task_idx]->n_left, left_result); std::copy_n(right, mem_blocks_[task_idx]->n_right, right_result); @@ -377,10 +377,10 @@ class PartitionBuilder { return; } CHECK(tree.IsLeaf(node.node_id)); - if (node.begin) { // guard for empty node. - size_t ptr_offset = node.end - p_begin; + if (node.begin()) { // guard for empty node. + size_t ptr_offset = node.end() - p_begin; CHECK_LE(ptr_offset, row_set.Data()->size()) << node.node_id; - for (auto idx = node.begin; idx != node.end; ++idx) { + for (auto idx = node.begin(); idx != node.end(); ++idx) { h_pos[*idx] = sampledp(*idx) ? ~node.node_id : node.node_id; } } @@ -395,16 +395,16 @@ class PartitionBuilder { size_t n_offset_left; size_t n_offset_right; - size_t* Left() { + bst_idx_t* Left() { return &left_data_[0]; } - size_t* Right() { + bst_idx_t* Right() { return &right_data_[0]; } private: - size_t left_data_[BlockSize]; - size_t right_data_[BlockSize]; + bst_idx_t left_data_[BlockSize]; + bst_idx_t right_data_[BlockSize]; }; std::vector> left_right_nodes_sizes_; std::vector blocks_offsets_; diff --git a/src/common/row_set.h b/src/common/row_set.h index acb39730539b..8df2a7a36839 100644 --- a/src/common/row_set.h +++ b/src/common/row_set.h @@ -31,15 +31,29 @@ class RowSetCollection { * associated with a particular node in a decision tree. */ struct Elem { - std::size_t const* begin{nullptr}; - std::size_t const* end{nullptr}; + private: + bst_idx_t* begin_{nullptr}; + bst_idx_t* end_{nullptr}; + + public: bst_node_t node_id{-1}; // id of node associated with this instance set; -1 means uninitialized Elem() = default; - Elem(std::size_t const* begin, std::size_t const* end, bst_node_t node_id = -1) - : begin(begin), end(end), node_id(node_id) {} + Elem(bst_idx_t* begin, bst_idx_t* end, bst_node_t node_id = -1) + : begin_(begin), end_(end), node_id(node_id) {} + + // Disable copy ctor to avoid casting away the constness via copy. 
+ Elem(Elem const& that) = delete; + Elem& operator=(Elem const& that) = delete; + Elem(Elem&& that) = default; + Elem& operator=(Elem&& that) = default; - std::size_t Size() const { return end - begin; } + [[nodiscard]] std::size_t Size() const { return std::distance(begin(), end()); } + + [[nodiscard]] bst_idx_t const* begin() const { return this->begin_; } // NOLINT + [[nodiscard]] bst_idx_t const* end() const { return this->end_; } // NOLINT + [[nodiscard]] bst_idx_t* begin() { return this->begin_; } // NOLINT + [[nodiscard]] bst_idx_t* end() { return this->end_; } // NOLINT }; [[nodiscard]] std::vector::const_iterator begin() const { // NOLINT @@ -71,55 +85,57 @@ class RowSetCollection { CHECK(elem_of_each_node_.empty()); if (row_indices_.empty()) { // edge case: empty instance set - constexpr std::size_t* kBegin = nullptr; - constexpr std::size_t* kEnd = nullptr; + constexpr bst_idx_t* kBegin = nullptr; + constexpr bst_idx_t* kEnd = nullptr; static_assert(kEnd - kBegin == 0); elem_of_each_node_.emplace_back(kBegin, kEnd, 0); return; } - const std::size_t* begin = dmlc::BeginPtr(row_indices_); - const std::size_t* end = dmlc::BeginPtr(row_indices_) + row_indices_.size(); + bst_idx_t* begin = row_indices_.data(); + bst_idx_t* end = row_indices_.data() + row_indices_.size(); elem_of_each_node_.emplace_back(begin, end, 0); } - [[nodiscard]] std::vector* Data() { return &row_indices_; } - [[nodiscard]] std::vector const* Data() const { return &row_indices_; } + [[nodiscard]] std::vector* Data() { return &row_indices_; } + [[nodiscard]] std::vector const* Data() const { return &row_indices_; } // split rowset into two void AddSplit(bst_node_t node_id, bst_node_t left_node_id, bst_node_t right_node_id, bst_idx_t n_left, bst_idx_t n_right) { - const Elem e = elem_of_each_node_[node_id]; + Elem& e = elem_of_each_node_[node_id]; - std::size_t* all_begin{nullptr}; - std::size_t* begin{nullptr}; - if (e.begin == nullptr) { + bst_idx_t* all_begin{nullptr}; + bst_idx_t* begin{nullptr}; + bst_idx_t* end{nullptr}; + if (e.begin() == nullptr) { CHECK_EQ(n_left, 0); CHECK_EQ(n_right, 0); } else { all_begin = row_indices_.data(); - begin = all_begin + (e.begin - all_begin); + begin = all_begin + (e.begin() - all_begin); + end = elem_of_each_node_[node_id].end(); } CHECK_EQ(n_left + n_right, e.Size()); - CHECK_LE(begin + n_left, e.end); - CHECK_EQ(begin + n_left + n_right, e.end); + CHECK_LE(begin + n_left, e.end()); + CHECK_EQ(begin + n_left + n_right, e.end()); if (left_node_id >= static_cast(elem_of_each_node_.size())) { - elem_of_each_node_.resize(left_node_id + 1, Elem{nullptr, nullptr, -1}); + elem_of_each_node_.resize(left_node_id + 1); } if (right_node_id >= static_cast(elem_of_each_node_.size())) { - elem_of_each_node_.resize(right_node_id + 1, Elem{nullptr, nullptr, -1}); + elem_of_each_node_.resize(right_node_id + 1); } elem_of_each_node_[left_node_id] = Elem{begin, begin + n_left, left_node_id}; - elem_of_each_node_[right_node_id] = Elem{begin + n_left, e.end, right_node_id}; + elem_of_each_node_[right_node_id] = Elem{begin + n_left, end, right_node_id}; elem_of_each_node_[node_id] = Elem{nullptr, nullptr, -1}; } private: // stores the row indexes in the set - std::vector row_indices_; + std::vector row_indices_; // vector: node_id -> elements std::vector elem_of_each_node_; }; diff --git a/src/tree/common_row_partitioner.h b/src/tree/common_row_partitioner.h index c3065ad5f135..cd267673b66c 100644 --- a/src/tree/common_row_partitioner.h +++ b/src/tree/common_row_partitioner.h @@ -7,7 +7,7 
@@ #define XGBOOST_TREE_COMMON_ROW_PARTITIONER_H_ #include // for all_of, fill -#include // for uint32_t +#include // for uint32_t, int32_t #include // for numeric_limits #include // for vector @@ -18,7 +18,7 @@ #include "../common/partition_builder.h" // for PartitionBuilder #include "../common/row_set.h" // for RowSetCollection #include "../common/threading_utils.h" // for ParallelFor2d -#include "xgboost/base.h" // for bst_row_t +#include "xgboost/base.h" // for bst_idx_t #include "xgboost/collective/result.h" // for Success, SafeColl #include "xgboost/context.h" // for Context #include "xgboost/linalg.h" // for TensorView @@ -46,7 +46,7 @@ class ColumnSplitHelper { void Partition(Context const* ctx, common::BlockedSpace2d const& space, std::int32_t n_threads, GHistIndexMatrix const& gmat, common::ColumnMatrix const& column_matrix, std::vector const& nodes, - std::vector const& split_conditions, RegTree const* p_tree) { + std::vector const& split_conditions, RegTree const* p_tree) { // When data is split by column, we don't have all the feature values in the local worker, so // we first collect all the decisions and whether the feature is missing into bit vectors. std::fill(decision_storage_.begin(), decision_storage_.end(), 0); @@ -56,7 +56,7 @@ class ColumnSplitHelper { bst_bin_t split_cond = column_matrix.IsInitialized() ? split_conditions[node_in_set] : 0; partition_builder_->MaskRows( node_in_set, nodes, r, split_cond, gmat, column_matrix, *p_tree, - (*row_set_collection_)[nid].begin, &decision_bits_, &missing_bits_); + (*row_set_collection_)[nid].begin(), &decision_bits_, &missing_bits_); }); // Then aggregate the bit vectors across all the workers. @@ -74,7 +74,7 @@ class ColumnSplitHelper { const size_t task_id = partition_builder_->GetTaskIdx(node_in_set, begin); partition_builder_->AllocateForTask(task_id); partition_builder_->PartitionByMask(node_in_set, nodes, r, gmat, *p_tree, - (*row_set_collection_)[nid].begin, decision_bits_, + (*row_set_collection_)[nid].begin(), decision_bits_, missing_bits_); }); } @@ -98,10 +98,10 @@ class CommonRowPartitioner { bool is_col_split) : base_rowid{_base_rowid}, is_col_split_{is_col_split} { row_set_collection_.Clear(); - std::vector& row_indices = *row_set_collection_.Data(); + std::vector& row_indices = *row_set_collection_.Data(); row_indices.resize(num_row); - std::size_t* p_row_indices = row_indices.data(); + bst_idx_t* p_row_indices = row_indices.data(); common::Iota(ctx, p_row_indices, p_row_indices + row_indices.size(), base_rowid); row_set_collection_.Init(); @@ -112,7 +112,7 @@ class CommonRowPartitioner { template void FindSplitConditions(const std::vector& nodes, const RegTree& tree, - const GHistIndexMatrix& gmat, std::vector* split_conditions) { + const GHistIndexMatrix& gmat, std::vector* split_conditions) { auto const& ptrs = gmat.cut.Ptrs(); auto const& vals = gmat.cut.Values(); @@ -197,7 +197,7 @@ class CommonRowPartitioner { // 1. 
Find split condition for each split size_t n_nodes = nodes.size(); - std::vector split_conditions; + std::vector split_conditions; if (column_matrix.IsInitialized()) { split_conditions.resize(n_nodes); FindSplitConditions(nodes, *p_tree, gmat, &split_conditions); @@ -206,8 +206,8 @@ class CommonRowPartitioner { // 2.1 Create a blocked space of size SUM(samples in each node) common::BlockedSpace2d space( n_nodes, - [&](size_t node_in_set) { - int32_t nid = nodes[node_in_set].nid; + [&](std::size_t node_in_set) { + auto nid = nodes[node_in_set].nid; return row_set_collection_[nid].Size(); }, kPartitionBlockSize); @@ -236,7 +236,7 @@ class CommonRowPartitioner { bst_bin_t split_cond = column_matrix.IsInitialized() ? split_conditions[node_in_set] : 0; partition_builder_.template Partition( node_in_set, nodes, r, split_cond, gmat, column_matrix, *p_tree, - row_set_collection_[nid].begin); + row_set_collection_[nid].begin()); }); } @@ -248,8 +248,7 @@ class CommonRowPartitioner { // with updated row-indexes for each tree-node common::ParallelFor2d(space, ctx->Threads(), [&](size_t node_in_set, common::Range1d r) { const int32_t nid = nodes[node_in_set].nid; - partition_builder_.MergeToArray(node_in_set, r.begin(), - const_cast(row_set_collection_[nid].begin)); + partition_builder_.MergeToArray(node_in_set, r.begin(), row_set_collection_[nid].begin()); }); // 5. Add info about splits into row_set_collection_ diff --git a/src/tree/hist/evaluate_splits.h b/src/tree/hist/evaluate_splits.h index ba673d85fc60..a260784ad1db 100644 --- a/src/tree/hist/evaluate_splits.h +++ b/src/tree/hist/evaluate_splits.h @@ -739,7 +739,7 @@ void UpdatePredictionCacheImpl(Context const *ctx, RegTree const *p_last_tree, if (!tree[nidx].IsDeleted() && tree[nidx].IsLeaf()) { auto const &rowset = part[nidx]; auto leaf_value = tree[nidx].LeafValue(); - for (const size_t *it = rowset.begin + r.begin(); it < rowset.begin + r.end(); ++it) { + for (auto const *it = rowset.begin() + r.begin(); it < rowset.begin() + r.end(); ++it) { out_preds(*it) += leaf_value; } } @@ -774,7 +774,8 @@ void UpdatePredictionCacheImpl(Context const *ctx, RegTree const *p_last_tree, if (tree.IsLeaf(nidx)) { auto const &rowset = part[nidx]; auto leaf_value = mttree->LeafValue(nidx); - for (std::size_t const *it = rowset.begin + r.begin(); it < rowset.begin + r.end(); ++it) { + for (bst_idx_t const *it = rowset.begin() + r.begin(); it < rowset.begin() + r.end(); + ++it) { for (std::size_t i = 0; i < n_targets; ++i) { out_preds(*it, i) += leaf_value(i); } diff --git a/src/tree/hist/histogram.h b/src/tree/hist/histogram.h index e28cae16597c..1e9dc9c7d53c 100644 --- a/src/tree/hist/histogram.h +++ b/src/tree/hist/histogram.h @@ -76,13 +76,13 @@ class HistogramBuilder { common::ParallelFor2d(space, this->n_threads_, [&](size_t nid_in_set, common::Range1d r) { const auto tid = static_cast(omp_get_thread_num()); bst_node_t const nidx = nodes_to_build[nid_in_set]; - auto elem = row_set_collection[nidx]; + auto const& elem = row_set_collection[nidx]; auto start_of_row_set = std::min(r.begin(), elem.Size()); auto end_of_row_set = std::min(r.end(), elem.Size()); - auto rid_set = common::RowSetCollection::Elem(elem.begin + start_of_row_set, - elem.begin + end_of_row_set, nidx); + auto rid_set = common::Span{elem.begin() + start_of_row_set, + elem.begin() + end_of_row_set}; auto hist = buffer_.GetInitializedHist(tid, nid_in_set); - if (rid_set.Size() != 0) { + if (rid_set.size() != 0) { common::BuildHist(gpair_h, rid_set, gidx, hist, force_read_by_column); } }); 
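The hunks above all serve one C++ idiom worth spelling out: `RowSetCollection::Elem` deletes its copy operations and const-overloads `begin()`/`end()`, so a `const` element can only hand out pointers-to-const, while readers such as `BuildHist` now take a plain span of row indices instead of the whole `Elem`. Below is a minimal, compilable sketch of that const-propagation pattern; `IndexWindow` and `RowIndexT` are illustrative stand-ins, not the actual XGBoost types.

#include <cstddef>   // for size_t
#include <cstdint>   // for uint64_t
#include <iterator>  // for distance

using RowIndexT = std::uint64_t;  // illustrative stand-in for bst_idx_t

// Illustrative stand-in for RowSetCollection::Elem: a [begin, end) window
// into a shared row-index buffer.
class IndexWindow {
  RowIndexT* begin_{nullptr};
  RowIndexT* end_{nullptr};

 public:
  IndexWindow(RowIndexT* b, RowIndexT* e) : begin_{b}, end_{e} {}
  // Copying is deleted: a copy made from an `IndexWindow const&` would be a
  // mutable duplicate, silently casting away the constness of the pointers.
  IndexWindow(IndexWindow const&) = delete;
  IndexWindow& operator=(IndexWindow const&) = delete;
  IndexWindow(IndexWindow&&) = default;
  IndexWindow& operator=(IndexWindow&&) = default;

  // Const overloads hand out pointers-to-const only.
  RowIndexT const* begin() const { return begin_; }
  RowIndexT const* end() const { return end_; }
  RowIndexT* begin() { return begin_; }
  RowIndexT* end() { return end_; }

  std::size_t Size() const { return std::distance(begin(), end()); }
};

With copying disabled, call sites must either bind `auto const&` for read-only access (histogram building, prediction-cache updates) or ask for the mutable buffer explicitly, which is exactly the split that the `MergeToArray` and `UpdatePredictionCacheImpl` changes above enforce.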
diff --git a/tests/cpp/common/test_partition_builder.cc b/tests/cpp/common/test_partition_builder.cc index 08dd345f261f..36fb7a8d9870 100644 --- a/tests/cpp/common/test_partition_builder.cc +++ b/tests/cpp/common/test_partition_builder.cc @@ -1,5 +1,5 @@ /** - * Copyright 2020-2023 by XGBoost contributors + * Copyright 2020-2024, XGBoost contributors */ #include @@ -58,7 +58,7 @@ TEST(PartitionBuilder, BasicTest) { } builder.CalculateRowOffsets(); - std::vector v(*std::max_element(tasks.begin(), tasks.end()) * kBlockSize); + std::vector v(*std::max_element(tasks.begin(), tasks.end()) * kBlockSize); for(size_t nid = 0; nid < kNodes; ++nid) { diff --git a/tests/cpp/tree/hist/test_evaluate_splits.cc b/tests/cpp/tree/hist/test_evaluate_splits.cc index 329379b5b4d6..b7aae1b57e5b 100644 --- a/tests/cpp/tree/hist/test_evaluate_splits.cc +++ b/tests/cpp/tree/hist/test_evaluate_splits.cc @@ -45,7 +45,7 @@ void TestEvaluateSplits(bool force_read_by_column) { // dense, no missing values GHistIndexMatrix gmat(&ctx, dmat.get(), kMaxBins, 0.5, false); common::RowSetCollection row_set_collection; - std::vector &row_indices = *row_set_collection.Data(); + std::vector &row_indices = *row_set_collection.Data(); row_indices.resize(kRows); std::iota(row_indices.begin(), row_indices.end(), 0); row_set_collection.Init(); @@ -53,7 +53,9 @@ void TestEvaluateSplits(bool force_read_by_column) { HistMakerTrainParam hist_param; hist.Reset(gmat.cut.Ptrs().back(), hist_param.max_cached_hist_node); hist.AllocateHistograms({0}); - common::BuildHist(row_gpairs, row_set_collection[0], gmat, hist[0], force_read_by_column); + auto const &elem = row_set_collection[0]; + common::BuildHist(row_gpairs, common::Span{elem.begin(), elem.end()}, gmat, hist[0], + force_read_by_column); // Compute total gradient for all data points GradientPairPrecise total_gpair; diff --git a/tests/cpp/tree/hist/test_histogram.cc b/tests/cpp/tree/hist/test_histogram.cc index 740175c577c5..88c8d1cf6e64 100644 --- a/tests/cpp/tree/hist/test_histogram.cc +++ b/tests/cpp/tree/hist/test_histogram.cc @@ -14,7 +14,6 @@ #include // for max #include // for size_t #include // for int32_t, uint32_t -#include // for function #include // for back_inserter #include // for numeric_limits #include // for shared_ptr, allocator, unique_ptr @@ -108,7 +107,7 @@ void TestSyncHist(bool is_distributed) { common::RowSetCollection row_set_collection; { row_set_collection.Clear(); - std::vector &row_indices = *row_set_collection.Data(); + std::vector &row_indices = *row_set_collection.Data(); row_indices.resize(kNRows); std::iota(row_indices.begin(), row_indices.end(), 0); row_set_collection.Init(); @@ -251,7 +250,7 @@ void TestBuildHistogram(bool is_distributed, bool force_read_by_column, bool is_ common::RowSetCollection row_set_collection; row_set_collection.Clear(); - std::vector &row_indices = *row_set_collection.Data(); + std::vector &row_indices = *row_set_collection.Data(); row_indices.resize(kNRows); std::iota(row_indices.begin(), row_indices.end(), 0); row_set_collection.Init(); @@ -345,7 +344,7 @@ void TestHistogramCategorical(size_t n_categories, bool force_read_by_column) { common::RowSetCollection row_set_collection; row_set_collection.Clear(); - std::vector &row_indices = *row_set_collection.Data(); + std::vector &row_indices = *row_set_collection.Data(); row_indices.resize(kRows); std::iota(row_indices.begin(), row_indices.end(), 0); row_set_collection.Init(); diff --git a/tests/cpp/tree/test_approx.cc b/tests/cpp/tree/test_approx.cc index 
b2949e5952a2..d647d3a970bf 100644 --- a/tests/cpp/tree/test_approx.cc +++ b/tests/cpp/tree/test_approx.cc @@ -3,7 +3,6 @@ */ #include -#include "../../../src/common/numeric.h" #include "../../../src/tree/common_row_partitioner.h" #include "../collective/test_worker.h" // for TestDistributedGlobal #include "../helpers.h" @@ -54,20 +53,23 @@ TEST(Approx, Partitioner) { GetSplit(&tree, split_value, &candidates); partitioner.UpdatePosition(&ctx, page, candidates, &tree); - auto left_nidx = tree[RegTree::kRoot].LeftChild(); - auto elem = partitioner[left_nidx]; - ASSERT_LT(elem.Size(), n_samples); - ASSERT_GT(elem.Size(), 1); - for (auto it = elem.begin; it != elem.end; ++it) { - auto value = page.cut.Values().at(page.index[*it]); - ASSERT_LE(value, split_value); + { + auto left_nidx = tree[RegTree::kRoot].LeftChild(); + auto const& elem = partitioner[left_nidx]; + ASSERT_LT(elem.Size(), n_samples); + ASSERT_GT(elem.Size(), 1); + for (auto& it : elem) { + auto value = page.cut.Values().at(page.index[it]); + ASSERT_LE(value, split_value); + } } - - auto right_nidx = tree[RegTree::kRoot].RightChild(); - elem = partitioner[right_nidx]; - for (auto it = elem.begin; it != elem.end; ++it) { - auto value = page.cut.Values().at(page.index[*it]); - ASSERT_GT(value, split_value) << *it; + { + auto right_nidx = tree[RegTree::kRoot].RightChild(); + auto const& elem = partitioner[right_nidx]; + for (auto& it : elem) { + auto value = page.cut.Values().at(page.index[it]); + ASSERT_GT(value, split_value) << it; + } } } } @@ -99,23 +101,25 @@ void TestColumnSplitPartitioner(size_t n_samples, size_t base_rowid, std::shared RegTree tree; GetSplit(&tree, mid_value, &candidates); partitioner.UpdatePosition(&ctx, page, candidates, &tree); - - auto left_nidx = tree[RegTree::kRoot].LeftChild(); - auto elem = partitioner[left_nidx]; - ASSERT_LT(elem.Size(), n_samples); - ASSERT_GT(elem.Size(), 1); - auto expected_elem = expected_mid_partitioner[left_nidx]; - ASSERT_EQ(elem.Size(), expected_elem.Size()); - for (auto it = elem.begin, eit = expected_elem.begin; it != elem.end; ++it, ++eit) { - ASSERT_EQ(*it, *eit); + { + auto left_nidx = tree[RegTree::kRoot].LeftChild(); + auto const& elem = partitioner[left_nidx]; + ASSERT_LT(elem.Size(), n_samples); + ASSERT_GT(elem.Size(), 1); + auto const& expected_elem = expected_mid_partitioner[left_nidx]; + ASSERT_EQ(elem.Size(), expected_elem.Size()); + for (auto it = elem.begin(), eit = expected_elem.begin(); it != elem.end(); ++it, ++eit) { + ASSERT_EQ(*it, *eit); + } } - - auto right_nidx = tree[RegTree::kRoot].RightChild(); - elem = partitioner[right_nidx]; - expected_elem = expected_mid_partitioner[right_nidx]; - ASSERT_EQ(elem.Size(), expected_elem.Size()); - for (auto it = elem.begin, eit = expected_elem.begin; it != elem.end; ++it, ++eit) { - ASSERT_EQ(*it, *eit); + { + auto right_nidx = tree[RegTree::kRoot].RightChild(); + auto const& elem = partitioner[right_nidx]; + auto const& expected_elem = expected_mid_partitioner[right_nidx]; + ASSERT_EQ(elem.Size(), expected_elem.Size()); + for (auto it = elem.begin(), eit = expected_elem.begin(); it != elem.end(); ++it, ++eit) { + ASSERT_EQ(*it, *eit); + } } } } diff --git a/tests/cpp/tree/test_quantile_hist.cc b/tests/cpp/tree/test_quantile_hist.cc index ce637caa4d46..29ae02f8d2b2 100644 --- a/tests/cpp/tree/test_quantile_hist.cc +++ b/tests/cpp/tree/test_quantile_hist.cc @@ -5,7 +5,6 @@ #include #include -#include #include // for size_t #include #include @@ -68,21 +67,24 @@ void TestPartitioner(bst_target_t n_targets) { } else 
{ GetMultiSplitForTest(&tree, split_value, &candidates); } - auto left_nidx = tree.LeftChild(RegTree::kRoot); partitioner.UpdatePosition(&ctx, gmat, column_indices, candidates, &tree); - - auto elem = partitioner[left_nidx]; - ASSERT_LT(elem.Size(), n_samples); - ASSERT_GT(elem.Size(), 1); - for (auto it = elem.begin; it != elem.end; ++it) { - auto value = gmat.cut.Values().at(gmat.index[*it]); - ASSERT_LE(value, split_value); + { + auto left_nidx = tree.LeftChild(RegTree::kRoot); + auto const& elem = partitioner[left_nidx]; + ASSERT_LT(elem.Size(), n_samples); + ASSERT_GT(elem.Size(), 1); + for (auto& it : elem) { + auto value = gmat.cut.Values().at(gmat.index[it]); + ASSERT_LE(value, split_value); + } } - auto right_nidx = tree.RightChild(RegTree::kRoot); - elem = partitioner[right_nidx]; - for (auto it = elem.begin; it != elem.end; ++it) { - auto value = gmat.cut.Values().at(gmat.index[*it]); - ASSERT_GT(value, split_value); + { + auto right_nidx = tree.RightChild(RegTree::kRoot); + auto const& elem = partitioner[right_nidx]; + for (auto& it : elem) { + auto value = gmat.cut.Values().at(gmat.index[it]); + ASSERT_GT(value, split_value); + } } } } @@ -138,21 +140,24 @@ void VerifyColumnSplitPartitioner(bst_target_t n_targets, size_t n_samples, auto left_nidx = tree.LeftChild(RegTree::kRoot); partitioner.UpdatePosition(&ctx, gmat, column_indices, candidates, &tree); - auto elem = partitioner[left_nidx]; - ASSERT_LT(elem.Size(), n_samples); - ASSERT_GT(elem.Size(), 1); - auto expected_elem = expected_mid_partitioner[left_nidx]; - ASSERT_EQ(elem.Size(), expected_elem.Size()); - for (auto it = elem.begin, eit = expected_elem.begin; it != elem.end; ++it, ++eit) { - ASSERT_EQ(*it, *eit); + { + auto const& elem = partitioner[left_nidx]; + ASSERT_LT(elem.Size(), n_samples); + ASSERT_GT(elem.Size(), 1); + auto const& expected_elem = expected_mid_partitioner[left_nidx]; + ASSERT_EQ(elem.Size(), expected_elem.Size()); + for (auto it = elem.begin(), eit = expected_elem.begin(); it != elem.end(); ++it, ++eit) { + ASSERT_EQ(*it, *eit); + } } - - auto right_nidx = tree.RightChild(RegTree::kRoot); - elem = partitioner[right_nidx]; - expected_elem = expected_mid_partitioner[right_nidx]; - ASSERT_EQ(elem.Size(), expected_elem.Size()); - for (auto it = elem.begin, eit = expected_elem.begin; it != elem.end; ++it, ++eit) { - ASSERT_EQ(*it, *eit); + { + auto right_nidx = tree.RightChild(RegTree::kRoot); + auto const& elem = partitioner[right_nidx]; + auto const& expected_elem = expected_mid_partitioner[right_nidx]; + ASSERT_EQ(elem.Size(), expected_elem.Size()); + for (auto it = elem.begin(), eit = expected_elem.begin(); it != elem.end(); ++it, ++eit) { + ASSERT_EQ(*it, *eit); + } } } } From 5f910cd4fff898b8fc367dbb722c47467b5e6acd Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 11 Jul 2024 03:26:30 +0800 Subject: [PATCH 17/51] [EM] Handle base idx in GPU histogram. 
(#10549) --- src/tree/gpu_hist/histogram.cu | 42 +++---- src/tree/gpu_hist/histogram.cuh | 1 - src/tree/gpu_hist/row_partitioner.cu | 33 ++--- src/tree/gpu_hist/row_partitioner.cuh | 25 ++-- src/tree/updater_gpu_hist.cu | 3 +- tests/cpp/tree/gpu_hist/test_histogram.cu | 115 +++++++++++++++++- .../cpp/tree/gpu_hist/test_row_partitioner.cu | 17 ++- tests/cpp/tree/test_gpu_hist.cu | 2 +- 8 files changed, 167 insertions(+), 71 deletions(-) diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu index cd848c1c0cae..372a5c09ba0c 100644 --- a/src/tree/gpu_hist/histogram.cu +++ b/src/tree/gpu_hist/histogram.cu @@ -1,8 +1,7 @@ /** * Copyright 2020-2024, XGBoost Contributors */ -#include -#include +#include // for make_transform_iterator #include #include // uint32_t, int32_t @@ -101,9 +100,8 @@ GradientQuantiser::GradientQuantiser(Context const* ctx, common::Span(1) / to_floating_point_.GetHess()); } -XGBOOST_DEV_INLINE void -AtomicAddGpairShared(xgboost::GradientPairInt64 *dest, - xgboost::GradientPairInt64 const &gpair) { +XGBOOST_DEV_INLINE void AtomicAddGpairShared(xgboost::GradientPairInt64* dest, + xgboost::GradientPairInt64 const& gpair) { auto dst_ptr = reinterpret_cast(dest); auto g = gpair.GetQuantisedGrad(); auto h = gpair.GetQuantisedHess(); @@ -131,7 +129,9 @@ template d_ridx_; + using Idx = RowPartitioner::RowIndexT; + + dh::LDGIterator d_ridx_; const GradientPair* d_gpair_; const FeatureGroup group_; const EllpackDeviceAccessor& matrix_; @@ -142,8 +142,7 @@ class HistogramAgent { public: __device__ HistogramAgent(GradientPairInt64* smem_arr, GradientPairInt64* __restrict__ d_node_hist, const FeatureGroup& group, - const EllpackDeviceAccessor& matrix, - common::Span d_ridx, + const EllpackDeviceAccessor& matrix, common::Span d_ridx, const GradientQuantiser& rounding, const GradientPair* d_gpair) : smem_arr_(smem_arr), d_node_hist_(d_node_hist), @@ -154,15 +153,15 @@ class HistogramAgent { n_elements_(feature_stride_ * d_ridx.size()), rounding_(rounding), d_gpair_(d_gpair) {} + __device__ void ProcessPartialTileShared(std::size_t offset) { for (std::size_t idx = offset + threadIdx.x; idx < std::min(offset + kBlockThreads * kItemsPerTile, n_elements_); idx += kBlockThreads) { - int ridx = d_ridx_[idx / feature_stride_]; - int gidx = - matrix_ - .gidx_iter[ridx * matrix_.row_stride + group_.start_feature + idx % feature_stride_] - - group_.start_bin; + Idx ridx = d_ridx_[idx / feature_stride_]; + Idx midx = (ridx - matrix_.base_rowid) * matrix_.row_stride + group_.start_feature + + idx % feature_stride_; + bst_bin_t gidx = matrix_.gidx_iter[midx] - group_.start_bin; if (matrix_.is_dense || gidx != matrix_.NumBins()) { auto adjusted = rounding_.ToFixedPoint(d_gpair_[ridx]); AtomicAddGpairShared(smem_arr_ + gidx, adjusted); @@ -188,8 +187,8 @@ class HistogramAgent { #pragma unroll for (int i = 0; i < kItemsPerThread; i++) { gpair[i] = d_gpair_[ridx[i]]; - gidx[i] = matrix_.gidx_iter[ridx[i] * matrix_.row_stride + group_.start_feature + - idx[i] % feature_stride_]; + gidx[i] = matrix_.gidx_iter[(ridx[i] - matrix_.base_rowid) * matrix_.row_stride + + group_.start_feature + idx[i] % feature_stride_]; } #pragma unroll for (int i = 0; i < kItemsPerThread; i++) { @@ -200,7 +199,7 @@ class HistogramAgent { } } __device__ void BuildHistogramWithShared() { - dh::BlockFill(smem_arr_, group_.num_bins, GradientPairInt64()); + dh::BlockFill(smem_arr_, group_.num_bins, GradientPairInt64{}); __syncthreads(); std::size_t offset = blockIdx.x * kItemsPerTile; @@ -219,10 +218,9 @@ class 
HistogramAgent { __device__ void BuildHistogramWithGlobal() { for (auto idx : dh::GridStrideRange(static_cast(0), n_elements_)) { - int ridx = d_ridx_[idx / feature_stride_]; - int gidx = - matrix_ - .gidx_iter[ridx * matrix_.row_stride + group_.start_feature + idx % feature_stride_]; + Idx ridx = d_ridx_[idx / feature_stride_]; + bst_bin_t gidx = matrix_.gidx_iter[(ridx - matrix_.base_rowid) * matrix_.row_stride + + group_.start_feature + idx % feature_stride_]; if (matrix_.is_dense || gidx != matrix_.NumBins()) { auto adjusted = rounding_.ToFixedPoint(d_gpair_[ridx]); AtomicAddGpairGlobal(d_node_hist_ + gidx, adjusted); @@ -231,8 +229,7 @@ class HistogramAgent { } }; -template +template __global__ void __launch_bounds__(kBlockThreads) SharedMemHistKernel(const EllpackDeviceAccessor matrix, const FeatureGroupsAccessor feature_groups, @@ -251,6 +248,7 @@ __global__ void __launch_bounds__(kBlockThreads) agent.BuildHistogramWithGlobal(); } } + namespace { constexpr std::int32_t kBlockThreads = 1024; constexpr std::int32_t kItemsPerThread = 8; diff --git a/src/tree/gpu_hist/histogram.cuh b/src/tree/gpu_hist/histogram.cuh index e30f682082b7..862821b00b63 100644 --- a/src/tree/gpu_hist/histogram.cuh +++ b/src/tree/gpu_hist/histogram.cuh @@ -78,5 +78,4 @@ class DeviceHistogramBuilder { common::Span histogram, GradientQuantiser rounding); }; } // namespace xgboost::tree - #endif // HISTOGRAM_CUH_ diff --git a/src/tree/gpu_hist/row_partitioner.cu b/src/tree/gpu_hist/row_partitioner.cu index 35b43d24bd08..f66fac489da3 100644 --- a/src/tree/gpu_hist/row_partitioner.cu +++ b/src/tree/gpu_hist/row_partitioner.cu @@ -1,28 +1,23 @@ -/*! - * Copyright 2017-2022 XGBoost contributors +/** + * Copyright 2017-2024, XGBoost contributors */ -#include -#include -#include +#include // for sequence -#include +#include // for vector -#include "../../common/device_helpers.cuh" +#include "../../common/cuda_context.cuh" // for CUDAContext +#include "../../common/device_helpers.cuh" // for CopyDeviceSpanToVector, ToSpan #include "row_partitioner.cuh" -namespace xgboost { -namespace tree { - -RowPartitioner::RowPartitioner(DeviceOrd device_idx, size_t num_rows) - : device_idx_(device_idx), ridx_(num_rows), ridx_tmp_(num_rows) { +namespace xgboost::tree { +RowPartitioner::RowPartitioner(Context const* ctx, bst_idx_t n_samples, bst_idx_t base_rowid) + : device_idx_(ctx->Device()), ridx_(n_samples), ridx_tmp_(n_samples) { dh::safe_cuda(cudaSetDevice(device_idx_.ordinal)); - ridx_segments_.emplace_back(NodePositionInfo{Segment(0, num_rows)}); - thrust::sequence(thrust::device, ridx_.data(), ridx_.data() + ridx_.size()); + ridx_segments_.emplace_back(NodePositionInfo{Segment(0, n_samples)}); + thrust::sequence(ctx->CUDACtx()->CTP(), ridx_.data(), ridx_.data() + ridx_.size(), base_rowid); } -RowPartitioner::~RowPartitioner() { - dh::safe_cuda(cudaSetDevice(device_idx_.ordinal)); -} +RowPartitioner::~RowPartitioner() { dh::safe_cuda(cudaSetDevice(device_idx_.ordinal)); } common::Span RowPartitioner::GetRows(bst_node_t nidx) { auto segment = ridx_segments_.at(nidx).segment; @@ -39,6 +34,4 @@ std::vector RowPartitioner::GetRowsHost(bst_node_t ni dh::CopyDeviceSpanToVector(&rows, span); return rows; } - -}; // namespace tree -}; // namespace xgboost +}; // namespace xgboost::tree diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index fde6c4dd0fa9..636de54e6c25 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -1,17 +1,17 @@ -/*! 
- * Copyright 2017-2022 XGBoost contributors +/** + * Copyright 2017-2024, XGBoost contributors */ #pragma once #include +#include // for make_counting_iterator +#include // for make_transform_output_iterator -#include -#include +#include // for max +#include // for vector -#include "../../common/device_helpers.cuh" -#include "xgboost/base.h" -#include "xgboost/context.h" -#include "xgboost/task.h" -#include "xgboost/tree_model.h" +#include "../../common/device_helpers.cuh" // for MakeTransformIterator +#include "xgboost/base.h" // for bst_idx_t +#include "xgboost/context.h" // for Context namespace xgboost { namespace tree { @@ -223,7 +223,12 @@ class RowPartitioner { dh::PinnedMemory pinned2_; public: - RowPartitioner(DeviceOrd device_idx, size_t num_rows); + /** + * @param ctx Context for device ordinal and stream. + * @param n_samples The number of samples in each batch. + * @param base_rowid The base row index for the current batch. + */ + RowPartitioner(Context const* ctx, bst_idx_t n_samples, bst_idx_t base_rowid); ~RowPartitioner(); RowPartitioner(const RowPartitioner&) = delete; RowPartitioner& operator=(const RowPartitioner&) = delete; diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index aa4f8fa27218..366cf3aad08e 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -251,7 +251,8 @@ struct GPUHistMakerDevice { quantiser = std::make_unique(ctx_, this->gpair, dmat->Info()); row_partitioner.reset(); // Release the device memory first before reallocating - row_partitioner = std::make_unique(ctx_->Device(), sample.sample_rows); + CHECK_EQ(page->base_rowid, 0); + row_partitioner = std::make_unique(ctx_, sample.sample_rows, page->base_rowid); // Init histogram hist.Init(ctx_->Device(), page->Cuts().TotalBins()); diff --git a/tests/cpp/tree/gpu_hist/test_histogram.cu b/tests/cpp/tree/gpu_hist/test_histogram.cu index 3b9e6103a57e..d1128446617b 100644 --- a/tests/cpp/tree/gpu_hist/test_histogram.cu +++ b/tests/cpp/tree/gpu_hist/test_histogram.cu @@ -2,13 +2,15 @@ * Copyright 2020-2024, XGBoost Contributors */ #include +#include // for Context -#include +#include // for unique_ptr +#include // for vector #include "../../../../src/tree/gpu_hist/histogram.cuh" -#include "../../../../src/tree/gpu_hist/row_partitioner.cuh" -#include "../../../../src/tree/param.h" // TrainParam -#include "../../categorical_helpers.h" +#include "../../../../src/tree/gpu_hist/row_partitioner.cuh" // for RowPartitioner +#include "../../../../src/tree/param.h" // for TrainParam +#include "../../categorical_helpers.h" // for OneHotEncodeFeature #include "../../helpers.h" namespace xgboost::tree { @@ -24,7 +26,7 @@ void TestDeterministicHistogram(bool is_dense, int shm_size, bool force_global) for (auto const& batch : matrix->GetBatches(&ctx, batch_param)) { auto* page = batch.Impl(); - tree::RowPartitioner row_partitioner(ctx.Device(), kRows); + tree::RowPartitioner row_partitioner{&ctx, kRows, page->base_rowid}; auto ridx = row_partitioner.GetRows(0); bst_bin_t num_bins = kBins * kCols; @@ -129,7 +131,7 @@ void TestGPUHistogramCategorical(size_t num_categories) { auto cat_m = GetDMatrixFromData(x, kRows, 1); cat_m->Info().feature_types.HostVector().push_back(FeatureType::kCategorical); auto batch_param = BatchParam{kBins, tree::TrainParam::DftSparseThreshold()}; - tree::RowPartitioner row_partitioner(ctx.Device(), kRows); + tree::RowPartitioner row_partitioner{&ctx, kRows, 0}; auto ridx = row_partitioner.GetRows(0); dh::device_vector cat_hist(num_categories); auto 
gpair = GenerateRandomGradients(kRows, 0, 2); @@ -262,4 +264,105 @@ TEST(Histogram, Quantiser) { ASSERT_EQ(gh.GetHess(), 1.0); } } +namespace { +class HistogramExternalMemoryTest : public ::testing::TestWithParam> { + public: + void Run(float sparsity, bool force_global) { + bst_idx_t n_samples{512}, n_features{12}, n_batches{3}; + std::vector> partitioners; + auto p_fmat = RandomDataGenerator{n_samples, n_features, sparsity} + .Batches(n_batches) + .GenerateSparsePageDMatrix("cache", true); + bst_bin_t n_bins = 16; + BatchParam p{n_bins, TrainParam::DftSparseThreshold()}; + auto ctx = MakeCUDACtx(0); + + std::unique_ptr fg; + dh::device_vector single_hist; + dh::device_vector multi_hist; + + auto gpair = GenerateRandomGradients(n_samples); + gpair.SetDevice(ctx.Device()); + auto quantiser = GradientQuantiser{&ctx, gpair.ConstDeviceSpan(), p_fmat->Info()}; + std::shared_ptr cuts; + + { + /** + * Multi page. + */ + std::int32_t k{0}; + for (auto const& page : p_fmat->GetBatches(&ctx, p)) { + auto impl = page.Impl(); + if (k == 0) { + // Initialization + auto d_matrix = impl->GetDeviceAccessor(ctx.Device()); + fg = std::make_unique(impl->Cuts()); + auto init = GradientPairInt64{0, 0}; + multi_hist = decltype(multi_hist)(impl->Cuts().TotalBins(), init); + single_hist = decltype(single_hist)(impl->Cuts().TotalBins(), init); + cuts = std::make_shared(impl->Cuts()); + } + + partitioners.emplace_back( + std::make_unique(&ctx, impl->Size(), impl->base_rowid)); + + auto ridx = partitioners.at(k)->GetRows(0); + auto d_histogram = dh::ToSpan(multi_hist); + DeviceHistogramBuilder builder; + builder.Reset(&ctx, fg->DeviceAccessor(ctx.Device()), force_global); + builder.BuildHistogram(ctx.CUDACtx(), impl->GetDeviceAccessor(ctx.Device()), + fg->DeviceAccessor(ctx.Device()), gpair.ConstDeviceSpan(), ridx, + d_histogram, quantiser); + ++k; + } + ASSERT_EQ(k, n_batches); + } + + { + /** + * Single page. 
+ */ + RowPartitioner partitioner{&ctx, p_fmat->Info().num_row_, 0}; + SparsePage concat; + std::vector hess(p_fmat->Info().num_row_, 1.0f); + for (auto const& page : p_fmat->GetBatches()) { + concat.Push(page); + } + EllpackPageImpl page{ + ctx.Device(), cuts, concat, p_fmat->IsDense(), p_fmat->Info().num_col_, {}}; + auto ridx = partitioner.GetRows(0); + auto d_histogram = dh::ToSpan(single_hist); + DeviceHistogramBuilder builder; + builder.Reset(&ctx, fg->DeviceAccessor(ctx.Device()), force_global); + builder.BuildHistogram(ctx.CUDACtx(), page.GetDeviceAccessor(ctx.Device()), + fg->DeviceAccessor(ctx.Device()), gpair.ConstDeviceSpan(), ridx, + d_histogram, quantiser); + } + + std::vector h_single(single_hist.size()); + thrust::copy(single_hist.begin(), single_hist.end(), h_single.begin()); + std::vector h_multi(multi_hist.size()); + thrust::copy(multi_hist.begin(), multi_hist.end(), h_multi.begin()); + + for (std::size_t i = 0; i < single_hist.size(); ++i) { + ASSERT_EQ(h_single[i].GetQuantisedGrad(), h_multi[i].GetQuantisedGrad()); + ASSERT_EQ(h_single[i].GetQuantisedHess(), h_multi[i].GetQuantisedHess()); + } + } +}; +} // namespace + +TEST_P(HistogramExternalMemoryTest, ExternalMemory) { + std::apply(&HistogramExternalMemoryTest::Run, std::tuple_cat(std::make_tuple(this), GetParam())); +} + +INSTANTIATE_TEST_SUITE_P(Histogram, HistogramExternalMemoryTest, ::testing::ValuesIn([]() { + std::vector> params; + for (auto global : {true, false}) { + for (auto sparsity : {0.0f, 0.2f, 0.8f}) { + params.emplace_back(sparsity, global); + } + } + return params; + }())); } // namespace xgboost::tree diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index 14ea6fd70a4e..cf0d505d103d 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -1,25 +1,22 @@ -/*! - * Copyright 2019-2022 by XGBoost Contributors +/** + * Copyright 2019-2024, XGBoost Contributors */ #include #include -#include -#include -#include -#include +#include // for size_t +#include // for uint32_t +#include // for vector #include "../../../../src/tree/gpu_hist/row_partitioner.cuh" #include "../../helpers.h" #include "xgboost/base.h" -#include "xgboost/context.h" -#include "xgboost/task.h" -#include "xgboost/tree_model.h" namespace xgboost::tree { void TestUpdatePositionBatch() { const int kNumRows = 10; - RowPartitioner rp(FstCU(), kNumRows); + auto ctx = MakeCUDACtx(0); + RowPartitioner rp{&ctx, kNumRows, 0}; auto rows = rp.GetRowsHost(0); EXPECT_EQ(rows.size(), kNumRows); for (auto i = 0ull; i < kNumRows; i++) { diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu index 1c156563cda3..200fb39fb4e9 100644 --- a/tests/cpp/tree/test_gpu_hist.cu +++ b/tests/cpp/tree/test_gpu_hist.cu @@ -106,7 +106,7 @@ void TestBuildHist(bool use_shared_memory_histograms) { gpair.SetDevice(ctx.Device()); thrust::host_vector h_gidx_buffer(page->gidx_buffer.HostVector()); - maker.row_partitioner = std::make_unique(ctx.Device(), kNRows); + maker.row_partitioner = std::make_unique(&ctx, kNRows, 0); maker.hist.Init(ctx.Device(), page->Cuts().TotalBins()); maker.hist.AllocateHistograms({0}); From 89da9f974162261b3334e6752a7f3bbe210236bb Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 11 Jul 2024 13:09:18 +0800 Subject: [PATCH 18/51] [fed] Split up federated test CMake file. (#10566) - Collect all federated test files into the same directory. - Independently list the files. 
--- tests/cpp/CMakeLists.txt | 21 ++++++++++++------- tests/cpp/plugin/federated/CMakeLists.txt | 20 ++++++++++++++++++ .../{ => federated}/test_federated_data.cc | 14 ++++++------- .../{ => federated}/test_federated_learner.cc | 13 ++++++------ 4 files changed, 46 insertions(+), 22 deletions(-) create mode 100644 tests/cpp/plugin/federated/CMakeLists.txt rename tests/cpp/plugin/{ => federated}/test_federated_data.cc (85%) rename tests/cpp/plugin/{ => federated}/test_federated_learner.cc (92%) diff --git a/tests/cpp/CMakeLists.txt b/tests/cpp/CMakeLists.txt index 6496f8af45de..deed08165bc2 100644 --- a/tests/cpp/CMakeLists.txt +++ b/tests/cpp/CMakeLists.txt @@ -1,3 +1,6 @@ +# The testxgboost executable is created in the top level CMakeLists. Most of the +# properties and compilation flags are already set. We just need to add source files and +# link gtest here. if(USE_DMLC_GTEST) if(NOT TARGET gtest) message(FATAL_ERROR "USE_DMLC_GTEST=ON but dmlc-core didn't bundle gtest") @@ -6,6 +9,7 @@ if(USE_DMLC_GTEST) else() find_package(GTest REQUIRED) endif() + file(GLOB_RECURSE TEST_SOURCES "*.cc") if(USE_CUDA) @@ -13,6 +17,10 @@ if(USE_CUDA) list(APPEND TEST_SOURCES ${CUDA_TEST_SOURCES}) endif() +# We will add them back later to separate the definition. +file(GLOB_RECURSE FEDERATED_TEST_SOURCES "plugin/federated/*.*") +list(REMOVE_ITEM TEST_SOURCES ${FEDERATED_TEST_SOURCES}) + file(GLOB_RECURSE SYCL_TEST_SOURCES "plugin/test_sycl_*.cc") list(REMOVE_ITEM TEST_SOURCES ${SYCL_TEST_SOURCES}) @@ -48,14 +56,14 @@ if(PLUGIN_SYCL) endif() if(PLUGIN_FEDERATED) - target_include_directories(testxgboost PRIVATE ${xgboost_SOURCE_DIR}/plugin/federated) - target_link_libraries(testxgboost PRIVATE federated_client) -else() - file(GLOB_RECURSE FEDERATED_TEST_SOURCES "plugin/*_federated_*.*") - list(REMOVE_ITEM TEST_SOURCES ${FEDERATED_TEST_SOURCES}) + add_subdirectory(${xgboost_SOURCE_DIR}/tests/cpp/plugin/federated) endif() -target_sources(testxgboost PRIVATE ${TEST_SOURCES} ${xgboost_SOURCE_DIR}/plugin/example/custom_obj.cc) +target_sources( + testxgboost PRIVATE + ${TEST_SOURCES} + ${xgboost_SOURCE_DIR}/plugin/example/custom_obj.cc +) if(USE_CUDA AND PLUGIN_RMM) target_include_directories(testxgboost PRIVATE ${CUDA_INCLUDE_DIRS}) @@ -63,7 +71,6 @@ endif() target_include_directories(testxgboost PRIVATE - ${GTEST_INCLUDE_DIRS} ${xgboost_SOURCE_DIR}/include ${xgboost_SOURCE_DIR}/dmlc-core/include) target_link_libraries(testxgboost diff --git a/tests/cpp/plugin/federated/CMakeLists.txt b/tests/cpp/plugin/federated/CMakeLists.txt new file mode 100644 index 000000000000..f85304e31ac1 --- /dev/null +++ b/tests/cpp/plugin/federated/CMakeLists.txt @@ -0,0 +1,20 @@ +target_sources( + testxgboost PRIVATE + ${xgboost_SOURCE_DIR}/tests/cpp/plugin/federated/test_federated_coll.cc + ${xgboost_SOURCE_DIR}/tests/cpp/plugin/federated/test_federated_comm.cc + ${xgboost_SOURCE_DIR}/tests/cpp/plugin/federated/test_federated_comm_group.cc + ${xgboost_SOURCE_DIR}/tests/cpp/plugin/federated/test_federated_tracker.cc + ${xgboost_SOURCE_DIR}/tests/cpp/plugin/federated/test_federated_learner.cc + ${xgboost_SOURCE_DIR}/tests/cpp/plugin/federated/test_federated_data.cc +) + +if(USE_CUDA) + target_sources( + testxgboost PRIVATE + ${xgboost_SOURCE_DIR}/tests/cpp/plugin/federated/test_federated_coll.cu + ${xgboost_SOURCE_DIR}/tests/cpp/plugin/federated/test_federated_comm_group.cu + ) +endif() + +target_include_directories(testxgboost PRIVATE ${xgboost_SOURCE_DIR}/plugin/federated) +target_link_libraries(testxgboost PRIVATE 
federated_client) diff --git a/tests/cpp/plugin/test_federated_data.cc b/tests/cpp/plugin/federated/test_federated_data.cc similarity index 85% rename from tests/cpp/plugin/test_federated_data.cc rename to tests/cpp/plugin/federated/test_federated_data.cc index d0f649152bd4..664f85ac8926 100644 --- a/tests/cpp/plugin/test_federated_data.cc +++ b/tests/cpp/plugin/federated/test_federated_data.cc @@ -1,15 +1,13 @@ -/*! - * Copyright 2023 XGBoost contributors +/** + * Copyright 2023-2024, XGBoost contributors */ #include #include -#include - -#include "../../../src/collective/communicator-inl.h" -#include "../filesystem.h" -#include "../helpers.h" -#include "federated/test_worker.h" +#include "../../../../src/collective/communicator-inl.h" +#include "../../filesystem.h" +#include "../../helpers.h" +#include "test_worker.h" namespace xgboost { diff --git a/tests/cpp/plugin/test_federated_learner.cc b/tests/cpp/plugin/federated/test_federated_learner.cc similarity index 92% rename from tests/cpp/plugin/test_federated_learner.cc rename to tests/cpp/plugin/federated/test_federated_learner.cc index 948914e0fa6b..ed0bbcb3b749 100644 --- a/tests/cpp/plugin/test_federated_learner.cc +++ b/tests/cpp/plugin/federated/test_federated_learner.cc @@ -1,19 +1,18 @@ /** * Copyright 2023-2024, XGBoost contributors * - * Some other tests for federated learning are in the main test suite (test_learner.cc), - * gaurded by the `XGBOOST_USE_FEDERATED`. + * Some other tests for federated learning are in the main test suite (test_learner.cc). */ #include #include #include #include -#include "../../../src/collective/communicator-inl.h" -#include "../../../src/common/linalg_op.h" // for begin, end -#include "../helpers.h" -#include "../objective_helpers.h" // for MakeObjNamesForTest, ObjTestNameGenerator -#include "federated/test_worker.h" +#include "../../../../src/collective/communicator-inl.h" +#include "../../../../src/common/linalg_op.h" // for begin, end +#include "../../helpers.h" +#include "../../objective_helpers.h" // for MakeObjNamesForTest, ObjTestNameGenerator +#include "test_worker.h" namespace xgboost { namespace { From 1ca4bfd20e442f0a26a90e5f65218bb4af0e7c16 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 11 Jul 2024 17:29:27 +0800 Subject: [PATCH 19/51] Avoid thrust vector initialization. (#10544) * Avoid thrust vector initialization. - Add a wrapper for rmm device uvector. - Split up the `Resize` method for HDV. 
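For context on why the initialization matters: `thrust::device_vector<T>::resize` value-initializes every new element, i.e. it launches a fill kernel, which is pure overhead for buffers that are overwritten immediately (for example the row partitioner's `ridx_`, which `thrust::sequence` fills right after allocation in the patch above), whereas `rmm::device_uvector` leaves new storage uninitialized. A rough sketch of the resulting two-overload `Resize` contract, written against plain CUDA; `UninitDeviceBuffer` is an illustration, not the actual `DeviceUVector` wrapper, and error handling plus stream arguments are omitted.

#include <cuda_runtime.h>
#include <thrust/device_ptr.h>  // for device_pointer_cast
#include <thrust/fill.h>        // for fill

#include <algorithm>  // for min
#include <cstddef>    // for size_t

template <typename T>
class UninitDeviceBuffer {
  T* data_{nullptr};
  std::size_t size_{0};

 public:
  UninitDeviceBuffer() = default;
  UninitDeviceBuffer(UninitDeviceBuffer const&) = delete;
  UninitDeviceBuffer& operator=(UninitDeviceBuffer const&) = delete;
  ~UninitDeviceBuffer() { cudaFree(data_); }

  std::size_t Size() const { return size_; }
  T* Data() { return data_; }

  // Resize without touching the new elements: allocate, copy the old
  // prefix, free. No fill kernel runs, unlike device_vector::resize.
  void Resize(std::size_t n) {
    T* fresh{nullptr};
    cudaMalloc(&fresh, n * sizeof(T));
    if (data_ != nullptr) {
      cudaMemcpy(fresh, data_, std::min(n, size_) * sizeof(T), cudaMemcpyDeviceToDevice);
      cudaFree(data_);
    }
    data_ = fresh;
    size_ = n;
  }

  // Resize and value-initialize only the newly added tail, mirroring the
  // HostDeviceVector::Resize(new_size, v) overload declared in the header
  // diff below.
  void Resize(std::size_t n, T v) {
    std::size_t old_size = this->size_;
    this->Resize(n);
    if (n > old_size) {
      thrust::fill(thrust::device_pointer_cast(data_) + old_size,
                   thrust::device_pointer_cast(data_) + n, v);
    }
  }
};

Splitting the overloads also keeps call sites honest: a plain `Resize(n)` is now an explicit statement that the caller will write every element before reading it.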
--- include/xgboost/host_device_vector.h | 4 +- src/common/cuda_context.cuh | 2 +- src/common/device_helpers.cuh | 260 ++------------- src/common/device_vector.cu | 27 ++ src/common/device_vector.cuh | 330 ++++++++++++++++++++ src/common/host_device_vector.cc | 5 + src/common/host_device_vector.cu | 81 ++--- src/common/quantile.cuh | 12 +- src/metric/auc.cu | 3 +- src/tree/updater_gpu_hist.cu | 4 +- tests/cpp/common/test_device_vector.cu | 21 ++ tests/cpp/common/test_host_device_vector.cu | 39 ++- tests/cpp/data/test_array_interface.h | 13 +- 13 files changed, 510 insertions(+), 291 deletions(-) create mode 100644 src/common/device_vector.cu create mode 100644 src/common/device_vector.cuh create mode 100644 tests/cpp/common/test_device_vector.cu diff --git a/include/xgboost/host_device_vector.h b/include/xgboost/host_device_vector.h index 9a53d38583ca..36c7ed32b83d 100644 --- a/include/xgboost/host_device_vector.h +++ b/include/xgboost/host_device_vector.h @@ -135,7 +135,9 @@ class HostDeviceVector { void SetDevice(DeviceOrd device) const; - void Resize(size_t new_size, T v = T()); + void Resize(std::size_t new_size); + /** @brief Resize and initialize the data if the new size is larger than the old size. */ + void Resize(std::size_t new_size, T v); using value_type = T; // NOLINT diff --git a/src/common/cuda_context.cuh b/src/common/cuda_context.cuh index c8b2e07927c9..7e1db8e3bf2f 100644 --- a/src/common/cuda_context.cuh +++ b/src/common/cuda_context.cuh @@ -18,7 +18,7 @@ struct CUDAContext { * \brief Caching thrust policy. */ auto CTP() const { -#if THRUST_MAJOR_VERSION >= 2 +#if THRUST_MAJOR_VERSION >= 2 || defined(XGBOOST_USE_RMM) return thrust::cuda::par_nosync(caching_alloc_).on(dh::DefaultStream()); #else return thrust::cuda::par(caching_alloc_).on(dh::DefaultStream()); diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index 7cd00f6f6112..1754c9507036 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -1,26 +1,21 @@ /** - * Copyright 2017-2023 XGBoost contributors + * Copyright 2017-2024, XGBoost contributors */ #pragma once -#include // thrust::upper_bound -#include -#include -#include +#include // thrust::upper_bound +#include // for device_ptr +#include // for device_vector #include // thrust::seq -#include // gather -#include +#include // for discard_iterator #include // make_transform_output_iterator -#include -#include #include #include -#include #include #include #include // for size_t #include -#include +#include // for UnitWord #include #include #include @@ -28,22 +23,14 @@ #include "../collective/communicator-inl.h" #include "common.h" +#include "device_vector.cuh" #include "xgboost/host_device_vector.h" #include "xgboost/logging.h" #include "xgboost/span.h" -#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 -#include "rmm/mr/device/per_device_resource.hpp" -#include "rmm/mr/device/thrust_allocator_adaptor.hpp" -#include "rmm/version_config.hpp" - -#if !defined(RMM_VERSION_MAJOR) || !defined(RMM_VERSION_MINOR) -#error "Please use RMM version 0.18 or later" -#elif RMM_VERSION_MAJOR == 0 && RMM_VERSION_MINOR < 18 -#error "Please use RMM version 0.18 or later" -#endif // !defined(RMM_VERSION_MAJOR) || !defined(RMM_VERSION_MINOR) - -#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 +#if defined(XGBOOST_USE_RMM) +#include +#endif // defined(XGBOOST_USE_RMM) #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 || defined(__clang__) @@ -285,91 +272,6 @@ void Iota(Container array, cudaStream_t stream) { 
LaunchN(array.size(), stream, [=] __device__(size_t i) { array[i] = i; }); } -namespace detail { -/** \brief Keeps track of global device memory allocations. Thread safe.*/ -class MemoryLogger { - // Information for a single device - struct DeviceStats { - size_t currently_allocated_bytes{ 0 }; - size_t peak_allocated_bytes{ 0 }; - size_t num_allocations{ 0 }; - size_t num_deallocations{ 0 }; - std::map device_allocations; - void RegisterAllocation(void *ptr, size_t n) { - device_allocations[ptr] = n; - currently_allocated_bytes += n; - peak_allocated_bytes = std::max(peak_allocated_bytes, currently_allocated_bytes); - num_allocations++; - CHECK_GT(num_allocations, num_deallocations); - } - void RegisterDeallocation(void *ptr, size_t n, int current_device) { - auto itr = device_allocations.find(ptr); - if (itr == device_allocations.end()) { - LOG(WARNING) << "Attempting to deallocate " << n << " bytes on device " << current_device - << " that was never allocated\n" - << dmlc::StackTrace(); - } else { - num_deallocations++; - CHECK_LE(num_deallocations, num_allocations); - currently_allocated_bytes -= itr->second; - device_allocations.erase(itr); - } - } - }; - DeviceStats stats_; - std::mutex mutex_; - -public: - void RegisterAllocation(void *ptr, size_t n) { - if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) { - return; - } - std::lock_guard guard(mutex_); - int current_device; - safe_cuda(cudaGetDevice(¤t_device)); - stats_.RegisterAllocation(ptr, n); - } - void RegisterDeallocation(void *ptr, size_t n) { - if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) { - return; - } - std::lock_guard guard(mutex_); - int current_device; - safe_cuda(cudaGetDevice(¤t_device)); - stats_.RegisterDeallocation(ptr, n, current_device); - } - size_t PeakMemory() const { - return stats_.peak_allocated_bytes; - } - size_t CurrentlyAllocatedBytes() const { - return stats_.currently_allocated_bytes; - } - void Clear() - { - stats_ = DeviceStats(); - } - - void Log() { - if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) { - return; - } - std::lock_guard guard(mutex_); - int current_device; - safe_cuda(cudaGetDevice(¤t_device)); - LOG(CONSOLE) << "======== Device " << current_device << " Memory Allocations: " - << " ========"; - LOG(CONSOLE) << "Peak memory usage: " - << stats_.peak_allocated_bytes / 1048576 << "MiB"; - LOG(CONSOLE) << "Number of allocations: " << stats_.num_allocations; - } -}; -} // namespace detail - -inline detail::MemoryLogger &GlobalMemoryLogger() { - static detail::MemoryLogger memory_logger; - return memory_logger; -} - // dh::DebugSyncDevice(__FILE__, __LINE__); inline void DebugSyncDevice(std::string file="", int32_t line = -1) { if (file != "" && line != -1) { @@ -380,134 +282,6 @@ inline void DebugSyncDevice(std::string file="", int32_t line = -1) { safe_cuda(cudaGetLastError()); } -namespace detail { - -#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 -template -using XGBBaseDeviceAllocator = rmm::mr::thrust_allocator; -#else // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 -template -using XGBBaseDeviceAllocator = thrust::device_malloc_allocator; -#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 - -inline void ThrowOOMError(std::string const& err, size_t bytes) { - auto device = CurrentDevice(); - auto rank = xgboost::collective::GetRank(); - std::stringstream ss; - ss << "Memory allocation error on worker " << rank << ": " << err << "\n" - << "- Free memory: " << AvailableMemory(device) 
<< "\n" - << "- Requested memory: " << bytes << std::endl; - LOG(FATAL) << ss.str(); -} - -/** - * \brief Default memory allocator, uses cudaMalloc/Free and logs allocations if verbose. - */ -template -struct XGBDefaultDeviceAllocatorImpl : XGBBaseDeviceAllocator { - using SuperT = XGBBaseDeviceAllocator; - using pointer = thrust::device_ptr; // NOLINT - template - struct rebind // NOLINT - { - using other = XGBDefaultDeviceAllocatorImpl; // NOLINT - }; - pointer allocate(size_t n) { // NOLINT - pointer ptr; - try { - ptr = SuperT::allocate(n); - dh::safe_cuda(cudaGetLastError()); - } catch (const std::exception &e) { - ThrowOOMError(e.what(), n * sizeof(T)); - } - GlobalMemoryLogger().RegisterAllocation(ptr.get(), n * sizeof(T)); - return ptr; - } - void deallocate(pointer ptr, size_t n) { // NOLINT - GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T)); - SuperT::deallocate(ptr, n); - } -#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 - XGBDefaultDeviceAllocatorImpl() - : SuperT(rmm::cuda_stream_per_thread, rmm::mr::get_current_device_resource()) {} -#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 -}; - -/** - * \brief Caching memory allocator, uses cub::CachingDeviceAllocator as a back-end, unless - * RMM pool allocator is enabled. Does not initialise memory on construction. - */ -template -struct XGBCachingDeviceAllocatorImpl : XGBBaseDeviceAllocator { - using SuperT = XGBBaseDeviceAllocator; - using pointer = thrust::device_ptr; // NOLINT - template - struct rebind // NOLINT - { - using other = XGBCachingDeviceAllocatorImpl; // NOLINT - }; - cub::CachingDeviceAllocator& GetGlobalCachingAllocator() { - // Configure allocator with maximum cached bin size of ~1GB and no limit on - // maximum cached bytes - thread_local std::unique_ptr allocator{ - std::make_unique(2, 9, 29)}; - return *allocator; - } - pointer allocate(size_t n) { // NOLINT - pointer thrust_ptr; - if (use_cub_allocator_) { - T* raw_ptr{nullptr}; - auto errc = GetGlobalCachingAllocator().DeviceAllocate(reinterpret_cast(&raw_ptr), - n * sizeof(T)); - if (errc != cudaSuccess) { - ThrowOOMError("Caching allocator", n * sizeof(T)); - } - thrust_ptr = pointer(raw_ptr); - } else { - try { - thrust_ptr = SuperT::allocate(n); - dh::safe_cuda(cudaGetLastError()); - } catch (const std::exception &e) { - ThrowOOMError(e.what(), n * sizeof(T)); - } - } - GlobalMemoryLogger().RegisterAllocation(thrust_ptr.get(), n * sizeof(T)); - return thrust_ptr; - } - void deallocate(pointer ptr, size_t n) { // NOLINT - GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T)); - if (use_cub_allocator_) { - GetGlobalCachingAllocator().DeviceFree(ptr.get()); - } else { - SuperT::deallocate(ptr, n); - } - } -#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 - XGBCachingDeviceAllocatorImpl() - : SuperT(rmm::cuda_stream_per_thread, rmm::mr::get_current_device_resource()), - use_cub_allocator_(!xgboost::GlobalConfigThreadLocalStore::Get()->use_rmm) {} -#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 - XGBOOST_DEVICE void construct(T *) {} // NOLINT - private: - bool use_cub_allocator_{true}; -}; -} // namespace detail - -// Declare xgboost allocators -// Replacement of allocator with custom backend should occur here -template -using XGBDeviceAllocator = detail::XGBDefaultDeviceAllocatorImpl; -/*! Be careful that the initialization constructor is a no-op, which means calling - * `vec.resize(n)` won't initialize the memory region to 0. 
Instead use - * `vec.resize(n, 0)`*/ -template -using XGBCachingDeviceAllocator = detail::XGBCachingDeviceAllocatorImpl; -/** \brief Specialisation of thrust device vector using custom allocator. */ -template -using device_vector = thrust::device_vector>; // NOLINT -template -using caching_device_vector = thrust::device_vector>; // NOLINT - // Faster to instantiate than caching_device_vector and invokes no synchronisation // Use this where vector functionality (e.g. resize) is not required template @@ -734,6 +508,11 @@ xgboost::common::Span ToSpan(thrust::device_vector& vec, return ToSpan(vec, offset, size); } +template +xgboost::common::Span ToSpan(DeviceUVector &vec) { + return {thrust::raw_pointer_cast(vec.data()), vec.size()}; +} + // thrust begin, similiar to std::begin template thrust::device_ptr tbegin(xgboost::HostDeviceVector& vector) { // NOLINT @@ -1117,6 +896,15 @@ class CUDAStream { void Sync() { this->View().Sync(); } }; +inline auto CachingThrustPolicy() { + XGBCachingDeviceAllocator alloc; +#if THRUST_MAJOR_VERSION >= 2 || defined(XGBOOST_USE_RMM) + return thrust::cuda::par_nosync(alloc).on(DefaultStream()); +#else + return thrust::cuda::par(alloc).on(DefaultStream()); +#endif // THRUST_MAJOR_VERSION >= 2 || defined(XGBOOST_USE_RMM) +} + // Force nvcc to load data as constant template class LDGIterator { diff --git a/src/common/device_vector.cu b/src/common/device_vector.cu new file mode 100644 index 000000000000..50922d8f978e --- /dev/null +++ b/src/common/device_vector.cu @@ -0,0 +1,27 @@ +/** + * Copyright 2017-2024, XGBoost contributors + */ +#include "../collective/communicator-inl.h" // for GetRank +#include "device_helpers.cuh" // for CurrentDevice +#include "device_vector.cuh" + +namespace dh { +namespace detail { +void ThrowOOMError(std::string const &err, size_t bytes) { + auto device = CurrentDevice(); + auto rank = xgboost::collective::GetRank(); + std::stringstream ss; + ss << "Memory allocation error on worker " << rank << ": " << err << "\n" + << "- Free memory: " << dh::AvailableMemory(device) << "\n" + << "- Requested memory: " << bytes << std::endl; + LOG(FATAL) << ss.str(); +} +} // namespace detail + +#if defined(XGBOOST_USE_RMM) +LoggingResource *GlobalLoggingResource() { + static auto mr{std::make_unique()}; + return mr.get(); +} +#endif // defined(XGBOOST_USE_RMM) +} // namespace dh diff --git a/src/common/device_vector.cuh b/src/common/device_vector.cuh new file mode 100644 index 000000000000..35386856cc9c --- /dev/null +++ b/src/common/device_vector.cuh @@ -0,0 +1,330 @@ +/** + * Copyright 2017-2024, XGBoost Contributors + */ +#pragma once +#include // for device_malloc_allocator +#include // for device_ptr +#include // for device_vector + +#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 +#include // for device_uvector +#include // for exec_policy_nosync +#include // for device_memory_resource +#include // for get_current_device_resource +#include // for thrust_allocator +#include // for RMM_VERSION_MAJOR + +#include "xgboost/global_config.h" // for GlobalConfigThreadLocalStore + +#if !defined(RMM_VERSION_MAJOR) || !defined(RMM_VERSION_MINOR) + +#error "Please use RMM version 0.18 or later" +#elif RMM_VERSION_MAJOR == 0 && RMM_VERSION_MINOR < 18 +#error "Please use RMM version 0.18 or later" +#endif // !defined(RMM_VERSION_MAJOR) || !defined(RMM_VERSION_MINOR) + +#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 + +#include // for size_t +#include // for CachingDeviceAllocator +#include // for CurrentDevice +#include // for map 
+#include // for unique_ptr + +#include "common.h" // for safe_cuda +#include "xgboost/logging.h" + +namespace dh { +namespace detail { +/** \brief Keeps track of global device memory allocations. Thread safe.*/ +class MemoryLogger { + // Information for a single device + struct DeviceStats { + std::size_t currently_allocated_bytes{0}; + size_t peak_allocated_bytes{0}; + size_t num_allocations{0}; + size_t num_deallocations{0}; + std::map device_allocations; + void RegisterAllocation(void *ptr, size_t n) { + device_allocations[ptr] = n; + currently_allocated_bytes += n; + peak_allocated_bytes = std::max(peak_allocated_bytes, currently_allocated_bytes); + num_allocations++; + CHECK_GT(num_allocations, num_deallocations); + } + void RegisterDeallocation(void *ptr, size_t n, int current_device) { + auto itr = device_allocations.find(ptr); + if (itr == device_allocations.end()) { + LOG(WARNING) << "Attempting to deallocate " << n << " bytes on device " << current_device + << " that was never allocated\n" + << dmlc::StackTrace(); + } else { + num_deallocations++; + CHECK_LE(num_deallocations, num_allocations); + currently_allocated_bytes -= itr->second; + device_allocations.erase(itr); + } + } + }; + DeviceStats stats_; + std::mutex mutex_; + + public: + void RegisterAllocation(void *ptr, size_t n) { + if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) { + return; + } + std::lock_guard guard(mutex_); + stats_.RegisterAllocation(ptr, n); + } + void RegisterDeallocation(void *ptr, size_t n) { + if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) { + return; + } + std::lock_guard guard(mutex_); + stats_.RegisterDeallocation(ptr, n, cub::CurrentDevice()); + } + size_t PeakMemory() const { return stats_.peak_allocated_bytes; } + size_t CurrentlyAllocatedBytes() const { return stats_.currently_allocated_bytes; } + void Clear() { stats_ = DeviceStats(); } + + void Log() { + if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) { + return; + } + std::lock_guard guard(mutex_); + int current_device; + dh::safe_cuda(cudaGetDevice(¤t_device)); + LOG(CONSOLE) << "======== Device " << current_device << " Memory Allocations: " + << " ========"; + LOG(CONSOLE) << "Peak memory usage: " << stats_.peak_allocated_bytes / 1048576 << "MiB"; + LOG(CONSOLE) << "Number of allocations: " << stats_.num_allocations; + } +}; + +void ThrowOOMError(std::string const &err, size_t bytes); +} // namespace detail + +inline detail::MemoryLogger &GlobalMemoryLogger() { + static detail::MemoryLogger memory_logger; + return memory_logger; +} + +namespace detail { +#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 +template +using XGBBaseDeviceAllocator = rmm::mr::thrust_allocator; +#else // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 +template +using XGBBaseDeviceAllocator = thrust::device_malloc_allocator; +#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 + +/** + * \brief Default memory allocator, uses cudaMalloc/Free and logs allocations if verbose. 
+ */ +template +struct XGBDefaultDeviceAllocatorImpl : XGBBaseDeviceAllocator { + using SuperT = XGBBaseDeviceAllocator; + using pointer = thrust::device_ptr; // NOLINT + template + struct rebind // NOLINT + { + using other = XGBDefaultDeviceAllocatorImpl; // NOLINT + }; + pointer allocate(size_t n) { // NOLINT + pointer ptr; + try { + ptr = SuperT::allocate(n); + dh::safe_cuda(cudaGetLastError()); + } catch (const std::exception &e) { + detail::ThrowOOMError(e.what(), n * sizeof(T)); + } + GlobalMemoryLogger().RegisterAllocation(ptr.get(), n * sizeof(T)); + return ptr; + } + void deallocate(pointer ptr, size_t n) { // NOLINT + GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T)); + SuperT::deallocate(ptr, n); + } +#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 + XGBDefaultDeviceAllocatorImpl() + : SuperT(rmm::cuda_stream_per_thread, rmm::mr::get_current_device_resource()) {} +#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 +}; + +/** + * \brief Caching memory allocator, uses cub::CachingDeviceAllocator as a back-end, unless + * RMM pool allocator is enabled. Does not initialise memory on construction. + */ +template +struct XGBCachingDeviceAllocatorImpl : XGBBaseDeviceAllocator { + using SuperT = XGBBaseDeviceAllocator; + using pointer = thrust::device_ptr; // NOLINT + template + struct rebind // NOLINT + { + using other = XGBCachingDeviceAllocatorImpl; // NOLINT + }; + cub::CachingDeviceAllocator &GetGlobalCachingAllocator() { + // Configure allocator with maximum cached bin size of ~1GB and no limit on + // maximum cached bytes + thread_local std::unique_ptr allocator{ + std::make_unique(2, 9, 29)}; + return *allocator; + } + pointer allocate(size_t n) { // NOLINT + pointer thrust_ptr; + if (use_cub_allocator_) { + T *raw_ptr{nullptr}; + auto errc = GetGlobalCachingAllocator().DeviceAllocate(reinterpret_cast(&raw_ptr), + n * sizeof(T)); + if (errc != cudaSuccess) { + detail::ThrowOOMError("Caching allocator", n * sizeof(T)); + } + thrust_ptr = pointer(raw_ptr); + } else { + try { + thrust_ptr = SuperT::allocate(n); + dh::safe_cuda(cudaGetLastError()); + } catch (const std::exception &e) { + detail::ThrowOOMError(e.what(), n * sizeof(T)); + } + } + GlobalMemoryLogger().RegisterAllocation(thrust_ptr.get(), n * sizeof(T)); + return thrust_ptr; + } + void deallocate(pointer ptr, size_t n) { // NOLINT + GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T)); + if (use_cub_allocator_) { + GetGlobalCachingAllocator().DeviceFree(ptr.get()); + } else { + SuperT::deallocate(ptr, n); + } + } +#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 + XGBCachingDeviceAllocatorImpl() + : SuperT(rmm::cuda_stream_per_thread, rmm::mr::get_current_device_resource()), + use_cub_allocator_(!xgboost::GlobalConfigThreadLocalStore::Get()->use_rmm) {} +#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 + XGBOOST_DEVICE void construct(T *) {} // NOLINT + private: + bool use_cub_allocator_{true}; +}; +} // namespace detail + +// Declare xgboost allocators +// Replacement of allocator with custom backend should occur here +template +using XGBDeviceAllocator = detail::XGBDefaultDeviceAllocatorImpl; + +/** Be careful that the initialization constructor is a no-op, which means calling + * `vec.resize(n)` won't initialize the memory region to 0. Instead use + * `vec.resize(n, 0)` + */ +template +using XGBCachingDeviceAllocator = detail::XGBCachingDeviceAllocatorImpl; + +/** @brief Specialisation of thrust device vector using custom allocator. 
*/ +template +using device_vector = thrust::device_vector>; // NOLINT +template +using caching_device_vector = thrust::device_vector>; // NOLINT + +#if defined(XGBOOST_USE_RMM) +/** + * @brief Similar to `rmm::logging_resource_adaptor`, but uses XGBoost memory logger instead. + */ +class LoggingResource : public rmm::mr::device_memory_resource { + rmm::mr::device_memory_resource *mr_{rmm::mr::get_current_device_resource()}; + + public: + LoggingResource() = default; + ~LoggingResource() override = default; + LoggingResource(LoggingResource const &) = delete; + LoggingResource &operator=(LoggingResource const &) = delete; + LoggingResource(LoggingResource &&) noexcept = default; + LoggingResource &operator=(LoggingResource &&) noexcept = default; + + [[nodiscard]] rmm::device_async_resource_ref get_upstream_resource() const noexcept { // NOLINT + return mr_; + } + [[nodiscard]] rmm::mr::device_memory_resource *get_upstream() const noexcept { // NOLINT + return mr_; + } + + void *do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) override { // NOLINT + try { + auto const ptr = mr_->allocate(bytes, stream); + GlobalMemoryLogger().RegisterAllocation(ptr, bytes); + return ptr; + } catch (rmm::bad_alloc const &e) { + detail::ThrowOOMError(e.what(), bytes); + } + return nullptr; + } + + void do_deallocate(void *ptr, std::size_t bytes, // NOLINT + rmm::cuda_stream_view stream) override { + mr_->deallocate(ptr, bytes, stream); + GlobalMemoryLogger().RegisterDeallocation(ptr, bytes); + } + + [[nodiscard]] bool do_is_equal( // NOLINT + device_memory_resource const &other) const noexcept override { + if (this == &other) { + return true; + } + auto const *cast = dynamic_cast(&other); + if (cast == nullptr) { + return mr_->is_equal(other); + } + return get_upstream_resource() == cast->get_upstream_resource(); + } +}; + +LoggingResource *GlobalLoggingResource(); + +/** + * @brief Container class that doesn't initialize the data. + */ +template +class DeviceUVector : public rmm::device_uvector { + using Super = rmm::device_uvector; + + public: + DeviceUVector() : Super{0, rmm::cuda_stream_per_thread, GlobalLoggingResource()} {} + + void Resize(std::size_t n) { Super::resize(n, rmm::cuda_stream_per_thread); } + void Resize(std::size_t n, T const &v) { + auto orig = this->size(); + Super::resize(n, rmm::cuda_stream_per_thread); + if (orig < n) { + thrust::fill(rmm::exec_policy_nosync{}, this->begin() + orig, this->end(), v); + } + } + + private: + // undefined private, cannot be accessed. + void resize(std::size_t n, rmm::cuda_stream_view stream); // NOLINT +}; + +#else + +/** + * @brief Without RMM, the initialization will happen. + */ +template +class DeviceUVector : public thrust::device_vector> { + using Super = thrust::device_vector>; + + public: + void Resize(std::size_t n) { Super::resize(n); } + void Resize(std::size_t n, T const &v) { Super::resize(n, v); } + + private: + // undefined private, cannot be accessed. 
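+  // Hiding thrust::device_vector::resize behind an undefined private overload
+  // turns any direct call into a compile-time (or link-time) error, so callers
+  // must go through the public Resize() wrappers above.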
+ void resize(std::size_t n, T const &v = T{}); // NOLINT +}; + +#endif // defined(XGBOOST_USE_RMM) +} // namespace dh diff --git a/src/common/host_device_vector.cc b/src/common/host_device_vector.cc index f4973c0428f0..de9e0614a38e 100644 --- a/src/common/host_device_vector.cc +++ b/src/common/host_device_vector.cc @@ -114,6 +114,11 @@ void HostDeviceVector::Resize(size_t new_size, T v) { impl_->Vec().resize(new_size, v); } +template +void HostDeviceVector::Resize(size_t new_size) { + impl_->Vec().resize(new_size, T{}); +} + template void HostDeviceVector::Fill(T v) { std::fill(HostVector().begin(), HostVector().end(), v); diff --git a/src/common/host_device_vector.cu b/src/common/host_device_vector.cu index 99448df21b7e..16a1aa027f09 100644 --- a/src/common/host_device_vector.cu +++ b/src/common/host_device_vector.cu @@ -1,16 +1,17 @@ /** - * Copyright 2017-2023 by XGBoost contributors + * Copyright 2017-2024, XGBoost contributors */ #include -#include #include +#include // for size_t #include +#include "device_helpers.cuh" +#include "device_vector.cuh" // for DeviceUVector #include "xgboost/data.h" #include "xgboost/host_device_vector.h" -#include "xgboost/tree_model.h" -#include "device_helpers.cuh" +#include "xgboost/tree_model.h" // for RegTree namespace xgboost { @@ -28,7 +29,7 @@ class HostDeviceVectorImpl { if (device.IsCUDA()) { gpu_access_ = GPUAccess::kWrite; SetDevice(); - data_d_->resize(size, v); + data_d_->Resize(size, v); } else { data_h_.resize(size, v); } @@ -66,22 +67,22 @@ class HostDeviceVectorImpl { T* DevicePointer() { LazySyncDevice(GPUAccess::kWrite); - return data_d_->data().get(); + return thrust::raw_pointer_cast(data_d_->data()); } const T* ConstDevicePointer() { LazySyncDevice(GPUAccess::kRead); - return data_d_->data().get(); + return thrust::raw_pointer_cast(data_d_->data()); } common::Span DeviceSpan() { LazySyncDevice(GPUAccess::kWrite); - return {data_d_->data().get(), Size()}; + return {this->DevicePointer(), Size()}; } common::Span ConstDeviceSpan() { LazySyncDevice(GPUAccess::kRead); - return {data_d_->data().get(), Size()}; + return {this->ConstDevicePointer(), Size()}; } void Fill(T v) { // NOLINT @@ -91,7 +92,7 @@ class HostDeviceVectorImpl { gpu_access_ = GPUAccess::kWrite; SetDevice(); auto s_data = dh::ToSpan(*data_d_); - dh::LaunchN(data_d_->size(), + dh::LaunchN(data_d_->size(), dh::DefaultStream(), [=] XGBOOST_DEVICE(size_t i) { s_data[i] = v; }); } } @@ -128,7 +129,7 @@ class HostDeviceVectorImpl { void Extend(HostDeviceVectorImpl* other) { auto ori_size = this->Size(); - this->Resize(ori_size + other->Size(), T()); + this->Resize(ori_size + other->Size(), T{}); if (HostCanWrite() && other->HostCanRead()) { auto& h_vec = this->HostVector(); auto& other_vec = other->HostVector(); @@ -138,10 +139,9 @@ class HostDeviceVectorImpl { auto ptr = other->ConstDevicePointer(); SetDevice(); CHECK_EQ(this->Device(), other->Device()); - dh::safe_cuda(cudaMemcpyAsync(this->DevicePointer() + ori_size, - ptr, - other->Size() * sizeof(T), - cudaMemcpyDeviceToDevice)); + dh::safe_cuda(cudaMemcpyAsync(this->DevicePointer() + ori_size, ptr, + other->Size() * sizeof(T), cudaMemcpyDeviceToDevice, + dh::DefaultStream())); } } @@ -171,17 +171,22 @@ class HostDeviceVectorImpl { } } - void Resize(size_t new_size, T v) { - if (new_size == Size()) { return; } + template + auto Resize(std::size_t new_size, U&&... 
args) { + if (new_size == Size()) { + return; + } if ((Size() == 0 && device_.IsCUDA()) || (DeviceCanWrite() && device_.IsCUDA())) { // fast on-device resize gpu_access_ = GPUAccess::kWrite; SetDevice(); - data_d_->resize(new_size, v); + auto old_size = data_d_->size(); + data_d_->Resize(new_size, std::forward(args)...); } else { // resize on host LazySyncHost(GPUAccess::kNone); - data_h_.resize(new_size, v); + auto old_size = data_h_.size(); + data_h_.resize(new_size, std::forward(args)...); } } @@ -195,10 +200,8 @@ class HostDeviceVectorImpl { gpu_access_ = access; if (data_h_.size() != data_d_->size()) { data_h_.resize(data_d_->size()); } SetDevice(); - dh::safe_cuda(cudaMemcpy(data_h_.data(), - data_d_->data().get(), - data_d_->size() * sizeof(T), - cudaMemcpyDeviceToHost)); + dh::safe_cuda(cudaMemcpy(data_h_.data(), thrust::raw_pointer_cast(data_d_->data()), + data_d_->size() * sizeof(T), cudaMemcpyDeviceToHost)); } void LazySyncDevice(GPUAccess access) { @@ -211,10 +214,9 @@ class HostDeviceVectorImpl { // data is on the host LazyResizeDevice(data_h_.size()); SetDevice(); - dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(), - data_h_.data(), - data_d_->size() * sizeof(T), - cudaMemcpyHostToDevice)); + dh::safe_cuda(cudaMemcpyAsync(thrust::raw_pointer_cast(data_d_->data()), data_h_.data(), + data_d_->size() * sizeof(T), cudaMemcpyHostToDevice, + dh::DefaultStream())); gpu_access_ = access; } @@ -229,7 +231,7 @@ class HostDeviceVectorImpl { private: DeviceOrd device_{DeviceOrd::CPU()}; std::vector data_h_{}; - std::unique_ptr> data_d_{}; + std::unique_ptr> data_d_{}; GPUAccess gpu_access_{GPUAccess::kNone}; void CopyToDevice(HostDeviceVectorImpl* other) { @@ -239,8 +241,10 @@ class HostDeviceVectorImpl { LazyResizeDevice(Size()); gpu_access_ = GPUAccess::kWrite; SetDevice(); - dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(), other->data_d_->data().get(), - data_d_->size() * sizeof(T), cudaMemcpyDefault)); + dh::safe_cuda(cudaMemcpyAsync(thrust::raw_pointer_cast(data_d_->data()), + thrust::raw_pointer_cast(other->data_d_->data()), + data_d_->size() * sizeof(T), cudaMemcpyDefault, + dh::DefaultStream())); } } @@ -248,14 +252,15 @@ class HostDeviceVectorImpl { LazyResizeDevice(Size()); gpu_access_ = GPUAccess::kWrite; SetDevice(); - dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(), begin, - data_d_->size() * sizeof(T), cudaMemcpyDefault)); + dh::safe_cuda(cudaMemcpyAsync(thrust::raw_pointer_cast(data_d_->data()), begin, + data_d_->size() * sizeof(T), cudaMemcpyDefault, + dh::DefaultStream())); } void LazyResizeDevice(size_t new_size) { if (data_d_ && new_size == data_d_->size()) { return; } SetDevice(); - data_d_->resize(new_size); + data_d_->Resize(new_size); } void SetDevice() { @@ -267,7 +272,7 @@ class HostDeviceVectorImpl { } if (!data_d_) { - data_d_.reset(new dh::device_vector); + data_d_.reset(new dh::DeviceUVector{}); } } }; @@ -397,7 +402,12 @@ void HostDeviceVector::SetDevice(DeviceOrd device) const { } template -void HostDeviceVector::Resize(size_t new_size, T v) { +void HostDeviceVector::Resize(std::size_t new_size) { + impl_->Resize(new_size); +} + +template +void HostDeviceVector::Resize(std::size_t new_size, T v) { impl_->Resize(new_size, v); } @@ -427,5 +437,4 @@ template class HostDeviceVector; */ template class HostDeviceVector; #endif // defined(__APPLE__) - } // namespace xgboost diff --git a/src/common/quantile.cuh b/src/common/quantile.cuh index 898da03a0dce..3dd393755852 100644 --- a/src/common/quantile.cuh +++ b/src/common/quantile.cuh @@ -4,12 +4,14 @@ 
#ifndef XGBOOST_COMMON_QUANTILE_CUH_ #define XGBOOST_COMMON_QUANTILE_CUH_ -#include "xgboost/span.h" -#include "xgboost/data.h" +#include // for any_of + +#include "categorical.h" #include "device_helpers.cuh" #include "quantile.h" #include "timer.h" -#include "categorical.h" +#include "xgboost/data.h" +#include "xgboost/span.h" namespace xgboost { namespace common { @@ -100,9 +102,9 @@ class SketchContainer { CHECK(device.IsCUDA()); // Initialize Sketches for this dmatrix this->columns_ptr_.SetDevice(device_); - this->columns_ptr_.Resize(num_columns + 1); + this->columns_ptr_.Resize(num_columns + 1, 0); this->columns_ptr_b_.SetDevice(device_); - this->columns_ptr_b_.Resize(num_columns + 1); + this->columns_ptr_b_.Resize(num_columns + 1, 0); this->feature_types_.Resize(feature_types.Size()); this->feature_types_.Copy(feature_types); diff --git a/src/metric/auc.cu b/src/metric/auc.cu index 59199b092839..4155a7084481 100644 --- a/src/metric/auc.cu +++ b/src/metric/auc.cu @@ -1,7 +1,8 @@ /** * Copyright 2021-2024, XGBoost Contributors */ -#include // for copy +#include // for copy +#include // for any_of #include #include diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 366cf3aad08e..5278b328acbc 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -841,9 +841,7 @@ class GPUHistMaker : public TreeUpdater { out["hist_train_param"] = ToJson(hist_maker_param_); } - ~GPUHistMaker() { // NOLINT - dh::GlobalMemoryLogger().Log(); - } + ~GPUHistMaker() override { dh::GlobalMemoryLogger().Log(); } void Update(TrainParam const* param, linalg::Matrix* gpair, DMatrix* dmat, common::Span> out_position, diff --git a/tests/cpp/common/test_device_vector.cu b/tests/cpp/common/test_device_vector.cu new file mode 100644 index 000000000000..95da4ef3f167 --- /dev/null +++ b/tests/cpp/common/test_device_vector.cu @@ -0,0 +1,21 @@ +/** + * Copyright 2024, XGBoost Contributors + */ +#include + +#include "../../../src/common/device_vector.cuh" +#include "xgboost/global_config.h" // for GlobalConfigThreadLocalStore + +namespace dh { +TEST(DeviceUVector, Basic) { + GlobalMemoryLogger().Clear(); + std::int32_t verbosity{3}; + std::swap(verbosity, xgboost::GlobalConfigThreadLocalStore::Get()->verbosity); + DeviceUVector uvec; + uvec.Resize(12); + auto peak = GlobalMemoryLogger().PeakMemory(); + auto n_bytes = sizeof(decltype(uvec)::value_type) * uvec.size(); + ASSERT_EQ(peak, n_bytes); + std::swap(verbosity, xgboost::GlobalConfigThreadLocalStore::Get()->verbosity); +} +} // namespace dh diff --git a/tests/cpp/common/test_host_device_vector.cu b/tests/cpp/common/test_host_device_vector.cu index 57e945cba9be..a0aa5fa11fce 100644 --- a/tests/cpp/common/test_host_device_vector.cu +++ b/tests/cpp/common/test_host_device_vector.cu @@ -1,5 +1,5 @@ /** - * Copyright 2018-2023 XGBoost contributors + * Copyright 2018-2024, XGBoost contributors */ #include #include @@ -181,4 +181,41 @@ TEST(HostDeviceVector, Empty) { ASSERT_FALSE(another.Empty()); ASSERT_TRUE(vec.Empty()); } + +TEST(HostDeviceVector, Resize) { + auto check = [&](HostDeviceVector const& vec) { + auto const& h_vec = vec.ConstHostSpan(); + for (std::size_t i = 0; i < 4; ++i) { + ASSERT_EQ(h_vec[i], i + 1); + } + for (std::size_t i = 4; i < vec.Size(); ++i) { + ASSERT_EQ(h_vec[i], 3.0); + } + }; + { + HostDeviceVector vec{1.0f, 2.0f, 3.0f, 4.0f}; + vec.SetDevice(DeviceOrd::CUDA(0)); + vec.ConstDeviceSpan(); + ASSERT_TRUE(vec.DeviceCanRead()); + ASSERT_FALSE(vec.DeviceCanWrite()); + vec.DeviceSpan(); + 
vec.Resize(7, 3.0f); + ASSERT_TRUE(vec.DeviceCanWrite()); + check(vec); + } + { + HostDeviceVector vec{{1.0f, 2.0f, 3.0f, 4.0f}, DeviceOrd::CUDA(0)}; + ASSERT_TRUE(vec.DeviceCanWrite()); + vec.Resize(7, 3.0f); + ASSERT_TRUE(vec.DeviceCanWrite()); + check(vec); + } + { + HostDeviceVector vec{1.0f, 2.0f, 3.0f, 4.0f}; + ASSERT_TRUE(vec.HostCanWrite()); + vec.Resize(7, 3.0f); + ASSERT_TRUE(vec.HostCanWrite()); + check(vec); + } +} } // namespace xgboost::common diff --git a/tests/cpp/data/test_array_interface.h b/tests/cpp/data/test_array_interface.h index 78bce76f53e7..dfe4f5a3ec5c 100644 --- a/tests/cpp/data/test_array_interface.h +++ b/tests/cpp/data/test_array_interface.h @@ -1,15 +1,14 @@ -// Copyright (c) 2019 by Contributors +/** + * Copyright 2019-2024, XGBoost Contributors + */ #include +#include +#include // for device +#include // for sequence #include #include -#include - -#include -#include "../../../src/common/bitfield.h" -#include "../../../src/common/device_helpers.cuh" namespace xgboost { - template Json GenerateDenseColumn(std::string const& typestr, size_t kRows, thrust::device_vector* out_d_data) { From 6c403187eccb5d00c7b2fcbd37e9293ccbd549c2 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Fri, 12 Jul 2024 01:07:12 +0800 Subject: [PATCH 20/51] Fix column split race condition. (#10572) --- src/tree/common_row_partitioner.h | 48 ++++++++++++++--- tests/cpp/tree/test_approx.cc | 23 ++++++++ tests/cpp/tree/test_column_split.h | 79 ++++++++++++++++++++++++++++ tests/cpp/tree/test_histmaker.cc | 79 ++-------------------------- tests/cpp/tree/test_quantile_hist.cc | 71 +++++++------------------ 5 files changed, 167 insertions(+), 133 deletions(-) create mode 100644 tests/cpp/tree/test_column_split.h diff --git a/src/tree/common_row_partitioner.h b/src/tree/common_row_partitioner.h index cd267673b66c..ff75000dfd45 100644 --- a/src/tree/common_row_partitioner.h +++ b/src/tree/common_row_partitioner.h @@ -36,10 +36,11 @@ class ColumnSplitHelper { common::PartitionBuilder* partition_builder, common::RowSetCollection* row_set_collection) : partition_builder_{partition_builder}, row_set_collection_{row_set_collection} { - decision_storage_.resize(num_row); - decision_bits_ = BitVector(common::Span(decision_storage_)); - missing_storage_.resize(num_row); - missing_bits_ = BitVector(common::Span(missing_storage_)); + auto n_bytes = BitVector::ComputeStorageSize(num_row); + decision_storage_.resize(n_bytes); + decision_bits_ = BitVector{common::Span{decision_storage_}}; + missing_storage_.resize(n_bytes); + missing_bits_ = BitVector{common::Span{missing_storage_}}; } template @@ -51,14 +52,43 @@ class ColumnSplitHelper { // we first collect all the decisions and whether the feature is missing into bit vectors. std::fill(decision_storage_.begin(), decision_storage_.end(), 0); std::fill(missing_storage_.begin(), missing_storage_.end(), 0); - common::ParallelFor2d(space, n_threads, [&](size_t node_in_set, common::Range1d r) { - const int32_t nid = nodes[node_in_set].nid; + + this->tloc_decision_.resize(decision_storage_.size() * n_threads); + this->tloc_missing_.resize(decision_storage_.size() * n_threads); + std::fill_n(this->tloc_decision_.data(), this->tloc_decision_.size(), 0); + std::fill_n(this->tloc_missing_.data(), this->tloc_missing_.size(), 0); + + // Make thread-local storage. 
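+    // Each thread writes through its own BitVector view over a disjoint slice
+    // of one flat buffer, so no two threads touch the same word concurrently;
+    // the per-thread results are OR-ed back into the shared storage below.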
+    using T = decltype(decision_storage_)::value_type;
+    auto make_tloc = [&](std::vector<T>& storage, std::int32_t tidx) {
+      auto span = common::Span{storage};
+      auto n = decision_storage_.size();
+      auto bitvec = BitVector{span.subspan(n * tidx, n)};
+      return bitvec;
+    };
+
+    common::ParallelFor2d(space, n_threads, [&](std::size_t node_in_set, common::Range1d r) {
+      bst_node_t const nid = nodes[node_in_set].nid;
+      auto tidx = omp_get_thread_num();
+      auto decision = make_tloc(this->tloc_decision_, tidx);
+      auto missing = make_tloc(this->tloc_missing_, tidx);
       bst_bin_t split_cond = column_matrix.IsInitialized() ? split_conditions[node_in_set] : 0;
       partition_builder_->MaskRows(
           node_in_set, nodes, r, split_cond, gmat, column_matrix, *p_tree,
-          (*row_set_collection_)[nid].begin(), &decision_bits_, &missing_bits_);
+          (*row_set_collection_)[nid].begin(), &decision, &missing);
     });
+    // Reduce thread local
+    auto decision = make_tloc(this->tloc_decision_, 0);
+    auto missing = make_tloc(this->tloc_missing_, 0);
+    for (std::int32_t tidx = 1; tidx < n_threads; ++tidx) {
+      decision |= make_tloc(this->tloc_decision_, tidx);
+      missing |= make_tloc(this->tloc_missing_, tidx);
+    }
+    CHECK_EQ(decision_storage_.size(), decision.NumValues());
+    std::copy_n(decision.Data(), decision_storage_.size(), decision_storage_.data());
+    std::copy_n(missing.Data(), missing_storage_.size(), missing_storage_.data());
+
     // Then aggregate the bit vectors across all the workers.
     auto rc = collective::Success() << [&] {
       return collective::Allreduce(ctx, &decision_storage_, collective::Op::kBitwiseOR);
@@ -85,6 +115,10 @@ class ColumnSplitHelper {
   BitVector decision_bits_{};
   std::vector missing_storage_{};
   BitVector missing_bits_{};
+
+  std::vector tloc_decision_;
+  std::vector tloc_missing_;
+
   common::PartitionBuilder* partition_builder_;
   common::RowSetCollection* row_set_collection_;
 };
diff --git a/tests/cpp/tree/test_approx.cc b/tests/cpp/tree/test_approx.cc
index d647d3a970bf..8f28bfa218c8 100644
--- a/tests/cpp/tree/test_approx.cc
+++ b/tests/cpp/tree/test_approx.cc
@@ -6,6 +6,7 @@
 #include "../../../src/tree/common_row_partitioner.h"
 #include "../collective/test_worker.h"  // for TestDistributedGlobal
 #include "../helpers.h"
+#include "test_column_split.h"  // for TestColumnSplit
 #include "test_partitioner.h"
 
 namespace xgboost::tree {
@@ -154,4 +155,26 @@ TEST(Approx, PartitionerColSplit) {
                          mid_partitioner);
   });
 }
+
+namespace {
+class TestApproxColSplit : public ::testing::TestWithParam<std::tuple<bool, float>> {
+ public:
+  void Run() {
+    auto [categorical, sparsity] = GetParam();
+    TestColumnSplit(1u, categorical, "grow_histmaker", sparsity);
+  }
+};
+}  // namespace
+
+TEST_P(TestApproxColSplit, Basic) { this->Run(); }
+
+INSTANTIATE_TEST_SUITE_P(ColumnSplit, TestApproxColSplit, ::testing::ValuesIn([]() {
+                           std::vector<std::tuple<bool, float>> params;
+                           for (auto categorical : {true, false}) {
+                             for (auto sparsity : {0.0f, 0.6f}) {
+                               params.emplace_back(categorical, sparsity);
+                             }
+                           }
+                           return params;
+                         }()));
 }  // namespace xgboost::tree
diff --git a/tests/cpp/tree/test_column_split.h b/tests/cpp/tree/test_column_split.h
new file mode 100644
index 000000000000..b03597f38681
--- /dev/null
+++ b/tests/cpp/tree/test_column_split.h
@@ -0,0 +1,79 @@
+/**
+ * Copyright 2023-2024, XGBoost Contributors
+ */
+#pragma once
+
+#include <xgboost/data.h>          // for FeatureType, DMatrix
+#include <xgboost/tree_model.h>    // for RegTree
+#include <xgboost/tree_updater.h>  // for TreeUpdater
+
+#include <cstddef>  // for size_t
+#include <memory>   // for shared_ptr
+#include <vector>   // for vector
+
+#include "../../../src/tree/param.h"    // for TrainParam
+#include "../collective/test_worker.h"  // for TestDistributedGlobal
"../collective/test_worker.h" // for TestDistributedGlobal +#include "../helpers.h" // for RandomDataGenerator + +namespace xgboost::tree { +inline std::shared_ptr GenerateCatDMatrix(std::size_t rows, std::size_t cols, + float sparsity, bool categorical) { + if (categorical) { + std::vector ft(cols); + for (size_t i = 0; i < ft.size(); ++i) { + ft[i] = (i % 3 == 0) ? FeatureType::kNumerical : FeatureType::kCategorical; + } + return RandomDataGenerator(rows, cols, 0.6f).Seed(3).Type(ft).MaxCategory(17).GenerateDMatrix(); + } else { + return RandomDataGenerator{rows, cols, 0.6f}.Seed(3).GenerateDMatrix(); + } +} + +inline void TestColumnSplit(bst_target_t n_targets, bool categorical, std::string name, + float sparsity) { + auto constexpr kRows = 32; + auto constexpr kCols = 16; + + RegTree expected_tree{n_targets, static_cast(kCols)}; + ObjInfo task{ObjInfo::kRegression}; + Context ctx; + { + auto p_dmat = GenerateCatDMatrix(kRows, kCols, sparsity, categorical); + auto gpair = GenerateRandomGradients(&ctx, kRows, n_targets); + std::unique_ptr updater{TreeUpdater::Create(name, &ctx, &task)}; + std::vector> position(1); + TrainParam param; + param.Init(Args{}); + updater->Configure(Args{}); + updater->Update(¶m, &gpair, p_dmat.get(), position, {&expected_tree}); + } + + auto verify = [&] { + Context ctx; + auto p_dmat = GenerateCatDMatrix(kRows, kCols, sparsity, categorical); + auto gpair = GenerateRandomGradients(&ctx, kRows, n_targets); + + ObjInfo task{ObjInfo::kRegression}; + std::unique_ptr updater{TreeUpdater::Create(name, &ctx, &task)}; + std::vector> position(1); + + std::unique_ptr sliced{ + p_dmat->SliceCol(collective::GetWorldSize(), collective::GetRank())}; + + RegTree tree{n_targets, static_cast(kCols)}; + TrainParam param; + param.Init(Args{}); + updater->Configure(Args{}); + updater->Update(¶m, &gpair, sliced.get(), position, {&tree}); + + Json json{Object{}}; + tree.SaveModel(&json); + Json expected_json{Object{}}; + expected_tree.SaveModel(&expected_json); + ASSERT_EQ(json, expected_json); + }; + + auto constexpr kWorldSize = 2; + collective::TestDistributedGlobal(kWorldSize, [&] { verify(); }); +} +} // namespace xgboost::tree diff --git a/tests/cpp/tree/test_histmaker.cc b/tests/cpp/tree/test_histmaker.cc index b8b9e46cac18..888790aa7c3c 100644 --- a/tests/cpp/tree/test_histmaker.cc +++ b/tests/cpp/tree/test_histmaker.cc @@ -1,32 +1,19 @@ /** - * Copyright 2019-2023 by XGBoost Contributors + * Copyright 2019-2024, XGBoost Contributors */ #include #include #include -#include "../../../src/tree/param.h" // for TrainParam -#include "../collective/test_worker.h" // for TestDistributedGlobal +#include "../../../src/tree/param.h" // for TrainParam #include "../helpers.h" +#include "test_column_split.h" // for GenerateCatDMatrix namespace xgboost::tree { -std::shared_ptr GenerateDMatrix(std::size_t rows, std::size_t cols, - bool categorical = false) { - if (categorical) { - std::vector ft(cols); - for (size_t i = 0; i < ft.size(); ++i) { - ft[i] = (i % 3 == 0) ? 
-    }
-    return RandomDataGenerator(rows, cols, 0.6f).Seed(3).Type(ft).MaxCategory(17).GenerateDMatrix();
-  } else {
-    return RandomDataGenerator{rows, cols, 0.6f}.Seed(3).GenerateDMatrix();
-  }
-}
-
 TEST(GrowHistMaker, InteractionConstraint) {
   auto constexpr kRows = 32;
   auto constexpr kCols = 16;
-  auto p_dmat = GenerateDMatrix(kRows, kCols);
+  auto p_dmat = GenerateCatDMatrix(kRows, kCols, 0.0, false);
 
   Context ctx;
   linalg::Matrix<GradientPair> gpair({kRows}, ctx.Device());
@@ -69,62 +56,4 @@ TEST(GrowHistMaker, InteractionConstraint) {
     ASSERT_NE(tree[tree[0].RightChild()].SplitIndex(), 0);
   }
 }
-
-namespace {
-void VerifyColumnSplit(int32_t rows, bst_feature_t cols, bool categorical,
-                       RegTree const& expected_tree) {
-  Context ctx;
-  auto p_dmat = GenerateDMatrix(rows, cols, categorical);
-  linalg::Matrix<GradientPair> gpair({rows}, ctx.Device());
-  gpair.Data()->Copy(GenerateRandomGradients(rows));
-
-
-  ObjInfo task{ObjInfo::kRegression};
-  std::unique_ptr<TreeUpdater> updater{TreeUpdater::Create("grow_histmaker", &ctx, &task)};
-  std::vector<HostDeviceVector<bst_node_t>> position(1);
-
-  std::unique_ptr<DMatrix> sliced{
-      p_dmat->SliceCol(collective::GetWorldSize(), collective::GetRank())};
-
-  RegTree tree{1u, cols};
-  TrainParam param;
-  param.Init(Args{});
-  updater->Configure(Args{});
-  updater->Update(&param, &gpair, sliced.get(), position, {&tree});
-
-  Json json{Object{}};
-  tree.SaveModel(&json);
-  Json expected_json{Object{}};
-  expected_tree.SaveModel(&expected_json);
-  ASSERT_EQ(json, expected_json);
-}
-
-void TestColumnSplit(bool categorical) {
-  auto constexpr kRows = 32;
-  auto constexpr kCols = 16;
-
-  RegTree expected_tree{1u, kCols};
-  ObjInfo task{ObjInfo::kRegression};
-  {
-    Context ctx;
-    auto p_dmat = GenerateDMatrix(kRows, kCols, categorical);
-    linalg::Matrix<GradientPair> gpair({kRows}, ctx.Device());
-    gpair.Data()->Copy(GenerateRandomGradients(kRows));
-    std::unique_ptr<TreeUpdater> updater{TreeUpdater::Create("grow_histmaker", &ctx, &task)};
-    std::vector<HostDeviceVector<bst_node_t>> position(1);
-    TrainParam param;
-    param.Init(Args{});
-    updater->Configure(Args{});
-    updater->Update(&param, &gpair, p_dmat.get(), position, {&expected_tree});
-  }
-
-  auto constexpr kWorldSize = 2;
-  collective::TestDistributedGlobal(
-      kWorldSize, [&] { VerifyColumnSplit(kRows, kCols, categorical, expected_tree); });
-}
-}  // anonymous namespace
-
-TEST(GrowHistMaker, ColumnSplitNumerical) { TestColumnSplit(false); }
-
-TEST(GrowHistMaker, ColumnSplitCategorical) { TestColumnSplit(true); }
 }  // namespace xgboost::tree
diff --git a/tests/cpp/tree/test_quantile_hist.cc b/tests/cpp/tree/test_quantile_hist.cc
index 29ae02f8d2b2..74fd6ec5ff79 100644
--- a/tests/cpp/tree/test_quantile_hist.cc
+++ b/tests/cpp/tree/test_quantile_hist.cc
@@ -11,9 +11,9 @@
 
 #include "../../../src/tree/common_row_partitioner.h"
 #include "../../../src/tree/hist/expand_entry.h"  // for MultiExpandEntry, CPUExpandEntry
-#include "../../../src/tree/param.h"
 #include "../collective/test_worker.h"  // for TestDistributedGlobal
 #include "../helpers.h"
+#include "test_column_split.h"  // for TestColumnSplit
 #include "test_partitioner.h"
 #include "xgboost/data.h"
 
@@ -208,57 +208,26 @@ TEST(QuantileHist, PartitionerColSplit) { TestColumnSplitPartitioner(3); }
 
 namespace {
-void VerifyColumnSplit(Context const* ctx, bst_idx_t rows, bst_feature_t cols,
-                       bst_target_t n_targets, RegTree const& expected_tree) {
-  auto Xy = RandomDataGenerator{rows, cols, 0}.GenerateDMatrix(true);
-  linalg::Matrix<GradientPair> gpair = GenerateRandomGradients(ctx, rows, n_targets);
-
-  ObjInfo task{ObjInfo::kRegression};
-  std::unique_ptr<TreeUpdater> updater{TreeUpdater::Create("grow_quantile_histmaker", ctx, &task)};
updater{TreeUpdater::Create("grow_quantile_histmaker", ctx, &task)}; - std::vector> position(1); - - std::unique_ptr sliced{Xy->SliceCol(collective::GetWorldSize(), collective::GetRank())}; - - RegTree tree{n_targets, cols}; - TrainParam param; - param.Init(Args{}); - updater->Configure(Args{}); - updater->Update(¶m, &gpair, sliced.get(), position, {&tree}); - - Json json{Object{}}; - tree.SaveModel(&json); - Json expected_json{Object{}}; - expected_tree.SaveModel(&expected_json); - ASSERT_EQ(json, expected_json); -} - -void TestColumnSplit(bst_target_t n_targets) { - auto constexpr kRows = 32; - auto constexpr kCols = 16; - - RegTree expected_tree{n_targets, kCols}; - ObjInfo task{ObjInfo::kRegression}; - Context ctx; - { - auto Xy = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(true); - auto gpair = GenerateRandomGradients(&ctx, kRows, n_targets); - std::unique_ptr updater{ - TreeUpdater::Create("grow_quantile_histmaker", &ctx, &task)}; - std::vector> position(1); - TrainParam param; - param.Init(Args{}); - updater->Configure(Args{}); - updater->Update(¶m, &gpair, Xy.get(), position, {&expected_tree}); +class TestHistColSplit : public ::testing::TestWithParam> { + public: + void Run() { + auto [n_targets, categorical, sparsity] = GetParam(); + TestColumnSplit(n_targets, categorical, "grow_quantile_histmaker", sparsity); } - - auto constexpr kWorldSize = 2; - collective::TestDistributedGlobal(kWorldSize, [&] { - VerifyColumnSplit(&ctx, kRows, kCols, n_targets, std::cref(expected_tree)); - }); -} +}; } // anonymous namespace -TEST(QuantileHist, ColumnSplit) { TestColumnSplit(1); } - -TEST(QuantileHist, ColumnSplitMultiTarget) { TestColumnSplit(3); } +TEST_P(TestHistColSplit, Basic) { this->Run(); } + +INSTANTIATE_TEST_SUITE_P(ColumnSplit, TestHistColSplit, ::testing::ValuesIn([]() { + std::vector> params; + for (auto categorical : {true, false}) { + for (auto sparsity : {0.0f, 0.6f}) { + for (bst_target_t n_targets : {1u, 3u}) { + params.emplace_back(n_targets, categorical, sparsity); + } + } + } + return params; + }())); } // namespace xgboost::tree From 5fea9d24f2e286510178e2ba9a55829e695356e8 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Fri, 12 Jul 2024 05:18:23 +0800 Subject: [PATCH 21/51] Small cleanup for CMake scripts. (#10573) - Remove rabit. 
--- src/CMakeLists.txt | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 49290179a899..51ce25607fa0 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -9,7 +9,6 @@ target_sources(objxgboost PRIVATE ${CPU_SOURCES}) set_source_files_properties( predictor/predictor.cc gbm/gbm.cc tree/tree_updater.cc metric/metric.cc objective/objective.cc PROPERTIES SKIP_UNITY_BUILD_INCLUSION ON) -target_sources(objxgboost PRIVATE ${RABIT_SOURCES}) if(USE_CUDA) file(GLOB_RECURSE CUDA_SOURCES *.cu *.cuh) @@ -23,8 +22,7 @@ endif() target_include_directories(objxgboost PRIVATE ${xgboost_SOURCE_DIR}/include - ${xgboost_SOURCE_DIR}/dmlc-core/include - ${xgboost_SOURCE_DIR}/rabit/include) + ${xgboost_SOURCE_DIR}/dmlc-core/include) if(LOG_CAPI_INVOCATION) target_compile_definitions(objxgboost PRIVATE -DLOG_CAPI_INVOCATION=1) From ce97de2a7c3958650c033f6c0fed8ac4a48b69f5 Mon Sep 17 00:00:00 2001 From: Dmitry Razdoburdin Date: Fri, 12 Jul 2024 12:28:54 +0200 Subject: [PATCH 22/51] replace channel for sycl dependencies (#10576) Co-authored-by: Dmitry Razdoburdin <> --- tests/ci_build/conda_env/linux_sycl_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ci_build/conda_env/linux_sycl_test.yml b/tests/ci_build/conda_env/linux_sycl_test.yml index 7335b7f20fd5..e82a6bed62f5 100644 --- a/tests/ci_build/conda_env/linux_sycl_test.yml +++ b/tests/ci_build/conda_env/linux_sycl_test.yml @@ -1,7 +1,7 @@ name: linux_sycl_test channels: - conda-forge -- intel +- https://software.repos.intel.com/python/conda/ dependencies: - python=3.8 - cmake From 6fc10885920a734e261a5eb7bed44b0fa015f60f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 14 Jul 2024 15:25:52 +0800 Subject: [PATCH 23/51] Bump org.apache.maven.plugins:maven-project-info-reports-plugin (#10497) Bumps [org.apache.maven.plugins:maven-project-info-reports-plugin](https://github.com/apache/maven-project-info-reports-plugin) from 3.5.0 to 3.6.1. - [Commits](https://github.com/apache/maven-project-info-reports-plugin/compare/maven-project-info-reports-plugin-3.5.0...maven-project-info-reports-plugin-3.6.1) --- updated-dependencies: - dependency-name: org.apache.maven.plugins:maven-project-info-reports-plugin dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- jvm-packages/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml index c6f0fe45ef6c..6a3e902577fc 100644 --- a/jvm-packages/pom.xml +++ b/jvm-packages/pom.xml @@ -470,7 +470,7 @@ maven-project-info-reports-plugin - 3.5.0 + 3.6.1 net.alchim31.maven From 5b7c68946da673cefb00602ab4917c275e2688a3 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 14 Jul 2024 15:26:17 +0800 Subject: [PATCH 24/51] Bump org.apache.flink:flink-clients in /jvm-packages (#10517) Bumps [org.apache.flink:flink-clients](https://github.com/apache/flink) from 1.19.0 to 1.19.1. - [Commits](https://github.com/apache/flink/compare/release-1.19.0...release-1.19.1) --- updated-dependencies: - dependency-name: org.apache.flink:flink-clients dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- jvm-packages/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml index 6a3e902577fc..4ae3f1a4ad0f 100644 --- a/jvm-packages/pom.xml +++ b/jvm-packages/pom.xml @@ -33,7 +33,7 @@ UTF-8 1.8 1.8 - 1.19.0 + 1.19.1 4.13.2 3.5.1 3.5.1 From 7996914a2d6f44eff280c7df6712de0a7070ded5 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 14 Jul 2024 17:27:17 +0800 Subject: [PATCH 25/51] Bump org.apache.maven.plugins:maven-surefire-plugin (#10429) Bumps [org.apache.maven.plugins:maven-surefire-plugin](https://github.com/apache/maven-surefire) from 3.2.5 to 3.3.0. - [Release notes](https://github.com/apache/maven-surefire/releases) - [Commits](https://github.com/apache/maven-surefire/compare/surefire-3.2.5...surefire-3.3.0) --- updated-dependencies: - dependency-name: org.apache.maven.plugins:maven-surefire-plugin dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- jvm-packages/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml index 4ae3f1a4ad0f..f914cdbd0d19 100644 --- a/jvm-packages/pom.xml +++ b/jvm-packages/pom.xml @@ -447,7 +447,7 @@ org.apache.maven.plugins maven-surefire-plugin - 3.2.5 + 3.3.1 false false From 8b77964d035c6f6796fe48ab91c79c1301244b01 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 15 Jul 2024 15:04:30 +0800 Subject: [PATCH 26/51] Bump commons-logging:commons-logging in /jvm-packages/xgboost4j-spark (#10547) Bumps commons-logging:commons-logging from 1.3.2 to 1.3.3. --- updated-dependencies: - dependency-name: commons-logging:commons-logging dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Jiaming Yuan --- jvm-packages/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml index f914cdbd0d19..48612b7b1812 100644 --- a/jvm-packages/pom.xml +++ b/jvm-packages/pom.xml @@ -494,7 +494,7 @@ commons-logging commons-logging - 1.3.2 + 1.3.3 org.scalatest From 0f789e2b2283d8e1625f766914740a5382d4ae4c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 15 Jul 2024 15:15:20 +0800 Subject: [PATCH 27/51] Bump org.apache.maven.plugins:maven-jar-plugin (#10458) Bumps [org.apache.maven.plugins:maven-jar-plugin](https://github.com/apache/maven-jar-plugin) from 3.4.1 to 3.4.2. - [Release notes](https://github.com/apache/maven-jar-plugin/releases) - [Commits](https://github.com/apache/maven-jar-plugin/compare/maven-jar-plugin-3.4.1...maven-jar-plugin-3.4.2) --- updated-dependencies: - dependency-name: org.apache.maven.plugins:maven-jar-plugin dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- jvm-packages/pom.xml | 2 +- jvm-packages/xgboost4j/pom.xml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml index 48612b7b1812..74324d0bd3fb 100644 --- a/jvm-packages/pom.xml +++ b/jvm-packages/pom.xml @@ -125,7 +125,7 @@ org.apache.maven.plugins maven-jar-plugin - 3.4.1 + 3.4.2 empty-javadoc-jar diff --git a/jvm-packages/xgboost4j/pom.xml b/jvm-packages/xgboost4j/pom.xml index acb724050f13..345098327f5c 100644 --- a/jvm-packages/xgboost4j/pom.xml +++ b/jvm-packages/xgboost4j/pom.xml @@ -106,7 +106,7 @@ org.apache.maven.plugins maven-jar-plugin - 3.4.1 + 3.4.2 From 5b68b6837916a51f2721685c848c0f493a288b01 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 15 Jul 2024 16:44:05 +0800 Subject: [PATCH 28/51] Bump org.apache.maven.plugins:maven-project-info-reports-plugin (#10585) Bumps [org.apache.maven.plugins:maven-project-info-reports-plugin](https://github.com/apache/maven-project-info-reports-plugin) from 3.6.1 to 3.6.2. - [Commits](https://github.com/apache/maven-project-info-reports-plugin/compare/maven-project-info-reports-plugin-3.6.1...maven-project-info-reports-plugin-3.6.2) --- updated-dependencies: - dependency-name: org.apache.maven.plugins:maven-project-info-reports-plugin dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- jvm-packages/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml index 74324d0bd3fb..cfccd813d972 100644 --- a/jvm-packages/pom.xml +++ b/jvm-packages/pom.xml @@ -470,7 +470,7 @@ maven-project-info-reports-plugin - 3.6.1 + 3.6.2 net.alchim31.maven From a81ccab7e5d65f7dee58ea0f731e505d9bceb68c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 15 Jul 2024 16:44:20 +0800 Subject: [PATCH 29/51] Bump org.apache.maven.plugins:maven-release-plugin (#10586) Bumps [org.apache.maven.plugins:maven-release-plugin](https://github.com/apache/maven-release) from 3.0.1 to 3.1.1. - [Release notes](https://github.com/apache/maven-release/releases) - [Commits](https://github.com/apache/maven-release/compare/maven-release-3.0.1...maven-release-3.1.1) --- updated-dependencies: - dependency-name: org.apache.maven.plugins:maven-release-plugin dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- jvm-packages/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml index cfccd813d972..b7311ade59f1 100644 --- a/jvm-packages/pom.xml +++ b/jvm-packages/pom.xml @@ -143,7 +143,7 @@ org.apache.maven.plugins maven-release-plugin - 3.0.1 + 3.1.1 true false From b7511cbd6f89c8d43be62bd0bac0ec6883e2a5ab Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 15 Jul 2024 16:44:36 +0800 Subject: [PATCH 30/51] Bump net.alchim31.maven:scala-maven-plugin in /jvm-packages/xgboost4j (#10536) Bumps net.alchim31.maven:scala-maven-plugin from 4.9.1 to 4.9.2. 
--- updated-dependencies: - dependency-name: net.alchim31.maven:scala-maven-plugin dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- jvm-packages/pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml index b7311ade59f1..3ca46ba403fd 100644 --- a/jvm-packages/pom.xml +++ b/jvm-packages/pom.xml @@ -412,7 +412,7 @@ net.alchim31.maven scala-maven-plugin - 4.9.1 + 4.9.2 compile @@ -475,7 +475,7 @@ net.alchim31.maven scala-maven-plugin - 4.9.1 + 4.9.2 -Xms64m From 17c64300e3aaa41ac2441c63435edc4e8c82edf0 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 15 Jul 2024 16:44:50 +0800 Subject: [PATCH 31/51] Bump org.apache.maven.plugins:maven-checkstyle-plugin in /jvm-packages (#10518) Bumps [org.apache.maven.plugins:maven-checkstyle-plugin](https://github.com/apache/maven-checkstyle-plugin) from 3.3.1 to 3.4.0. - [Commits](https://github.com/apache/maven-checkstyle-plugin/compare/maven-checkstyle-plugin-3.3.1...maven-checkstyle-plugin-3.4.0) --- updated-dependencies: - dependency-name: org.apache.maven.plugins:maven-checkstyle-plugin dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- jvm-packages/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml index 3ca46ba403fd..fb6aaf020c3e 100644 --- a/jvm-packages/pom.xml +++ b/jvm-packages/pom.xml @@ -394,7 +394,7 @@ org.apache.maven.plugins maven-checkstyle-plugin - 3.3.1 + 3.4.0 checkstyle.xml true From ab982e78739a12337c9b4ebef8a4b58978329aa8 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Mon, 15 Jul 2024 12:44:58 +0200 Subject: [PATCH 32/51] [R] Redesigned `xgboost()` interface skeleton (#10456) --------- Co-authored-by: Michael Mayer --- R-package/DESCRIPTION | 3 +- R-package/NAMESPACE | 1 + R-package/R/utils.R | 34 + R-package/R/xgb.Booster.R | 20 +- R-package/R/xgb.DMatrix.R | 30 - R-package/R/xgb.dump.R | 4 +- R-package/R/xgb.importance.R | 26 +- R-package/R/xgb.model.dt.tree.R | 5 +- R-package/R/xgb.plot.deepness.R | 5 +- R-package/R/xgb.plot.importance.R | 5 +- R-package/R/xgb.plot.multi.trees.R | 5 +- R-package/R/xgb.plot.shap.R | 10 +- R-package/R/xgb.plot.tree.R | 5 +- R-package/R/xgb.train.R | 12 +- R-package/R/xgboost.R | 1019 ++++++++++++++++- R-package/demo/basic_walkthrough.R | 23 +- R-package/man/print.xgb.Booster.Rd | 5 +- R-package/man/xgb.attr.Rd | 5 +- R-package/man/xgb.config.Rd | 5 +- R-package/man/xgb.dump.Rd | 4 +- R-package/man/xgb.importance.Rd | 26 +- R-package/man/xgb.model.dt.tree.Rd | 5 +- R-package/man/xgb.parameters.Rd | 5 +- R-package/man/xgb.plot.deepness.Rd | 5 +- R-package/man/xgb.plot.importance.Rd | 5 +- R-package/man/xgb.plot.multi.trees.Rd | 5 +- R-package/man/xgb.plot.shap.Rd | 10 +- R-package/man/xgb.plot.tree.Rd | 5 +- R-package/man/xgb.train.Rd | 35 +- R-package/man/xgboost.Rd | 213 ++++ R-package/tests/testthat/test_xgboost.R | 623 ++++++++++ R-package/vignettes/discoverYourData.Rmd | 29 +- R-package/vignettes/xgboostPresentation.Rmd | 37 +- .../feature_interaction_constraint.rst | 5 +- doc/tutorials/monotonic.rst | 5 +- 35 files changed, 1997 insertions(+), 242 deletions(-) create mode 100644 R-package/man/xgboost.Rd 
create mode 100644 R-package/tests/testthat/test_xgboost.R diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION index 98d31acf8c6c..32f8e70bec7f 100644 --- a/R-package/DESCRIPTION +++ b/R-package/DESCRIPTION @@ -57,7 +57,8 @@ Suggests: igraph (>= 1.0.1), float, titanic, - RhpcBLASctl + RhpcBLASctl, + survival Depends: R (>= 4.3.0) Imports: diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index c9e085e77e0a..f6cc9062ca4d 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -13,6 +13,7 @@ S3method(predict,xgb.Booster) S3method(print,xgb.Booster) S3method(print,xgb.DMatrix) S3method(print,xgb.cv.synchronous) +S3method(print,xgboost) S3method(setinfo,xgb.Booster) S3method(setinfo,xgb.DMatrix) S3method(variable.names,xgb.Booster) diff --git a/R-package/R/utils.R b/R-package/R/utils.R index 69f358751dc8..3f67ff23c9f7 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -30,6 +30,40 @@ NVL <- function(x, val) { return(c('rank:pairwise', 'rank:ndcg', 'rank:map')) } +.OBJECTIVES_NON_DEFAULT_MODE <- function() { + return(c("reg:logistic", "binary:logitraw", "multi:softmax")) +} + +.BINARY_CLASSIF_OBJECTIVES <- function() { + return(c("binary:logistic", "binary:hinge")) +} + +.MULTICLASS_CLASSIF_OBJECTIVES <- function() { + return("multi:softprob") +} + +.SURVIVAL_RIGHT_CENSORING_OBJECTIVES <- function() { # nolint + return(c("survival:cox", "survival:aft")) +} + +.SURVIVAL_ALL_CENSORING_OBJECTIVES <- function() { # nolint + return("survival:aft") +} + +.REGRESSION_OBJECTIVES <- function() { + return(c( + "reg:squarederror", "reg:squaredlogerror", "reg:logistic", "reg:pseudohubererror", + "reg:absoluteerror", "reg:quantileerror", "count:poisson", "reg:gamma", "reg:tweedie" + )) +} + +.MULTI_TARGET_OBJECTIVES <- function() { + return(c( + "reg:squarederror", "reg:squaredlogerror", "reg:logistic", "reg:pseudohubererror", + "reg:quantileerror", "reg:gamma" + )) +} + # # Low-level functions for boosting -------------------------------------------- diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R index 77b33f16db44..cfea11ae33c6 100644 --- a/R-package/R/xgb.Booster.R +++ b/R-package/R/xgb.Booster.R @@ -663,9 +663,8 @@ validate.features <- function(bst, newdata) { #' data(agaricus.train, package = "xgboost") #' train <- agaricus.train #' -#' bst <- xgboost( -#' data = train$data, -#' label = train$label, +#' bst <- xgb.train( +#' data = xgb.DMatrix(train$data, label = train$label), #' max_depth = 2, #' eta = 1, #' nthread = 2, @@ -767,9 +766,8 @@ xgb.attributes <- function(object) { #' data.table::setDTthreads(nthread) #' train <- agaricus.train #' -#' bst <- xgboost( -#' data = train$data, -#' label = train$label, +#' bst <- xgb.train( +#' data = xgb.DMatrix(train$data, label = train$label), #' max_depth = 2, #' eta = 1, #' nthread = nthread, @@ -817,9 +815,8 @@ xgb.config <- function(object) { #' data(agaricus.train, package = "xgboost") #' train <- agaricus.train #' -#' bst <- xgboost( -#' data = train$data, -#' label = train$label, +#' bst <- xgb.train( +#' data = xgb.DMatrix(train$data, label = train$label), #' max_depth = 2, #' eta = 1, #' nthread = 2, @@ -1230,9 +1227,8 @@ xgb.is.same.Booster <- function(obj1, obj2) { #' data(agaricus.train, package = "xgboost") #' train <- agaricus.train #' -#' bst <- xgboost( -#' data = train$data, -#' label = train$label, +#' bst <- xgb.train( +#' data = xgb.DMatrix(train$data, label = train$label), #' max_depth = 2, #' eta = 1, #' nthread = 2, diff --git a/R-package/R/xgb.DMatrix.R b/R-package/R/xgb.DMatrix.R index 
15f6faed0ba0..d87d1cbf71c2 100644 --- a/R-package/R/xgb.DMatrix.R +++ b/R-package/R/xgb.DMatrix.R @@ -853,36 +853,6 @@ xgb.DMatrix.hasinfo <- function(object, info) { } -# get dmatrix from data, label -# internal helper method -xgb.get.DMatrix <- function(data, label, missing, weight, nthread) { - if (inherits(data, "dgCMatrix") || is.matrix(data)) { - if (is.null(label)) { - stop("label must be provided when data is a matrix") - } - dtrain <- xgb.DMatrix(data, label = label, missing = missing, nthread = nthread) - if (!is.null(weight)) { - setinfo(dtrain, "weight", weight) - } - } else { - if (!is.null(label)) { - warning("xgboost: label will be ignored.") - } - if (is.character(data)) { - data <- path.expand(data) - dtrain <- xgb.DMatrix(data[1]) - } else if (inherits(data, "xgb.DMatrix")) { - dtrain <- data - } else if (inherits(data, "data.frame")) { - stop("xgboost doesn't support data.frame as input. Convert it to matrix first.") - } else { - stop("xgboost: invalid input data") - } - } - return(dtrain) -} - - #' Dimensions of xgb.DMatrix #' #' Returns a vector of numbers of rows and of columns in an \code{xgb.DMatrix}. diff --git a/R-package/R/xgb.dump.R b/R-package/R/xgb.dump.R index 2fa5bcb2f628..ef7202a1a5db 100644 --- a/R-package/R/xgb.dump.R +++ b/R-package/R/xgb.dump.R @@ -29,8 +29,8 @@ #' data(agaricus.test, package='xgboost') #' train <- agaricus.train #' test <- agaricus.test -#' bst <- xgboost(data = train$data, label = train$label, max_depth = 2, -#' eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic") +#' bst <- xgb.train(data = xgb.DMatrix(train$data, label = train$label), max_depth = 2, +#' eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic") #' # save the model in file 'xgb.model.dump' #' dump_path = file.path(tempdir(), 'model.dump') #' xgb.dump(bst, dump_path, with_stats = TRUE) diff --git a/R-package/R/xgb.importance.R b/R-package/R/xgb.importance.R index 547d9677b798..bbf816a0d6cc 100644 --- a/R-package/R/xgb.importance.R +++ b/R-package/R/xgb.importance.R @@ -46,9 +46,8 @@ #' # binomial classification using "gbtree": #' data(agaricus.train, package = "xgboost") #' -#' bst <- xgboost( -#' data = agaricus.train$data, -#' label = agaricus.train$label, +#' bst <- xgb.train( +#' data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label), #' max_depth = 2, #' eta = 1, #' nthread = 2, @@ -59,9 +58,8 @@ #' xgb.importance(model = bst) #' #' # binomial classification using "gblinear": -#' bst <- xgboost( -#' data = agaricus.train$data, -#' label = agaricus.train$label, +#' bst <- xgb.train( +#' data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label), #' booster = "gblinear", #' eta = 0.3, #' nthread = 1, @@ -73,9 +71,11 @@ #' # multiclass classification using "gbtree": #' nclass <- 3 #' nrounds <- 10 -#' mbst <- xgboost( -#' data = as.matrix(iris[, -5]), -#' label = as.numeric(iris$Species) - 1, +#' mbst <- xgb.train( +#' data = xgb.DMatrix( +#' as.matrix(iris[, -5]), +#' label = as.numeric(iris$Species) - 1 +#' ), #' max_depth = 3, #' eta = 0.2, #' nthread = 2, @@ -99,9 +99,11 @@ #' ) #' #' # multiclass classification using "gblinear": -#' mbst <- xgboost( -#' data = scale(as.matrix(iris[, -5])), -#' label = as.numeric(iris$Species) - 1, +#' mbst <- xgb.train( +#' data = xgb.DMatrix( +#' scale(as.matrix(iris[, -5])), +#' label = as.numeric(iris$Species) - 1 +#' ), #' booster = "gblinear", #' eta = 0.2, #' nthread = 1, diff --git a/R-package/R/xgb.model.dt.tree.R b/R-package/R/xgb.model.dt.tree.R index 
ff416b73e38a..73cdecc5c3ae 100644 --- a/R-package/R/xgb.model.dt.tree.R +++ b/R-package/R/xgb.model.dt.tree.R @@ -43,9 +43,8 @@ #' nthread <- 1 #' data.table::setDTthreads(nthread) #' -#' bst <- xgboost( -#' data = agaricus.train$data, -#' label = agaricus.train$label, +#' bst <- xgb.train( +#' data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label), #' max_depth = 2, #' eta = 1, #' nthread = nthread, diff --git a/R-package/R/xgb.plot.deepness.R b/R-package/R/xgb.plot.deepness.R index 8e1972374546..956ee9c83fd0 100644 --- a/R-package/R/xgb.plot.deepness.R +++ b/R-package/R/xgb.plot.deepness.R @@ -48,9 +48,8 @@ #' data.table::setDTthreads(nthread) #' #' ## Change max_depth to a higher number to get a more significant result -#' bst <- xgboost( -#' data = agaricus.train$data, -#' label = agaricus.train$label, +#' bst <- xgb.train( +#' data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label), #' max_depth = 6, #' nthread = nthread, #' nrounds = 50, diff --git a/R-package/R/xgb.plot.importance.R b/R-package/R/xgb.plot.importance.R index 1848a3a86e53..199595cb8ddf 100644 --- a/R-package/R/xgb.plot.importance.R +++ b/R-package/R/xgb.plot.importance.R @@ -51,9 +51,8 @@ #' nthread <- 2 #' data.table::setDTthreads(nthread) #' -#' bst <- xgboost( -#' data = agaricus.train$data, -#' label = agaricus.train$label, +#' bst <- xgb.train( +#' data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label), #' max_depth = 3, #' eta = 1, #' nthread = nthread, diff --git a/R-package/R/xgb.plot.multi.trees.R b/R-package/R/xgb.plot.multi.trees.R index e6d678ee7a4f..19a114071509 100644 --- a/R-package/R/xgb.plot.multi.trees.R +++ b/R-package/R/xgb.plot.multi.trees.R @@ -35,9 +35,8 @@ #' nthread <- 2 #' data.table::setDTthreads(nthread) #' -#' bst <- xgboost( -#' data = agaricus.train$data, -#' label = agaricus.train$label, +#' bst <- xgb.train( +#' data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label), #' max_depth = 15, #' eta = 1, #' nthread = nthread, diff --git a/R-package/R/xgb.plot.shap.R b/R-package/R/xgb.plot.shap.R index 788a095399ed..be3f7116034c 100644 --- a/R-package/R/xgb.plot.shap.R +++ b/R-package/R/xgb.plot.shap.R @@ -82,9 +82,8 @@ #' data.table::setDTthreads(nthread) #' nrounds <- 20 #' -#' bst <- xgboost( -#' agaricus.train$data, -#' agaricus.train$label, +#' bst <- xgb.train( +#' data = xgb.DMatrix(agaricus.train$data, agaricus.train$label), #' nrounds = nrounds, #' eta = 0.1, #' max_depth = 3, @@ -108,9 +107,8 @@ #' set.seed(123) #' is.na(x[sample(nrow(x) * 4, 30)]) <- TRUE # introduce some missing values #' -#' mbst <- xgboost( -#' data = x, -#' label = as.numeric(iris$Species) - 1, +#' mbst <- xgb.train( +#' data = xgb.DMatrix(x, label = as.numeric(iris$Species) - 1), #' nrounds = nrounds, #' max_depth = 2, #' eta = 0.3, diff --git a/R-package/R/xgb.plot.tree.R b/R-package/R/xgb.plot.tree.R index 5ed1e70f695a..502de3f52d61 100644 --- a/R-package/R/xgb.plot.tree.R +++ b/R-package/R/xgb.plot.tree.R @@ -68,9 +68,8 @@ #' @examples #' data(agaricus.train, package = "xgboost") #' -#' bst <- xgboost( -#' data = agaricus.train$data, -#' label = agaricus.train$label, +#' bst <- xgb.train( +#' data = xgb.DMatrix(agaricus.train$data, agaricus.train$label), #' max_depth = 3, #' eta = 1, #' nthread = 2, diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R index 30bf1f1ea149..a0933213be5a 100644 --- a/R-package/R/xgb.train.R +++ b/R-package/R/xgb.train.R @@ -182,12 +182,6 @@ #' as R attributes, and thus do not get saved when using XGBoost's own 
serializers like
 #' \link{xgb.save} (but are kept when using R serializers like \link{saveRDS}).
 #' @param ... other parameters to pass to \code{params}.
-#' @param label vector of response values. Should not be provided when data is
-#'        a local data file name or an \code{xgb.DMatrix}.
-#' @param missing by default is set to NA, which means that NA values should be considered as 'missing'
-#'        by the algorithm. Sometimes, 0 or other extreme value might be used to represent missing values.
-#'        This parameter is only used when input is a dense matrix.
-#' @param weight a vector indicating the weight for each row of the input.
 #'
 #' @return
 #' An object of class \code{xgb.Booster}.
@@ -328,12 +322,10 @@ #' early_stopping_rounds = 3)
 #'
 #' ## An 'xgboost' interface example:
-#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
-#'                max_depth = 2, eta = 1, nthread = nthread, nrounds = 2,
-#'                objective = "binary:logistic")
+#' bst <- xgboost(x = agaricus.train$data, y = factor(agaricus.train$label),
+#'                max_depth = 2, eta = 1, nthreads = nthread, nrounds = 2)
 #' pred <- predict(bst, agaricus.test$data)
 #'
-#' @rdname xgb.train
 #' @export
 xgb.train <- function(params = list(), data, nrounds, evals = list(),
                       obj = NULL, feval = NULL, verbose = 1, print_every_n = 1L,
diff --git a/R-package/R/xgboost.R b/R-package/R/xgboost.R
index a1d37358162c..9ea66731bf81 100644
--- a/R-package/R/xgboost.R
+++ b/R-package/R/xgboost.R
@@ -1,32 +1,1005 @@
-# Simple interface for training an xgboost model that wraps \code{xgb.train}.
-# Its documentation is combined with xgb.train.
-#
-#' @rdname xgb.train
+prescreen.parameters <- function(params) {
+  if (!NROW(params)) {
+    return(list())
+  }
+  if (!is.list(params)) {
+    stop("'params' must be a list or NULL.")
+  }
+
+  params <- params[!vapply(params, is.null, logical(1))]
+
+  if ("num_class" %in% names(params)) {
+    stop("'num_class' cannot be manually specified for 'xgboost()'. Pass a factor 'y' instead.")
+  }
+  if ("process_type" %in% names(params)) {
+    if (params$process_type != "default") {
+      stop("Non-default 'process_type' is not supported for 'xgboost()'. Try 'xgb.train()'.")
+    }
+  }
+
+  return(params)
+}
+
+prescreen.objective <- function(objective) {
+  if (!is.null(objective)) {
+    if (objective %in% .OBJECTIVES_NON_DEFAULT_MODE()) {
+      stop(
+        "Objectives with non-default prediction mode (",
+        paste(.OBJECTIVES_NON_DEFAULT_MODE(), collapse = ", "),
+        ") are not supported in 'xgboost()'. Try 'xgb.train()'."
+      )
+    }
+
+    if (!is.character(objective) || length(objective) != 1L || is.na(objective)) {
+      stop("'objective' must be a single character/string variable.")
+    }
+  }
+}
+
+process.base.margin <- function(base_margin, nrows, ncols) {
+  if (!NROW(base_margin)) {
+    return(NULL)
+  }
+  if (is.array(base_margin) && length(dim(base_margin)) > 2) {
+    stop(
+      "'base_margin' should not have more than 2 dimensions for any objective (got: ",
+      length(dim(base_margin)),
+      " dimensions)."
+    )
+  }
+  if (inherits(base_margin, c("sparseMatrix", "sparseVector"))) {
+    warning(
+      "Got a sparse matrix type (class: ",
+      paste(class(base_margin), collapse = ", "),
+      ") for 'base_margin'. Will convert to dense matrix."
+    )
+    base_margin <- as.matrix(base_margin)
+  }
+  if (NROW(base_margin) != nrows) {
+    stop(
+      "'base_margin' has incorrect number of rows. Expected: ",
+      nrows,
+      ".
Got: ", + NROW(base_margin) + ) + } + + if (ncols == 1L) { + if (inherits(base_margin, c("matrix", "data.frame"))) { + if (ncol(base_margin) != 1L) { + stop("'base_margin' should be a 1-d vector for the given objective and data.") + } + if (is.data.frame(base_margin)) { + base_margin <- base_margin[[1L]] + } else { + base_margin <- base_margin[, 1L] + } + } + if (!is.numeric(base_margin)) { + base_margin <- as.numeric(base_margin) + } + } else { + supported_multicol <- c("matrix", "data.frame") + if (!inherits(base_margin, supported_multicol)) { + stop( + "'base_margin' should be a matrix with ", + ncols, + " columns for the given objective and data. Got class: ", + paste(class(base_margin), collapse = ", ") + ) + } + if (ncol(base_margin) != ncols) { + stop( + "'base_margin' has incorrect number of columns. Expected: ", + ncols, + ". Got: ", + ncol(base_margin) + ) + } + if (!is.matrix(base_margin)) { + base_margin <- as.matrix(base_margin) + } + } + + return(base_margin) +} + +process.y.margin.and.objective <- function( + y, + base_margin, + objective, + params +) { + + if (!NROW(y)) { + stop("Passed empty 'y'.") + } + + if (is.array(y) && length(dim(y)) > 2) { + stop( + "'y' should not have more than 2 dimensions for any objective (got: ", + length(dim(y)), + ")." + ) + } + + if (inherits(y, c("sparseMatrix", "sparseVector"))) { + warning( + "Got a sparse matrix type (class: ", + paste(class(y), collapse = ", "), + ") for 'y'. Will convert to dense matrix." + ) + y <- as.matrix(y) + } + + if (is.character(y)) { + if (!is.vector(y)) { + if (NCOL(y) > 1L) { + stop("Multi-column categorical 'y' is not supported.") + } + y <- as.vector(y) + } + y <- factor(y) + } + + if (is.logical(y)) { + if (!is.vector(y)) { + if (NCOL(y) > 1L) { + stop("Multi-column logical/boolean 'y' is not supported.") + } + y <- as.vector(y) + } + y <- factor(y, c(FALSE, TRUE)) + } + + if (is.factor(y)) { + + y_levels <- levels(y) + if (length(y_levels) < 2) { + stop("Factor 'y' has less than 2 levels.") + } + if (length(y_levels) == 2) { + if (is.null(objective)) { + objective <- "binary:logistic" + } else { + if (!(objective %in% .BINARY_CLASSIF_OBJECTIVES())) { + stop( + "Got binary 'y' - supported objectives for this data are: ", + paste(.BINARY_CLASSIF_OBJECTIVES(), collapse = ", "), + ". Was passed: ", + objective + ) + } + } + + if (!is.null(base_margin)) { + base_margin <- process.base.margin(base_margin, length(y), 1) + } + + out <- list( + params = list( + objective = objective + ), + metadata = list( + y_levels = y_levels, + n_targets = 1 + ) + ) + } else { # length(levels) > 2 + if (is.null(objective)) { + objective <- "multi:softprob" + } else { + if (!(objective %in% .MULTICLASS_CLASSIF_OBJECTIVES())) { + stop( + "Got non-binary factor 'y' - supported objectives for this data are: ", + paste(.MULTICLASS_CLASSIF_OBJECTIVES(), collapse = ", "), + ". 
Was passed: ", + objective + ) + } + } + + if (!is.null(base_margin)) { + base_margin <- process.base.margin(base_margin, length(y), length(y_levels)) + } + + out <- list( + params = list( + objective = objective, + num_class = length(y_levels) + ), + metadata = list( + y_levels = y_levels, + n_targets = length(y_levels) + ) + ) + } + + out$dmatrix_args <- list( + label = as.numeric(y) - 1, + base_margin = base_margin + ) + + } else if (inherits(y, "Surv")) { + + y_attr <- attributes(y) + supported_surv_types <- c("left", "right", "interval") + if (!(y_attr$type %in% supported_surv_types)) { + stop( + "Survival objectives are only supported for types: ", + paste(supported_surv_types, collapse = ", "), + ". Was passed: ", + y_attr$type + ) + } + + if (is.null(objective)) { + objective <- "survival:aft" + } else { + if (y_attr$type == "right") { + if (!(objective %in% .SURVIVAL_RIGHT_CENSORING_OBJECTIVES())) { + stop( + "Got right-censored 'y' variable - supported objectives for this data are: ", + paste(.SURVIVAL_RIGHT_CENSORING_OBJECTIVES(), collapse = ", "), + ". Was passed: ", + objective + ) + } + } else { + if (!(objective %in% .SURVIVAL_ALL_CENSORING_OBJECTIVES())) { + stop( + "Got ", y_attr$type, "-censored 'y' variable - supported objectives for this data are:", + paste(.SURVIVAL_ALL_CENSORING_OBJECTIVES(), collapse = ", "), + ". Was passed: ", + objective + ) + } + } + } + + if (!is.null(base_margin)) { + base_margin <- process.base.margin(base_margin, nrow(y), 1) + } + + out <- list( + params = list( + objective = objective + ), + metadata = list( + n_targets = 1 + ) + ) + + # Note: the 'Surv' object class that is passed as 'y' might have either 2 or 3 columns + # depending on the type of censoring, and the last column in both cases is the one that + # indicates the observation type (e.g. censored / uncensored). + # In the case of interval censoring, the second column will not always have values with + # infinites filled in. For more information, see the code behind the 'print.Surv' method. + + if (objective == "survival:cox") { + # Can only get here when using right censoring + if (y_attr$type != "right") { + stop("Internal error.") + } + + out$dmatrix_args <- list( + label = y[, 1L] * (2 * (y[, 2L] - 0.5)) + ) + + } else { + if (y_attr$type == "left") { + lb <- ifelse( + y[, 2L] == 0, + 0, + y[, 1L] + ) + ub <- y[, 1L] + out$dmatrix_args <- list( + label_lower_bound = lb, + label_upper_bound = ub + ) + } else if (y_attr$type == "right") { + lb <- y[, 1L] + ub <- ifelse( + y[, 2L] == 0, + Inf, + y[, 1L] + ) + out$dmatrix_args <- list( + label_lower_bound = lb, + label_upper_bound = ub + ) + } else if (y_attr$type == "interval") { + out$dmatrix_args <- list( + label_lower_bound = ifelse(y[, 3L] == 2, 0, y[, 1L]), + label_upper_bound = ifelse( + y[, 3L] == 0, Inf, + ifelse(y[, 3L] == 3, y[, 2L], y[, 1L]) + ) + ) + } + + if (min(out$dmatrix_args$label_lower_bound) < 0) { + stop("Survival objectives are only defined for non-negative 'y'.") + } + } + + out$dmatrix_args$base_margin <- base_margin + + } else if (is.vector(y)) { + + if (is.null(objective)) { + objective <- "reg:squarederror" + } else if (!(objective %in% .REGRESSION_OBJECTIVES())) { + stop( + "Got numeric 'y' - supported objectives for this data are: ", + paste(.REGRESSION_OBJECTIVES(), collapse = ", "), + ". 
Was passed: ", + objective + ) + } + + n_targets <- 1L + if (objective == "reg:quantileerror" && NROW(params$quantile_alpha) > 1) { + n_targets <- NROW(params$quantile_alpha) + } + + if (!is.null(base_margin)) { + base_margin <- process.base.margin(base_margin, length(y), n_targets) + } + + out <- list( + params = list( + objective = objective + ), + metadata = list( + n_targets = n_targets + ), + dmatrix_args = list( + label = as.numeric(y), + base_margin = base_margin + ) + ) + + } else if (is.data.frame(y)) { + if (ncol(y) == 1L) { + return(process.y.margin.and.objective(y[[1L]], base_margin, objective, params)) + } + + if (is.null(objective)) { + objective <- "reg:squarederror" + } else if (!(objective %in% .MULTI_TARGET_OBJECTIVES())) { + stop( + "Got multi-column 'y' - supported objectives for this data are: ", + paste(.MULTI_TARGET_OBJECTIVES(), collapse = ", "), + ". Was passed: ", + objective + ) + } + + y_names <- names(y) + y <- lapply(y, function(x) { + if (!inherits(x, c("numeric", "integer"))) { + stop( + "Multi-target 'y' only supports 'numeric' and 'integer' types. Got: ", + paste(class(x), collapse = ", ") + ) + } + return(as.numeric(x)) + }) + y <- as.data.frame(y) |> as.matrix() + + if (!is.null(base_margin)) { + base_margin <- process.base.margin(base_margin, length(y), ncol(y)) + } + + out <- list( + params = list( + objective = objective + ), + dmatrix_args = list( + label = y, + base_margin = base_margin + ), + metadata = list( + y_names = y_names, + n_targets = ncol(y) + ) + ) + + } else if (is.matrix(y)) { + if (ncol(y) == 1L) { + return(process.y.margin.and.objective(as.vector(y), base_margin, objective, params)) + } + + if (!is.null(objective) && !(objective %in% .MULTI_TARGET_OBJECTIVES())) { + stop( + "Got multi-column 'y' - supported objectives for this data are: ", + paste(.MULTI_TARGET_OBJECTIVES(), collapse = ", "), + ". Was passed: ", + objective + ) + } + if (is.null(objective)) { + objective <- "reg:squarederror" + } + + y_names <- colnames(y) + if (storage.mode(y) != "double") { + storage.mode(y) <- "double" + } + + if (!is.null(base_margin)) { + base_margin <- process.base.margin(base_margin, nrow(y), ncol(y)) + } + + out <- list( + params = list( + objective = objective + ), + dmatrix_args = list( + label = y, + base_margin = base_margin + ), + metadata = list( + n_targets = ncol(y) + ) + ) + + if (NROW(y_names) == ncol(y)) { + out$metadata$y_names <- y_names + } + + } else { + stop("Passed 'y' object with unsupported class: ", paste(class(y), collapse = ", ")) + } + + return(out) +} + +process.row.weights <- function(w, lst_args) { + if (!is.null(w)) { + if ("label" %in% names(lst_args$dmatrix_args)) { + nrow_y <- NROW(lst_args$dmatrix_args$label) + } else if ("label_lower_bound" %in% names(lst_args$dmatrix_args)) { + nrow_y <- length(lst_args$dmatrix_args$label_lower_bound) + } else { + stop("Internal error.") + } + if (!is.numeric(w)) { + w <- as.numeric(w) + } + if (length(w) != nrow_y) { + stop( + "'weights' must be a 1-d vector with the same length as 'y' (", + length(w), " vs. ", nrow_y, ")." 
+ ) + } + lst_args$dmatrix_args$weight <- w + } + return(lst_args) +} + +check.nthreads <- function(nthreads) { + if (is.null(nthreads)) { + return(1L) + } + if (!inherits(nthreads, c("numeric", "integer")) || !NROW(nthreads)) { + stop("'nthreads' must be a positive scalar value.") + } + if (length(nthreads) > 1L) { + nthreads <- utils::head(nthreads, 1L) + } + if (is.na(nthreads) || nthreads < 0) { + stop("Passed invalid 'nthreads': ", nthreads) + } + if (is.numeric(nthreads)) { + if (floor(nthreads) != nthreads) { + stop("'nthreads' must be an integer.") + } + } + return(as.integer(nthreads)) +} + +check.can.use.qdm <- function(x, params) { + if ("booster" %in% names(params)) { + if (params$booster == "gblinear") { + return(FALSE) + } + } + if ("tree_method" %in% names(params)) { + if (params$tree_method %in% c("exact", "approx")) { + return(FALSE) + } + } + return(TRUE) +} + +process.x.and.col.args <- function( + x, + monotone_constraints, + interaction_constraints, + feature_weights, + lst_args, + use_qdm +) { + if (is.null(x)) { + stop("'x' cannot be NULL.") + } + if (inherits(x, "xgb.DMatrix")) { + stop("Cannot pass 'xgb.DMatrix' as 'x' to 'xgboost()'. Try 'xgb.train()' instead.") + } + supported_x_types <- c("data.frame", "matrix", "dgTMatrix", "dgCMatrix", "dgRMatrix") + if (!inherits(x, supported_x_types)) { + stop( + "'x' must be one of the following classes: ", + paste(supported_x_types, collapse = ", "), + ". Got: ", + paste(class(x), collapse = ", ") + ) + } + if (use_qdm && inherits(x, "sparseMatrix") && !inherits(x, "dgRMatrix")) { + x <- methods::as(x, "RsparseMatrix") + if (!inherits(x, "RsparseMatrix")) { + stop("Internal error: casting sparse matrix did not yield 'dgRMatrix'.") + } + } + + if (NROW(feature_weights)) { + if (is.list(feature_weights)) { + feature_weights <- unlist(feature_weights) + } + if (!inherits(feature_weights, c("numeric", "integer"))) { + stop("'feature_weights' must be a numeric vector or named list matching to columns of 'x'.") + } + if (NROW(names(feature_weights)) && NROW(colnames(x))) { + matched <- match(colnames(x), names(feature_weights)) + matched <- matched[!is.na(matched)] + matched <- matched[!duplicated(matched)] + if (length(matched) > 0 && length(matched) < length(feature_weights)) { + stop( + "'feature_weights' names do not contain all columns of 'x'. Missing: ", + utils::head(setdiff(colnames(x), names(feature_weights))) + ) + } + if (length(matched)) { + feature_weights <- feature_weights[matched] + } else { + warning("Names of 'feature_weights' do not match with 'x'. Names will be ignored.") + } + } + + lst_args$dmatrix_args$feature_weights <- unname(feature_weights) + } + + if (NROW(monotone_constraints)) { + + if (NROW(monotone_constraints) > ncol(x)) { + stop( + "'monotone_constraints' contains more entries than there are columns in 'x' (", + NROW(monotone_constraints), " vs. ", ncol(x), ")." + ) + } + + if (is.list(monotone_constraints)) { + + if (!NROW(names(monotone_constraints))) { + stop( + "If passing 'monotone_constraints' as a named list,", + " must have names matching to columns of 'x'." 
+ ) + } + if (!NROW(colnames(x))) { + stop("If passing 'monotone_constraints' as a named list, 'x' must have column names.") + } + if (anyDuplicated(names(monotone_constraints))) { + stop( + "'monotone_constraints' contains duplicated names: ", + paste( + names(monotone_constraints)[duplicated(names(monotone_constraints))] |> utils::head(), + collapse = ", " + ) + ) + } + if (NROW(setdiff(names(monotone_constraints), colnames(x)))) { + stop( + "'monotone_constraints' contains column names not present in 'x': ", + paste(utils::head(names(monotone_constraints)), collapse = ", ") + ) + } + + vec_monotone_constr <- rep(0, ncol(x)) + matched <- match(names(monotone_constraints), colnames(x)) + vec_monotone_constr[matched] <- unlist(monotone_constraints) + lst_args$params$monotone_constraints <- unname(vec_monotone_constr) + + } else if (inherits(monotone_constraints, c("numeric", "integer"))) { + + if (NROW(names(monotone_constraints)) && NROW(colnames(x))) { + if (length(monotone_constraints) < ncol(x)) { + return( + process.x.and.col.args( + x, + as.list(monotone_constraints), + interaction_constraints, + feature_weights, + lst_args, + use_qdm + ) + ) + } else { + matched <- match(names(monotone_constraints), colnames(x)) + matched <- matched[!is.na(matched)] + matched <- matched[!duplicated(matched)] + if (length(matched)) { + monotone_constraints <- monotone_constraints[matched] + } else { + warning("Names of 'monotone_constraints' do not match with 'x'. Names will be ignored.") + } + } + } else { + if (length(monotone_constraints) != ncol(x)) { + stop( + "If passing 'monotone_constraints' as unnamed vector or not using column names,", + " must have length matching to number of columns in 'x'. Got: ", + length(monotone_constraints), " (vs. ", ncol(x), ")" + ) + } + } + + lst_args$params$monotone_constraints <- unname(monotone_constraints) + + } else if (is.character(monotone_constraints)) { + lst_args$params$monotone_constraints <- monotone_constraints + } else { + stop( + "Passed unsupported type for 'monotone_constraints': ", + paste(class(monotone_constraints), collapse = ", ") + ) + } + } + + if (NROW(interaction_constraints)) { + if (!is.list(interaction_constraints)) { + stop("'interaction_constraints' must be a list of vectors.") + } + cnames <- colnames(x) + lst_args$params$interaction_constraints <- lapply(interaction_constraints, function(idx) { + if (!NROW(idx)) { + stop("Elements in 'interaction_constraints' cannot be empty.") + } + + if (is.character(idx)) { + if (!NROW(cnames)) { + stop( + "Passed a character vector for 'interaction_constraints', but 'x' ", + "has no column names to match them against." + ) + } + out <- match(idx, cnames) - 1L + if (anyNA(out)) { + stop( + "'interaction_constraints' contains column names not present in 'x': ", + paste(utils::head(idx[which(is.na(out))]), collapse = ", ") + ) + } + return(out) + } else if (inherits(idx, c("numeric", "integer"))) { + if (anyNA(idx)) { + stop("'interaction_constraints' cannot contain NA values.") + } + if (min(idx) < 1) { + stop("Column indices for 'interaction_constraints' must follow base-1 indexing.") + } + if (max(idx) > ncol(x)) { + stop("'interaction_constraints' contains invalid column indices.") + } + if (is.numeric(idx)) { + if (any(idx != floor(idx))) { + stop( + "'interaction_constraints' must contain only integer indices. 
Got non-integer: ", + paste(utils::head(idx[which(idx != floor(idx))]), collapse = ", ") + ) + } + } + return(idx - 1L) + } else { + stop( + "Elements in 'interaction_constraints' must be vectors of types ", + "'integer', 'numeric', or 'character'. Got: ", + paste(class(idx), collapse = ", ") + ) + } + }) + } + + lst_args$dmatrix_args$data <- x + return(lst_args) +} + +#' @noMd #' @export -xgboost <- function(data = NULL, label = NULL, missing = NA, weight = NULL, - params = list(), nrounds, - verbose = 1, print_every_n = 1L, - early_stopping_rounds = NULL, maximize = NULL, - save_period = NULL, save_name = "xgboost.model", - xgb_model = NULL, callbacks = list(), ...) { - merged <- check.booster.params(params, ...) - dtrain <- xgb.get.DMatrix( - data = data, - label = label, - missing = missing, - weight = weight, - nthread = merged$nthread +#' @title Fit XGBoost Model +#' @description Fits an XGBoost model (boosted decision tree ensemble) to given x/y data. +#' +#' See the tutorial \href{https://xgboost.readthedocs.io/en/stable/tutorials/model.html}{ +#' Introduction to Boosted Trees} for a longer explanation of what XGBoost does. +#' +#' This function is intended to provide a more user-friendly interface for XGBoost that follows +#' R's conventions for model fitting and predictions, but which doesn't expose all of the +#' possible functionalities of the core XGBoost library. +#' +#' See \link{xgb.train} for a more flexible low-level alternative which is similar across different +#' language bindings of XGBoost and which exposes the full library's functionalities. +#' @details For package authors using `xgboost` as a dependency, it is highly recommended to use +#' \link{xgb.train} in package code instead of `xgboost()`, since it has a more stable interface +#' and performs fewer data conversions and copies along the way. +#' @references \itemize{ +#' \item Chen, Tianqi, and Carlos Guestrin. "Xgboost: A scalable tree boosting system." +#' Proceedings of the 22nd acm sigkdd international conference on knowledge discovery and +#' data mining. 2016. +#' \item \url{https://xgboost.readthedocs.io/en/stable/} +#' } +#' @param x The features / covariates. Can be passed as:\itemize{ +#' \item A numeric or integer `matrix`. +#' \item A `data.frame`, in which all columns are one of the following types:\itemize{ +#' \item `numeric` +#' \item `integer` +#' \item `logical` +#' \item `factor` +#' } +#' +#' Columns of `factor` type will be assumed to be categorical, while other column types will +#' be assumed to be numeric. +#' \item A sparse matrix from the `Matrix` package, either as `dgCMatrix` or `dgRMatrix` class. +#' } +#' +#' Note that categorical features are only supported for `data.frame` inputs, and are automatically +#' determined based on their types. See \link{xgb.train} with \link{xgb.DMatrix} for more flexible +#' variants that would allow something like categorical features on sparse matrices. +#' @param y The response variable. Allowed values are:\itemize{ +#' \item A numeric or integer vector (for regression tasks). +#' \item A factor or character vector (for binary and multi-class classification tasks). +#' \item A logical (boolean) vector (for binary classification tasks). +#' \item A numeric or integer matrix or `data.frame` with numeric/integer columns +#' (for multi-task regression tasks). +#' \item A `Surv` object from the `survival` package (for survival tasks). 
+#' }
+#'
+#' If `objective` is `NULL`, the right task will be determined automatically based on
+#' the class of `y`.
+#'
+#' If `objective` is not `NULL`, it must match with the type of `y` - e.g. `factor` types of `y`
+#' can only be used with classification objectives and vice-versa.
+#'
+#' For binary classification, the last factor level of `y` will be used as the "positive"
+#' class - that is, the numbers from `predict` will reflect the probabilities of belonging to this
+#' class instead of to the first factor level. If `y` is a `logical` vector, then `TRUE` will be
+#' set as the last level.
+#' @param objective Optimization objective to minimize based on the supplied data, to be passed
+#' by name as a string / character (e.g. `reg:absoluteerror`). See the
+#' \href{https://xgboost.readthedocs.io/en/stable/parameter.html#learning-task-parameters}{
+#' Learning Task Parameters} page for more detailed information on allowed values.
+#'
+#' If `NULL` (the default), will be automatically determined from `y` according to the following
+#' logic:\itemize{
+#' \item If `y` is a factor with 2 levels, will use `binary:logistic`.
+#' \item If `y` is a factor with more than 2 levels, will use `multi:softprob` (number of classes
+#' will be determined automatically, should not be passed under `params`).
+#' \item If `y` is a `Surv` object from the `survival` package, will use `survival:aft` (note that
+#' the only types supported are left / right / interval censored).
+#' \item Otherwise, will use `reg:squarederror`.
+#' }
+#'
+#' If `objective` is not `NULL`, it must match with the type of `y` - e.g. `factor` types of `y`
+#' can only be used with classification objectives and vice-versa.
+#'
+#' Note that not all possible `objective` values supported by the core XGBoost library are allowed
+#' here - for example, objectives which are a variation of another but with a different default
+#' prediction type (e.g. `multi:softmax` vs. `multi:softprob`) are not allowed, and neither are
+#' ranking objectives, nor custom objectives at the moment.
+#' @param nrounds Number of boosting iterations / rounds.
+#'
+#' Note that the number of default boosting rounds here is not automatically tuned, and different
+#' problems will have vastly different optimal numbers of boosting rounds.
+#' @param weights Sample weights for each row in `x` and `y`. If `NULL` (the default), each row
+#' will have the same weight.
+#'
+#' If not `NULL`, should be passed as a numeric vector with length matching to the number of
+#' rows in `x`.
+#' @param verbosity Verbosity of printing messages. Valid values are 0 (silent), 1 (warning),
+#' 2 (info), and 3 (debug).
+#' @param nthreads Number of parallel threads to use. If passing zero, will use all CPU threads.
+#' @param seed Seed to use for random number generation. If passing `NULL`, will draw a random
+#' number using R's PRNG system to use as seed.
+#' @param monotone_constraints Optional monotonicity constraints for features.
+#'
+#' Can be passed either as a named list (when `x` has column names), or as a vector. If passed
+#' as a vector and `x` has column names, will try to match the elements by name.
+#'
+#' A value of `+1` for a given feature makes the model predictions / scores constrained to be
+#' a monotonically increasing function of that feature (that is, as the value of the feature
+#' increases, the model prediction cannot decrease), while a value of `-1` makes it a monotonically
+#' decreasing function. A value of zero imposes no constraint.
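[Editorial note: to make the constraint semantics just described concrete, here is a minimal sketch against the xgboost() interface introduced by this patch; the choice of 'wt' as the constrained column is illustrative only, not part of the patch.]

library(xgboost)
data(mtcars)
# Force predictions to be non-increasing in 'wt'; columns not named in the
# list keep the implicit default of 0 (unconstrained).
model <- xgboost(
  mtcars[, -1], mtcars$mpg,
  nthreads = 1, nrounds = 5,
  monotone_constraints = list(wt = -1)
)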
+#' +#' The input for `monotone_constraints` can be a subset of the columns of `x` if named, in which +#' case the columns that are not referred to in `monotone_constraints` will be assumed to have +#' a value of zero (no constraint imposed on the model for those features). +#' +#' See the tutorial \href{https://xgboost.readthedocs.io/en/stable/tutorials/monotonic.html}{ +#' Monotonic Constraints} for a more detailed explanation. +#' @param interaction_constraints Constraints for interaction representing permitted interactions. +#' The constraints must be specified in the form of a list of vectors referencing columns in the +#' data, e.g. `list(c(1, 2), c(3, 4, 5))` (with these numbers being column indices, numeration +#' starting at 1 - i.e. the first sublist references the first and second columns) or +#' `list(c("Sepal.Length", "Sepal.Width"), c("Petal.Length", "Petal.Width"))` (references +#' columns by names), where each vector is a group of indices of features that are allowed to +#' interact with each other. +#' +#' See the tutorial +#' \href{https://xgboost.readthedocs.io/en/stable/tutorials/feature_interaction_constraint.html}{ +#' Feature Interaction Constraints} for more information. +#' @param feature_weights Feature weights for column sampling. +#' +#' Can be passed either as a vector with length matching to columns of `x`, or as a named +#' list (only if `x` has column names) with names matching to columns of 'x'. If it is a +#' named vector, will try to match the entries to column names of `x` by name. +#' +#' If `NULL` (the default), all columns will have the same weight. +#' @param base_margin Base margin used for boosting from existing model. +#' +#' If passing it, will start the gradient boosting procedure from the scores that are provided +#' here - for example, one can pass the raw scores from a previous model, or some per-observation +#' offset, or similar. +#' +#' Should be either a numeric vector or numeric matrix (for multi-class and multi-target objectives) +#' with the same number of rows as `x` and number of columns corresponding to number of optimization +#' targets, and should be in the untransformed scale (for example, for objective `binary:logistic`, +#' it should have log-odds, not probabilities; and for objective `multi:softprob`, should have +#' number of columns matching to number of classes in the data). +#' +#' Note that, if it contains more than one column, then columns will not be matched by name to +#' the corresponding `y` - `base_margin` should have the same column order that the model will use +#' (for example, for objective `multi:softprob`, columns of `base_margin` will be matched against +#' `levels(y)` by their position, regardless of what `colnames(base_margin)` returns). +#' +#' If `NULL`, will start from zero, but note that for most objectives, an intercept is usually +#' added (controllable through parameter `base_score` instead) when `base_margin` is not passed. +#' @param ... Other training parameters. See the online documentation +#' \href{https://xgboost.readthedocs.io/en/stable/parameter.html}{XGBoost Parameters} for +#' details about possible values and what they do. +#' +#' Note that not all possible values from the core XGBoost library are allowed as `params` for +#' 'xgboost()' - in particular, values which require an already-fitted booster object (such as +#' `process_type`) are not accepted here. +#' @return A model object, inheriting from both `xgboost` and `xgb.Booster`. 
Compared to the regular +#' `xgb.Booster` model class produced by \link{xgb.train}, this `xgboost` class will have an +#' additional attribute `metadata` containing information which is used for formatting prediction +#' outputs, such as class names for classification problems. +#' @examples +#' library(xgboost) +#' data(mtcars) +#' +#' # Fit a small regression model on the mtcars data +#' model_regression <- xgboost(mtcars[, -1], mtcars$mpg, nthreads = 1, nrounds = 3) +#' predict(model_regression, mtcars, validate_features = TRUE) +#' +#' # Task objective is determined automatically according to the type of 'y' +#' data(iris) +#' model_classif <- xgboost(iris[, -5], iris$Species, nthreads = 1, nrounds = 5) +#' predict(model_classif, iris, validate_features = TRUE) +xgboost <- function( + x, + y, + objective = NULL, + nrounds = 100L, + weights = NULL, + verbosity = 0L, + nthreads = parallel::detectCores(), + seed = 0L, + monotone_constraints = NULL, + interaction_constraints = NULL, + feature_weights = NULL, + base_margin = NULL, + ... +) { + # Note: '...' is a workaround, to be removed later by making all parameters be arguments + params <- list(...) + params <- prescreen.parameters(params) + prescreen.objective(objective) + use_qdm <- check.can.use.qdm(x, params) + lst_args <- process.y.margin.and.objective(y, base_margin, objective, params) + lst_args <- process.row.weights(weights, lst_args) + lst_args <- process.x.and.col.args( + x, + monotone_constraints, + interaction_constraints, + feature_weights, + lst_args, + use_qdm + ) + + if (use_qdm && "max_bin" %in% names(params)) { + lst_args$dmatrix_args$max_bin <- params$max_bin + } + + nthreads <- check.nthreads(nthreads) + lst_args$dmatrix_args$nthread <- nthreads + lst_args$params$nthread <- nthreads + lst_args$params$seed <- seed + + params <- c(lst_args$params, params) + + fn_dm <- if (use_qdm) xgb.QuantileDMatrix else xgb.DMatrix + dm <- do.call(fn_dm, lst_args$dmatrix_args) + model <- xgb.train( + params = params, + data = dm, + nrounds = nrounds, + verbose = verbosity ) + attributes(model)$metadata <- lst_args$metadata + attributes(model)$call <- match.call() + class(model) <- c("xgboost", class(model)) + return(model) +} - evals <- list(train = dtrain) +#' @export +print.xgboost <- function(x, ...) { + cat("XGBoost model object\n") + cat("Call:\n ") + print(attributes(x)$call) + cat("Objective: ", attributes(x)$params$objective, "\n", sep = "") + cat("Number of iterations: ", xgb.get.num.boosted.rounds(x), "\n", sep = "") + cat("Number of features: ", xgb.num_feature(x), "\n", sep = "") - bst <- xgb.train(params, dtrain, nrounds, evals, verbose = verbose, print_every_n = print_every_n, - early_stopping_rounds = early_stopping_rounds, maximize = maximize, - save_period = save_period, save_name = save_name, - xgb_model = xgb_model, callbacks = callbacks, ...) 
-  return(bst)
+  printable_head <- function(v) {
+    v_sub <- utils::head(v, 5L)
+    return(
+      sprintf(
+        "%s%s",
+        paste(v_sub, collapse = ", "),
+        ifelse(length(v_sub) < length(v), ", ...", "")
+      )
+    )
+  }
+
+  if (NROW(attributes(x)$metadata$y_levels)) {
+    cat(
+      "Classes: ",
+      printable_head(attributes(x)$metadata$y_levels),
+      "\n",
+      sep = ""
+    )
+  } else if (NROW(attributes(x)$params$quantile_alpha)) {
+    cat(
+      "Prediction quantile",
+      ifelse(length(attributes(x)$params$quantile_alpha) > 1L, "s", ""),
+      ": ",
+      printable_head(attributes(x)$params$quantile_alpha),
+      "\n",
+      sep = ""
+    )
+  } else if (NROW(attributes(x)$metadata$y_names)) {
+    cat(
+      "Prediction targets: ",
+      printable_head(attributes(x)$metadata$y_names),
+      "\n",
+      sep = ""
+    )
+  } else if (attributes(x)$metadata$n_targets > 1L) {
+    cat(
+      "Number of prediction targets: ",
+      attributes(x)$metadata$n_targets,
+      "\n",
+      sep = ""
+    )
+  }
+
+  return(invisible(x))
 }
+
 #' Training part from Mushroom Data Set
 #'
 #' This data set is originally from the Mushroom data set,
diff --git a/R-package/demo/basic_walkthrough.R b/R-package/demo/basic_walkthrough.R
index 9403bac2064c..c65790109fc2 100644
--- a/R-package/demo/basic_walkthrough.R
+++ b/R-package/demo/basic_walkthrough.R
@@ -16,29 +16,28 @@ class(train$data)
 # note: we are putting in sparse matrix here, xgboost naturally handles sparse input
 # use sparse matrix when your feature is sparse (e.g. when you are using one-hot encoding vector)
 print("Training xgboost with sparseMatrix")
-bst <- xgboost(data = train$data, label = train$label, max_depth = 2, eta = 1, nrounds = 2,
-               nthread = 2, objective = "binary:logistic")
+bst <- xgboost(x = train$data, y = factor(train$label, c(0, 1)),
+               max_depth = 2, eta = 1,
+               nrounds = 2, nthreads = 2)
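[Editorial note: as a side note to the demo comment above about sparse one-hot encoded features, a small sketch not part of the demo; Matrix::sparse.model.matrix() is the usual way to build such an input.]

library(Matrix)
df <- data.frame(color = factor(c("red", "green", "blue", "red")), size = c(1, 2, 3, 4))
# '~ . - 1' drops the intercept; the factor 'color' becomes one-hot columns and
# the result is a dgCMatrix, which xgboost() accepts directly as 'x'.
x_onehot <- sparse.model.matrix(~ . - 1, data = df)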
 # alternatively, you can put in dense matrix, i.e. basic R-matrix
 print("Training xgboost with Matrix")
-bst <- xgboost(data = as.matrix(train$data), label = train$label, max_depth = 2, eta = 1, nrounds = 2,
-               nthread = 2, objective = "binary:logistic")
+bst <- xgboost(x = as.matrix(train$data), y = factor(train$label, c(0, 1)),
+               max_depth = 2, eta = 1,
+               nrounds = 2, nthreads = 2)
 # you can also put in xgb.DMatrix object, which stores label, data and other metadata needed for advanced features
 print("Training xgboost with xgb.DMatrix")
 dtrain <- xgb.DMatrix(data = train$data, label = train$label)
-bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nrounds = 2, nthread = 2,
-               objective = "binary:logistic")
+params <- list(max_depth = 2, eta = 1, nthread = 2, objective = "binary:logistic")
+bst <- xgb.train(data = dtrain, params = params, nrounds = 2)
 # Verbose = 0,1,2
 print("Train xgboost with verbose 0, no message")
-bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nrounds = 2,
-               nthread = 2, objective = "binary:logistic", verbose = 0)
+bst <- xgb.train(data = dtrain, params = params, nrounds = 2, verbose = 0)
 print("Train xgboost with verbose 1, print evaluation metric")
-bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nrounds = 2,
-               nthread = 2, objective = "binary:logistic", verbose = 1)
+bst <- xgb.train(data = dtrain, params = params, nrounds = 2, verbose = 1)
 print("Train xgboost with verbose 2, also print information about tree")
-bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nrounds = 2,
-               nthread = 2, objective = "binary:logistic", verbose = 2)
+bst <- xgb.train(data = dtrain, params = params, nrounds = 2, verbose = 2)
 # you can also specify data as file path to a LIBSVM format input
 # since we do not have this file with us, the following line is just for illustration
diff --git a/R-package/man/print.xgb.Booster.Rd b/R-package/man/print.xgb.Booster.Rd
index 9a783efaff27..fc055318cd01 100644
--- a/R-package/man/print.xgb.Booster.Rd
+++ b/R-package/man/print.xgb.Booster.Rd
@@ -21,9 +21,8 @@ Print information about \code{xgb.Booster}.
 data(agaricus.train, package = "xgboost")
 train <- agaricus.train
-bst <- xgboost(
-  data = train$data,
-  label = train$label,
+bst <- xgb.train(
+  data = xgb.DMatrix(train$data, label = train$label),
   max_depth = 2,
   eta = 1,
   nthread = 2,
diff --git a/R-package/man/xgb.attr.Rd b/R-package/man/xgb.attr.Rd
index 8038a2048b70..f23e9234018a 100644
--- a/R-package/man/xgb.attr.Rd
+++ b/R-package/man/xgb.attr.Rd
@@ -64,9 +64,8 @@ example of these behaviors).
data(agaricus.train, package = "xgboost") train <- agaricus.train -bst <- xgboost( - data = train$data, - label = train$label, +bst <- xgb.train( + data = xgb.DMatrix(train$data, label = train$label), max_depth = 2, eta = 1, nthread = 2, diff --git a/R-package/man/xgb.config.Rd b/R-package/man/xgb.config.Rd index 1ab810644db9..dbad1d8cf043 100644 --- a/R-package/man/xgb.config.Rd +++ b/R-package/man/xgb.config.Rd @@ -35,9 +35,8 @@ nthread <- 1 data.table::setDTthreads(nthread) train <- agaricus.train -bst <- xgboost( - data = train$data, - label = train$label, +bst <- xgb.train( + data = xgb.DMatrix(train$data, label = train$label), max_depth = 2, eta = 1, nthread = nthread, diff --git a/R-package/man/xgb.dump.Rd b/R-package/man/xgb.dump.Rd index 6f97f69244b9..199ede1583f8 100644 --- a/R-package/man/xgb.dump.Rd +++ b/R-package/man/xgb.dump.Rd @@ -49,8 +49,8 @@ data(agaricus.train, package='xgboost') data(agaricus.test, package='xgboost') train <- agaricus.train test <- agaricus.test -bst <- xgboost(data = train$data, label = train$label, max_depth = 2, - eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic") +bst <- xgb.train(data = xgb.DMatrix(train$data, label = train$label), max_depth = 2, + eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic") # save the model in file 'xgb.model.dump' dump_path = file.path(tempdir(), 'model.dump') xgb.dump(bst, dump_path, with_stats = TRUE) diff --git a/R-package/man/xgb.importance.Rd b/R-package/man/xgb.importance.Rd index 73b91e8b4b28..76574b9cbf06 100644 --- a/R-package/man/xgb.importance.Rd +++ b/R-package/man/xgb.importance.Rd @@ -70,9 +70,8 @@ be on the same scale (which is also recommended when using L1 or L2 regularizati # binomial classification using "gbtree": data(agaricus.train, package = "xgboost") -bst <- xgboost( - data = agaricus.train$data, - label = agaricus.train$label, +bst <- xgb.train( + data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label), max_depth = 2, eta = 1, nthread = 2, @@ -83,9 +82,8 @@ bst <- xgboost( xgb.importance(model = bst) # binomial classification using "gblinear": -bst <- xgboost( - data = agaricus.train$data, - label = agaricus.train$label, +bst <- xgb.train( + data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label), booster = "gblinear", eta = 0.3, nthread = 1, @@ -97,9 +95,11 @@ xgb.importance(model = bst) # multiclass classification using "gbtree": nclass <- 3 nrounds <- 10 -mbst <- xgboost( - data = as.matrix(iris[, -5]), - label = as.numeric(iris$Species) - 1, +mbst <- xgb.train( + data = xgb.DMatrix( + as.matrix(iris[, -5]), + label = as.numeric(iris$Species) - 1 + ), max_depth = 3, eta = 0.2, nthread = 2, @@ -123,9 +123,11 @@ xgb.importance( ) # multiclass classification using "gblinear": -mbst <- xgboost( - data = scale(as.matrix(iris[, -5])), - label = as.numeric(iris$Species) - 1, +mbst <- xgb.train( + data = xgb.DMatrix( + scale(as.matrix(iris[, -5])), + label = as.numeric(iris$Species) - 1 + ), booster = "gblinear", eta = 0.2, nthread = 1, diff --git a/R-package/man/xgb.model.dt.tree.Rd b/R-package/man/xgb.model.dt.tree.Rd index 75f1cd0f4f77..e9536767986c 100644 --- a/R-package/man/xgb.model.dt.tree.Rd +++ b/R-package/man/xgb.model.dt.tree.Rd @@ -63,9 +63,8 @@ data(agaricus.train, package = "xgboost") nthread <- 1 data.table::setDTthreads(nthread) -bst <- xgboost( - data = agaricus.train$data, - label = agaricus.train$label, +bst <- xgb.train( + data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label), max_depth = 2, eta = 1, nthread = 
nthread, diff --git a/R-package/man/xgb.parameters.Rd b/R-package/man/xgb.parameters.Rd index 8d5044cab5cc..82977dc122d4 100644 --- a/R-package/man/xgb.parameters.Rd +++ b/R-package/man/xgb.parameters.Rd @@ -33,9 +33,8 @@ will reset its number of rounds indicator to zero. data(agaricus.train, package = "xgboost") train <- agaricus.train -bst <- xgboost( - data = train$data, - label = train$label, +bst <- xgb.train( + data = xgb.DMatrix(train$data, label = train$label), max_depth = 2, eta = 1, nthread = 2, diff --git a/R-package/man/xgb.plot.deepness.Rd b/R-package/man/xgb.plot.deepness.Rd index 43c0dac777f6..3da8e384e4a1 100644 --- a/R-package/man/xgb.plot.deepness.Rd +++ b/R-package/man/xgb.plot.deepness.Rd @@ -73,9 +73,8 @@ nthread <- 2 data.table::setDTthreads(nthread) ## Change max_depth to a higher number to get a more significant result -bst <- xgboost( - data = agaricus.train$data, - label = agaricus.train$label, +bst <- xgb.train( + data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label), max_depth = 6, nthread = nthread, nrounds = 50, diff --git a/R-package/man/xgb.plot.importance.Rd b/R-package/man/xgb.plot.importance.Rd index e9c5930c2d57..a9ebcbd2732a 100644 --- a/R-package/man/xgb.plot.importance.Rd +++ b/R-package/man/xgb.plot.importance.Rd @@ -88,9 +88,8 @@ data(agaricus.train) nthread <- 2 data.table::setDTthreads(nthread) -bst <- xgboost( - data = agaricus.train$data, - label = agaricus.train$label, +bst <- xgb.train( + data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label), max_depth = 3, eta = 1, nthread = nthread, diff --git a/R-package/man/xgb.plot.multi.trees.Rd b/R-package/man/xgb.plot.multi.trees.Rd index 7fa75c85d886..eae84d98edfd 100644 --- a/R-package/man/xgb.plot.multi.trees.Rd +++ b/R-package/man/xgb.plot.multi.trees.Rd @@ -67,9 +67,8 @@ data(agaricus.train, package = "xgboost") nthread <- 2 data.table::setDTthreads(nthread) -bst <- xgboost( - data = agaricus.train$data, - label = agaricus.train$label, +bst <- xgb.train( + data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label), max_depth = 15, eta = 1, nthread = nthread, diff --git a/R-package/man/xgb.plot.shap.Rd b/R-package/man/xgb.plot.shap.Rd index b460fa1fb3a6..f2d2ea2a05e6 100644 --- a/R-package/man/xgb.plot.shap.Rd +++ b/R-package/man/xgb.plot.shap.Rd @@ -135,9 +135,8 @@ nthread <- 1 data.table::setDTthreads(nthread) nrounds <- 20 -bst <- xgboost( - agaricus.train$data, - agaricus.train$label, +bst <- xgb.train( + data = xgb.DMatrix(agaricus.train$data, agaricus.train$label), nrounds = nrounds, eta = 0.1, max_depth = 3, @@ -161,9 +160,8 @@ x <- as.matrix(iris[, -5]) set.seed(123) is.na(x[sample(nrow(x) * 4, 30)]) <- TRUE # introduce some missing values -mbst <- xgboost( - data = x, - label = as.numeric(iris$Species) - 1, +mbst <- xgb.train( + data = xgb.DMatrix(x, label = as.numeric(iris$Species) - 1), nrounds = nrounds, max_depth = 2, eta = 0.3, diff --git a/R-package/man/xgb.plot.tree.Rd b/R-package/man/xgb.plot.tree.Rd index 69d37301dde6..6064107fc184 100644 --- a/R-package/man/xgb.plot.tree.Rd +++ b/R-package/man/xgb.plot.tree.Rd @@ -96,9 +96,8 @@ This function uses \href{https://www.graphviz.org/}{GraphViz} as DiagrammeR back \examples{ data(agaricus.train, package = "xgboost") -bst <- xgboost( - data = agaricus.train$data, - label = agaricus.train$label, +bst <- xgb.train( + data = xgb.DMatrix(agaricus.train$data, agaricus.train$label), max_depth = 3, eta = 1, nthread = 2, diff --git a/R-package/man/xgb.train.Rd b/R-package/man/xgb.train.Rd index 
f641b1374420..fc970e4fb493 100644
--- a/R-package/man/xgb.train.Rd
+++ b/R-package/man/xgb.train.Rd
@@ -1,8 +1,7 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/xgb.train.R, R/xgboost.R
+% Please edit documentation in R/xgb.train.R
 \name{xgb.train}
 \alias{xgb.train}
-\alias{xgboost}
 \title{eXtreme Gradient Boosting Training}
 \usage{
 xgb.train(
@@ -22,24 +21,6 @@ xgb.train(
   callbacks = list(),
   ...
 )
-
-xgboost(
-  data = NULL,
-  label = NULL,
-  missing = NA,
-  weight = NULL,
-  params = list(),
-  nrounds,
-  verbose = 1,
-  print_every_n = 1L,
-  early_stopping_rounds = NULL,
-  maximize = NULL,
-  save_period = NULL,
-  save_name = "xgboost.model",
-  xgb_model = NULL,
-  callbacks = list(),
-  ...
-)
 }
 \arguments{
 \item{params}{the list of parameters. The complete list of parameters is
@@ -240,15 +221,6 @@ to customize the training process.
 }\if{html}{\out{</div>}}}
 
 \item{...}{other parameters to pass to \code{params}.}
-
-\item{label}{vector of response values. Should not be provided when data is
-a local data file name or an \code{xgb.DMatrix}.}
-
-\item{missing}{by default is set to NA, which means that NA values should be considered as 'missing'
-by the algorithm. Sometimes, 0 or other extreme value might be used to represent missing values.
-This parameter is only used when input is a dense matrix.}
-
-\item{weight}{a vector indicating the weight for each row of the input.}
 }
 \value{
 An object of class \code{xgb.Booster}.
@@ -383,9 +355,8 @@ bst <- xgb.train(param, dtrain, nrounds = 25, evals = evals,
                  early_stopping_rounds = 3)
 
 ## An 'xgboost' interface example:
-bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
-               max_depth = 2, eta = 1, nthread = nthread, nrounds = 2,
-               objective = "binary:logistic")
+bst <- xgboost(x = agaricus.train$data, y = factor(agaricus.train$label),
+               max_depth = 2, eta = 1, nthreads = nthread, nrounds = 2)
 pred <- predict(bst, agaricus.test$data)
 
 }
diff --git a/R-package/man/xgboost.Rd b/R-package/man/xgboost.Rd
new file mode 100644
index 000000000000..4af8f25ecc04
--- /dev/null
+++ b/R-package/man/xgboost.Rd
@@ -0,0 +1,213 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/xgboost.R
+\name{xgboost}
+\alias{xgboost}
+\title{Fit XGBoost Model}
+\usage{
+xgboost(
+  x,
+  y,
+  objective = NULL,
+  nrounds = 100L,
+  weights = NULL,
+  verbosity = 0L,
+  nthreads = parallel::detectCores(),
+  seed = 0L,
+  monotone_constraints = NULL,
+  interaction_constraints = NULL,
+  feature_weights = NULL,
+  base_margin = NULL,
+  ...
+)
+}
+\arguments{
+\item{x}{The features / covariates. Can be passed as:\itemize{
+\item A numeric or integer `matrix`.
+\item A `data.frame`, in which all columns are one of the following types:\itemize{
+  \item `numeric`
+  \item `integer`
+  \item `logical`
+  \item `factor`
+}
+
+Columns of `factor` type will be assumed to be categorical, while other column types will
+be assumed to be numeric.
+\item A sparse matrix from the `Matrix` package, either as `dgCMatrix` or `dgRMatrix` class.
+}
+
+Note that categorical features are only supported for `data.frame` inputs, and are automatically
+determined based on their types. See \link{xgb.train} with \link{xgb.DMatrix} for more flexible
+variants that would allow something like categorical features on sparse matrices.}
+
+\item{y}{The response variable. Allowed values are:\itemize{
+\item A numeric or integer vector (for regression tasks).
+\item A factor or character vector (for binary and multi-class classification tasks).
+\item A logical (boolean) vector (for binary classification tasks).
+\item A numeric or integer matrix or `data.frame` with numeric/integer columns
+(for multi-task regression tasks).
+\item A `Surv` object from the `survival` package (for survival tasks).
+}
+
+If `objective` is `NULL`, the right task will be determined automatically based on
+the class of `y`.
+
+If `objective` is not `NULL`, it must match with the type of `y` - e.g. `factor` types of `y`
+can only be used with classification objectives and vice-versa.
+
+For binary classification, the last factor level of `y` will be used as the "positive"
+class - that is, the numbers from `predict` will reflect the probabilities of belonging to this
+class instead of to the first factor level. If `y` is a `logical` vector, then `TRUE` will be
+set as the last level.}
+
+\item{objective}{Optimization objective to minimize based on the supplied data, to be passed
+by name as a string / character (e.g. `reg:absoluteerror`). See the
+\href{https://xgboost.readthedocs.io/en/stable/parameter.html#learning-task-parameters}{
+Learning Task Parameters} page for more detailed information on allowed values.
+
+If `NULL` (the default), will be automatically determined from `y` according to the following
+logic:\itemize{
+\item If `y` is a factor with 2 levels, will use `binary:logistic`.
+\item If `y` is a factor with more than 2 levels, will use `multi:softprob` (number of classes
+will be determined automatically, should not be passed under `params`).
+\item If `y` is a `Surv` object from the `survival` package, will use `survival:aft` (note that
+the only types supported are left / right / interval censored).
+\item Otherwise, will use `reg:squarederror`.
+}
+
+If `objective` is not `NULL`, it must match with the type of `y` - e.g. `factor` types of `y`
+can only be used with classification objectives and vice-versa.
+
+Note that not all possible `objective` values supported by the core XGBoost library are allowed
+here - for example, objectives which are a variation of another but with a different default
+prediction type (e.g. `multi:softmax` vs. `multi:softprob`) are not allowed, and neither are
+ranking objectives, nor custom objectives at the moment.}
+
+\item{nrounds}{Number of boosting iterations / rounds.
+
+Note that the number of default boosting rounds here is not automatically tuned, and different
+problems will have vastly different optimal numbers of boosting rounds.}
+
+\item{weights}{Sample weights for each row in `x` and `y`. If `NULL` (the default), each row
+will have the same weight.
+
+If not `NULL`, should be passed as a numeric vector with length matching to the number of
+rows in `x`.}
+
+\item{verbosity}{Verbosity of printing messages. Valid values are 0 (silent), 1 (warning),
+2 (info), and 3 (debug).}
+
+\item{nthreads}{Number of parallel threads to use. If passing zero, will use all CPU threads.}
+
+\item{seed}{Seed to use for random number generation. If passing `NULL`, will draw a random
+number using R's PRNG system to use as seed.}
+
+\item{monotone_constraints}{Optional monotonicity constraints for features.
+
+Can be passed either as a named list (when `x` has column names), or as a vector. If passed
+as a vector and `x` has column names, will try to match the elements by name.
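[Editorial note: a quick illustration of the objective-inference rules documented above; this sketch is not part of the patch, and reading attributes(model)$params is the same access pattern the patch's print.xgboost() method uses.]

library(xgboost)
data(iris)
data(mtcars)
# A 3-level factor response: "multi:softprob" is inferred.
m_cls <- xgboost(iris[, -5], iris$Species, nthreads = 1, nrounds = 3)
attributes(m_cls)$params$objective
# A numeric response: "reg:squarederror" is inferred.
m_reg <- xgboost(mtcars[, -1], mtcars$mpg, nthreads = 1, nrounds = 3)
attributes(m_reg)$params$objective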
+ +A value of `+1` for a given feature makes the model predictions / scores constrained to be +a monotonically increasing function of that feature (that is, as the value of the feature +increases, the model prediction cannot decrease), while a value of `-1` makes it a monotonically +decreasing function. A value of zero imposes no constraint. + +The input for `monotone_constraints` can be a subset of the columns of `x` if named, in which +case the columns that are not referred to in `monotone_constraints` will be assumed to have +a value of zero (no constraint imposed on the model for those features). + +See the tutorial \href{https://xgboost.readthedocs.io/en/stable/tutorials/monotonic.html}{ +Monotonic Constraints} for a more detailed explanation.} + +\item{interaction_constraints}{Constraints for interaction representing permitted interactions. +The constraints must be specified in the form of a list of vectors referencing columns in the +data, e.g. `list(c(1, 2), c(3, 4, 5))` (with these numbers being column indices, numeration +starting at 1 - i.e. the first sublist references the first and second columns) or +`list(c("Sepal.Length", "Sepal.Width"), c("Petal.Length", "Petal.Width"))` (references +columns by names), where each vector is a group of indices of features that are allowed to +interact with each other. + +See the tutorial +\href{https://xgboost.readthedocs.io/en/stable/tutorials/feature_interaction_constraint.html}{ +Feature Interaction Constraints} for more information.} + +\item{feature_weights}{Feature weights for column sampling. + +Can be passed either as a vector with length matching to columns of `x`, or as a named +list (only if `x` has column names) with names matching to columns of 'x'. If it is a +named vector, will try to match the entries to column names of `x` by name. + +If `NULL` (the default), all columns will have the same weight.} + +\item{base_margin}{Base margin used for boosting from existing model. + +If passing it, will start the gradient boosting procedure from the scores that are provided +here - for example, one can pass the raw scores from a previous model, or some per-observation +offset, or similar. + +Should be either a numeric vector or numeric matrix (for multi-class and multi-target objectives) +with the same number of rows as `x` and number of columns corresponding to number of optimization +targets, and should be in the untransformed scale (for example, for objective `binary:logistic`, +it should have log-odds, not probabilities; and for objective `multi:softprob`, should have +number of columns matching to number of classes in the data). + +Note that, if it contains more than one column, then columns will not be matched by name to +the corresponding `y` - `base_margin` should have the same column order that the model will use +(for example, for objective `multi:softprob`, columns of `base_margin` will be matched against +`levels(y)` by their position, regardless of what `colnames(base_margin)` returns). + +If `NULL`, will start from zero, but note that for most objectives, an intercept is usually +added (controllable through parameter `base_score` instead) when `base_margin` is not passed.} + +\item{...}{Other training parameters. See the online documentation +\href{https://xgboost.readthedocs.io/en/stable/parameter.html}{XGBoost Parameters} for +details about possible values and what they do. 
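[Editorial note: one practical reading of the base_margin contract documented above is warm-starting from an earlier model's raw scores. The sketch below is an assumption-level illustration: model_prev, x, and y are placeholders, and it relies only on the existing outputmargin flag of predict().]

# Continue boosting from the raw (untransformed) scores of a previously
# fitted model instead of starting from zero.
margin <- predict(model_prev, x, outputmargin = TRUE)
model_cont <- xgboost(x, y, nrounds = 10, nthreads = 1, base_margin = margin)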
+ +Note that not all possible values from the core XGBoost library are allowed as `params` for +'xgboost()' - in particular, values which require an already-fitted booster object (such as +`process_type`) are not accepted here.} +} +\value{ +A model object, inheriting from both `xgboost` and `xgb.Booster`. Compared to the regular +`xgb.Booster` model class produced by \link{xgb.train}, this `xgboost` class will have an +additional attribute `metadata` containing information which is used for formatting prediction +outputs, such as class names for classification problems. +} +\description{ +Fits an XGBoost model (boosted decision tree ensemble) to given x/y data. + +See the tutorial \href{https://xgboost.readthedocs.io/en/stable/tutorials/model.html}{ +Introduction to Boosted Trees} for a longer explanation of what XGBoost does. + +This function is intended to provide a more user-friendly interface for XGBoost that follows +R's conventions for model fitting and predictions, but which doesn't expose all of the +possible functionalities of the core XGBoost library. + +See \link{xgb.train} for a more flexible low-level alternative which is similar across different +language bindings of XGBoost and which exposes the full library's functionalities. +} +\details{ +For package authors using `xgboost` as a dependency, it is highly recommended to use +\link{xgb.train} in package code instead of `xgboost()`, since it has a more stable interface +and performs fewer data conversions and copies along the way. +} +\examples{ +library(xgboost) +data(mtcars) + +# Fit a small regression model on the mtcars data +model_regression <- xgboost(mtcars[, -1], mtcars$mpg, nthreads = 1, nrounds = 3) +predict(model_regression, mtcars, validate_features = TRUE) + +# Task objective is determined automatically according to the type of 'y' +data(iris) +model_classif <- xgboost(iris[, -5], iris$Species, nthreads = 1, nrounds = 5) +predict(model_classif, iris, validate_features = TRUE) +} +\references{ +\itemize{ +\item Chen, Tianqi, and Carlos Guestrin. "Xgboost: A scalable tree boosting system." +Proceedings of the 22nd acm sigkdd international conference on knowledge discovery and +data mining. 2016. 
+\item \url{https://xgboost.readthedocs.io/en/stable/}
+}
+}
diff --git a/R-package/tests/testthat/test_xgboost.R b/R-package/tests/testthat/test_xgboost.R
new file mode 100644
index 000000000000..a4ac658a11b8
--- /dev/null
+++ b/R-package/tests/testthat/test_xgboost.R
@@ -0,0 +1,620 @@
+library(survival)
+library(data.table)
+
+test_that("Auto determine objective", {
+  y_num <- seq(1, 10)
+  res_num <- process.y.margin.and.objective(y_num, NULL, NULL, NULL)
+  expect_equal(res_num$params$objective, "reg:squarederror")
+
+  y_bin <- factor(c('a', 'b', 'a', 'b'), c('a', 'b'))
+  res_bin <- process.y.margin.and.objective(y_bin, NULL, NULL, NULL)
+  expect_equal(res_bin$params$objective, "binary:logistic")
+
+  y_multi <- factor(c('a', 'b', 'a', 'b', 'c'), c('a', 'b', 'c'))
+  res_multi <- process.y.margin.and.objective(y_multi, NULL, NULL, NULL)
+  expect_equal(res_multi$params$objective, "multi:softprob")
+
+  y_surv <- Surv(1:10, rep(c(0, 1), 5), type = "right")
+  res_surv <- process.y.margin.and.objective(y_surv, NULL, NULL, NULL)
+  expect_equal(res_surv$params$objective, "survival:aft")
+
+  y_multicol <- matrix(seq(1, 20), nrow = 5)
+  res_multicol <- process.y.margin.and.objective(y_multicol, NULL, NULL, NULL)
+  expect_equal(res_multicol$params$objective, "reg:squarederror")
+})
+
+test_that("Process vectors", {
+  y <- seq(1, 10)
+  for (y_inp in list(as.integer(y), as.numeric(y))) {
+    res <- process.y.margin.and.objective(y_inp, NULL, "reg:pseudohubererror", NULL)
+    expect_equal(
+      res$dmatrix_args$label,
+      y
+    )
+    expect_equal(
+      res$params$objective,
+      "reg:pseudohubererror"
+    )
+  }
+})
+
+test_that("Process factors", {
+  y_bin <- factor(c('a', 'b', 'a', 'b'), c('a', 'b'))
+  expect_error({
+    process.y.margin.and.objective(y_bin, NULL, "multi:softprob", NULL)
+  })
+  for (bin_obj in c("binary:logistic", "binary:hinge")) {
+    for (y_inp in list(y_bin, as.ordered(y_bin))) {
+      res_bin <- process.y.margin.and.objective(y_inp, NULL, bin_obj, NULL)
+      expect_equal(
+        res_bin$dmatrix_args$label,
+        c(0, 1, 0, 1)
+      )
+      expect_equal(
+        res_bin$metadata$y_levels,
+        c('a', 'b')
+      )
+      expect_equal(
+        res_bin$params$objective,
+        bin_obj
+      )
+    }
+  }
+
+  y_bin2 <- factor(c(1, 0, 1, 0), c(1, 0))
+  res_bin <- process.y.margin.and.objective(y_bin2, NULL, "binary:logistic", NULL)
+  expect_equal(
+    res_bin$dmatrix_args$label,
+    c(0, 1, 0, 1)
+  )
+  expect_equal(
+    res_bin$metadata$y_levels,
+    c("1", "0")
+  )
+
+  y_bin3 <- c(TRUE, FALSE, TRUE)
+  res_bin <- process.y.margin.and.objective(y_bin3, NULL, "binary:logistic", NULL)
+  expect_equal(
+    res_bin$dmatrix_args$label,
+    c(1, 0, 1)
+  )
+  expect_equal(
+    res_bin$metadata$y_levels,
+    c("FALSE", "TRUE")
+  )
+
+  y_multi <- factor(c('a', 'b', 'c', 'd', 'a', 'b'), c('a', 'b', 'c', 'd'))
+  expect_error({
+    process.y.margin.and.objective(y_multi, NULL, "binary:logistic", NULL)
+  })
+  res_multi <- process.y.margin.and.objective(y_multi, NULL, "multi:softprob", NULL)
+  expect_equal(
+    res_multi$dmatrix_args$label,
+    c(0, 1, 2, 3, 0, 1)
+  )
+  expect_equal(
+    res_multi$metadata$y_levels,
+    c('a', 'b', 'c', 'd')
+  )
+  expect_equal(
+    res_multi$params$num_class,
+    4
+  )
+  expect_equal(
+    res_multi$params$objective,
+    "multi:softprob"
+  )
+})
+
+test_that("Process survival objects", {
+  data(cancer, package = "survival")
+  y_right <- Surv(cancer$time, cancer$status - 1, type = "right")
+  res_cox <- process.y.margin.and.objective(y_right, NULL, "survival:cox", NULL)
+  expect_equal(
res_cox$dmatrix_args$label, + ifelse(cancer$status == 2, cancer$time, -cancer$time) + ) + expect_equal( + res_cox$params$objective, + "survival:cox" + ) + + res_aft <- process.y.margin.and.objective(y_right, NULL, "survival:aft", NULL) + expect_equal( + res_aft$dmatrix_args$label_lower_bound, + cancer$time + ) + expect_equal( + res_aft$dmatrix_args$label_upper_bound, + ifelse(cancer$status == 2, cancer$time, Inf) + ) + expect_equal( + res_aft$params$objective, + "survival:aft" + ) + + y_left <- Surv(seq(1, 4), c(1, 0, 1, 0), type = "left") + expect_error({ + process.y.margin.and.objective(y_left, NULL, "survival:cox", NULL) + }) + res_aft <- process.y.margin.and.objective(y_left, NULL, "survival:aft", NULL) + expect_equal( + res_aft$dmatrix_args$label_lower_bound, + c(1, 0, 3, 0) + ) + expect_equal( + res_aft$dmatrix_args$label_upper_bound, + seq(1, 4) + ) + expect_equal( + res_aft$params$objective, + "survival:aft" + ) + + y_interval <- Surv( + time = c(1, 5, 2, 10, 3), + time2 = c(2, 5, 2.5, 10, 3), + event = c(3, 1, 3, 0, 2), + type = "interval" + ) + expect_error({ + process.y.margin.and.objective(y_interval, NULL, "survival:cox", NULL) + }) + res_aft <- process.y.margin.and.objective(y_interval, NULL, "survival:aft", NULL) + expect_equal( + res_aft$dmatrix_args$label_lower_bound, + c(1, 5, 2, 10, 0) + ) + expect_equal( + res_aft$dmatrix_args$label_upper_bound, + c(2, 5, 2.5, Inf, 3) + ) + expect_equal( + res_aft$params$objective, + "survival:aft" + ) + + y_interval_neg <- Surv( + time = c(1, -5, 2, 10, 3), + time2 = c(2, -5, 2.5, 10, 3), + event = c(3, 1, 3, 0, 2), + type = "interval" + ) + expect_error({ + process.y.margin.and.objective(y_interval_neg, NULL, "survival:aft", NULL) + }) +}) + +test_that("Process multi-target", { + data(mtcars) + y_multi <- data.frame( + y1 = mtcars$mpg, + y2 = mtcars$mpg ^ 2 + ) + for (y_inp in list(y_multi, as.matrix(y_multi), data.table::as.data.table(y_multi))) { + res_multi <- process.y.margin.and.objective(y_inp, NULL, "reg:pseudohubererror", NULL) + expect_equal( + res_multi$dmatrix_args$label, + as.matrix(y_multi) + ) + expect_equal( + res_multi$metadata$y_names, + c("y1", "y2") + ) + expect_equal( + res_multi$params$objective, + "reg:pseudohubererror" + ) + } + + expect_error({ + process.y.margin.and.objective(y_multi, NULL, "count:poisson", NULL) + }) + + y_bad <- data.frame( + c1 = seq(1, 3), + c2 = rep(as.Date("2024-01-01"), 3) + ) + expect_error({ + process.y.margin.and.objective(y_bad, NULL, "reg:squarederror", NULL) + }) + + y_bad <- data.frame( + c1 = seq(1, 3), + c2 = factor(c('a', 'b', 'a'), c('a', 'b')) + ) + expect_error({ + process.y.margin.and.objective(y_bad, NULL, "reg:squarederror", NULL) + }) + + y_bad <- seq(1, 20) + dim(y_bad) <- c(5, 2, 2) + expect_error({ + process.y.margin.and.objective(y_bad, NULL, "reg:squarederror", NULL) + }) +}) + +test_that("Process base_margin", { + y <- seq(101, 110) + bm_good <- seq(1, 10) + for (bm in list(bm_good, as.matrix(bm_good), as.data.frame(as.matrix(bm_good)))) { + res <- process.y.margin.and.objective(y, bm, "reg:squarederror", NULL) + expect_equal( + res$dmatrix_args$base_margin, + seq(1, 10) + ) + } + expect_error({ + process.y.margin.and.objective(y, 5, "reg:squarederror", NULL) + }) + expect_error({ + process.y.margin.and.objective(y, seq(1, 5), "reg:squarederror", NULL) + }) + expect_error({ + process.y.margin.and.objective(y, matrix(seq(1, 20), ncol = 2), "reg:squarederror", NULL) + }) + expect_error({ + process.y.margin.and.objective( + y, + as.data.frame(matrix(seq(1, 20), ncol 
= 2)), + "reg:squarederror", + NULL + ) + }) + + y <- factor(c('a', 'b', 'c', 'a')) + bm_good <- matrix(seq(1, 12), ncol = 3) + for (bm in list(bm_good, as.data.frame(bm_good))) { + res <- process.y.margin.and.objective(y, bm, "multi:softprob", NULL) + expect_equal( + res$dmatrix_args$base_margin |> unname(), + matrix(seq(1, 12), ncol = 3) + ) + } + expect_error({ + process.y.margin.and.objective(y, as.numeric(bm_good), "multi:softprob", NULL) + }) + expect_error({ + process.y.margin.and.objective(y, 5, "multi:softprob", NULL) + }) + expect_error({ + process.y.margin.and.objective(y, bm_good[, 1], "multi:softprob", NULL) + }) + expect_error({ + process.y.margin.and.objective(y, bm_good[, c(1, 2)], "multi:softprob", NULL) + }) + expect_error({ + process.y.margin.and.objective(y, bm_good[c(1, 2), ], "multi:softprob", NULL) + }) + + y <- seq(101, 110) + bm_good <- matrix(seq(1, 30), ncol = 3) + params <- list(quantile_alpha = c(0.1, 0.5, 0.9)) + for (bm in list(bm_good, as.data.frame(bm_good))) { + res <- process.y.margin.and.objective(y, bm, "reg:quantileerror", params) + expect_equal( + res$dmatrix_args$base_margin |> unname(), + matrix(seq(1, 30), ncol = 3) + ) + } + expect_error({ + process.y.margin.and.objective(y, as.numeric(bm_good), "reg:quantileerror", params) + }) + expect_error({ + process.y.margin.and.objective(y, 5, "reg:quantileerror", params) + }) + expect_error({ + process.y.margin.and.objective(y, bm_good[, 1], "reg:quantileerror", params) + }) + expect_error({ + process.y.margin.and.objective(y, bm_good[, c(1, 2)], "reg:quantileerror", params) + }) + expect_error({ + process.y.margin.and.objective(y, bm_good[c(1, 2, 3), ], "reg:quantileerror", params) + }) + + y <- matrix(seq(101, 130), ncol = 3) + for (bm in list(bm_good, as.data.frame(bm_good))) { + res <- process.y.margin.and.objective(y, bm, "reg:squarederror", params) + expect_equal( + res$dmatrix_args$base_margin |> unname(), + matrix(seq(1, 30), ncol = 3) + ) + } + expect_error({ + process.y.margin.and.objective(y, as.numeric(bm_good), "reg:squarederror", params) + }) + expect_error({ + process.y.margin.and.objective(y, 5, "reg:squarederror", params) + }) + expect_error({ + process.y.margin.and.objective(y, bm_good[, 1], "reg:squarederror", params) + }) + expect_error({ + process.y.margin.and.objective(y, bm_good[, c(1, 2)], "reg:squarederror", params) + }) + expect_error({ + process.y.margin.and.objective(y, bm_good[c(1, 2, 3), ], "reg:squarederror", params) + }) +}) + +test_that("Process monotone constraints", { + data(iris) + mc_list <- list(Sepal.Width = 1) + res <- process.x.and.col.args( + iris, + monotone_constraints = mc_list, + interaction_constraints = NULL, + feature_weights = NULL, + lst_args = list(), + use_qdm = FALSE + ) + expect_equal( + res$params$monotone_constraints, + c(0, 1, 0, 0, 0) + ) + + mc_list2 <- list(Sepal.Width = 1, Petal.Width = -1) + res <- process.x.and.col.args( + iris, + monotone_constraints = mc_list2, + interaction_constraints = NULL, + feature_weights = NULL, + lst_args = list(), + use_qdm = FALSE + ) + expect_equal( + res$params$monotone_constraints, + c(0, 1, 0, -1, 0) + ) + + mc_vec <- c(0, 1, -1, 0, 0) + res <- process.x.and.col.args( + iris, + monotone_constraints = mc_vec, + interaction_constraints = NULL, + feature_weights = NULL, + lst_args = list(), + use_qdm = FALSE + ) + expect_equal( + res$params$monotone_constraints, + c(0, 1, -1, 0, 0) + ) + + mc_named_vec <- c(1, 1) + names(mc_named_vec) <- names(iris)[1:2] + res <- process.x.and.col.args( + iris, + 
monotone_constraints = mc_named_vec, + interaction_constraints = NULL, + feature_weights = NULL, + lst_args = list(), + use_qdm = FALSE + ) + expect_equal( + res$params$monotone_constraints, + c(1, 1, 0, 0, 0) + ) + + mc_named_all <- c(0, -1, 1, 0, -1) + names(mc_named_all) <- rev(names(iris)) + res <- process.x.and.col.args( + iris, + monotone_constraints = mc_named_all, + interaction_constraints = NULL, + feature_weights = NULL, + lst_args = list(), + use_qdm = FALSE + ) + expect_equal( + res$params$monotone_constraints, + rev(mc_named_all) |> unname() + ) + + expect_error({ + process.x.and.col.args( + iris, + monotone_constraints = list( + Sepal.Width = 1, + Petal.Width = -1, + Sepal.Width = -1 + ), + interaction_constraints = NULL, + feature_weights = NULL, + lst_args = list(), + use_qdm = FALSE + ) + }) + + expect_error({ + process.x.and.col.args( + iris, + monotone_constraints = rep(0, 6), + interaction_constraints = NULL, + feature_weights = NULL, + lst_args = list(), + use_qdm = FALSE + ) + }) +}) + +test_that("Process interaction_constraints", { + data(iris) + res <- process.x.and.col.args(iris, NULL, list(c(1L, 2L)), NULL, NULL, FALSE) + expect_equal( + res$params$interaction_constraints, + list(c(0, 1)) + ) + res <- process.x.and.col.args(iris, NULL, list(c(1.0, 2.0)), NULL, NULL, FALSE) + expect_equal( + res$params$interaction_constraints, + list(c(0, 1)) + ) + res <- process.x.and.col.args(iris, NULL, list(c(1, 2), c(3, 4)), NULL, NULL, FALSE) + expect_equal( + res$params$interaction_constraints, + list(c(0, 1), c(2, 3)) + ) + res <- process.x.and.col.args( + iris, NULL, list(c("Sepal.Length", "Sepal.Width")), NULL, NULL, FALSE + ) + expect_equal( + res$params$interaction_constraints, + list(c(0, 1)) + ) + res <- process.x.and.col.args( + as.matrix(iris), + NULL, + list(c("Sepal.Length", "Sepal.Width")), + NULL, + NULL, + FALSE + ) + expect_equal( + res$params$interaction_constraints, + list(c(0, 1)) + ) + res <- process.x.and.col.args( + iris, + NULL, + list(c("Sepal.Width", "Petal.Length"), c("Sepal.Length", "Petal.Width", "Species")), + NULL, + NULL, + FALSE + ) + expect_equal( + res$params$interaction_constraints, + list(c(1, 2), c(0, 3, 4)) + ) + + expect_error({ + process.x.and.col.args(iris, NULL, list(c(1L, 20L)), NULL, NULL, FALSE) + }) + expect_error({ + process.x.and.col.args(iris, NULL, list(c(0L, 2L)), NULL, NULL, FALSE) + }) + expect_error({ + process.x.and.col.args(iris, NULL, list(c("1", "2")), NULL, NULL, FALSE) + }) + expect_error({ + process.x.and.col.args(iris, NULL, list(c("Sepal", "Petal")), NULL, NULL, FALSE) + }) + expect_error({ + process.x.and.col.args(iris, NULL, c(1L, 2L), NULL, NULL, FALSE) + }) + expect_error({ + process.x.and.col.args(iris, NULL, matrix(c(1L, 2L)), NULL, NULL, FALSE) + }) + expect_error({ + process.x.and.col.args(iris, NULL, list(c(1, 2.5)), NULL, NULL, FALSE) + }) +}) + +test_that("Sparse matrices are casted to CSR for QDM", { + data(agaricus.test, package = "xgboost") + x <- agaricus.test$data + for (x_in in list(x, methods::as(x, "TsparseMatrix"))) { + res <- process.x.and.col.args( + x_in, + NULL, + NULL, + NULL, + NULL, + TRUE + ) + expect_s4_class(res$dmatrix_args$data, "dgRMatrix") + } +}) + +test_that("Process feature_weights", { + data(iris) + w_vector <- seq(1, 5) + res <- process.x.and.col.args( + iris, + monotone_constraints = NULL, + interaction_constraints = NULL, + feature_weights = w_vector, + lst_args = list(), + use_qdm = FALSE + ) + expect_equal( + res$dmatrix_args$feature_weights, + seq(1, 5) + ) + + 
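+  # A named vector is matched to the columns of 'x' by name, so the reversed
+  # names below are expected to come back re-ordered to the column order of 'iris'.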
w_named_vector <- seq(1, 5) + names(w_named_vector) <- rev(names(iris)) + res <- process.x.and.col.args( + iris, + monotone_constraints = NULL, + interaction_constraints = NULL, + feature_weights = w_named_vector, + lst_args = list(), + use_qdm = FALSE + ) + expect_equal( + res$dmatrix_args$feature_weights, + rev(seq(1, 5)) + ) + + w_list <- list( + Species = 5, + Sepal.Length = 1, + Sepal.Width = 2, + Petal.Length = 3, + Petal.Width = 4 + ) + res <- process.x.and.col.args( + iris, + monotone_constraints = NULL, + interaction_constraints = NULL, + feature_weights = w_list, + lst_args = list(), + use_qdm = FALSE + ) + expect_equal( + res$dmatrix_args$feature_weights, + seq(1, 5) + ) +}) + +test_that("Whole function works", { + data(cancer, package = "survival") + y <- Surv(cancer$time, cancer$status - 1, type = "right") + x <- as.data.table(cancer)[, -c("time", "status")] + model <- xgboost( + x, + y, + monotone_constraints = list(age = -1), + nthreads = 1L, + nrounds = 5L, + eta = 3 + ) + expect_equal( + attributes(model)$params$objective, + "survival:aft" + ) + expect_equal( + attributes(model)$metadata$n_targets, + 1L + ) + expect_equal( + attributes(model)$params$monotone_constraints, + "(0,-1,0,0,0,0,0,0)" + ) + expect_false( + "interaction_constraints" %in% names(attributes(model)$params) + ) + expect_equal( + attributes(model)$params$eta, + 3 + ) + txt <- capture.output({ + print(model) + }) + expect_true(any(grepl("Objective: survival:aft", txt, fixed = TRUE))) + expect_true(any(grepl("monotone_constraints", txt, fixed = TRUE))) + expect_true(any(grepl("Number of iterations: 5", txt, fixed = TRUE))) + expect_true(any(grepl("Number of features: 8", txt, fixed = TRUE))) +}) diff --git a/R-package/vignettes/discoverYourData.Rmd b/R-package/vignettes/discoverYourData.Rmd index 4b04f771f210..8347d0ee0a84 100644 --- a/R-package/vignettes/discoverYourData.Rmd +++ b/R-package/vignettes/discoverYourData.Rmd @@ -173,8 +173,9 @@ Build the model The code below is very usual. For more information, you can look at the documentation of `xgboost` function (or at the vignette [XGBoost presentation](https://github.com/dmlc/xgboost/blob/master/R-package/vignettes/xgboostPresentation.Rmd)). ```{r} -bst <- xgboost(data = sparse_matrix, label = output_vector, max_depth = 4, - eta = 1, nthread = 2, nrounds = 10, objective = "binary:logistic") +bst <- xgboost(x = sparse_matrix, y = output_vector, + params = list(max_depth = 4, eta = 1), + nthread = 2, nrounds = 10) ``` @@ -299,28 +300,28 @@ test <- agaricus.test #Random Forest - 1000 trees bst <- xgboost( - data = train$data, - label = train$label, - max_depth = 4, - num_parallel_tree = 1000, - subsample = 0.5, - colsample_bytree = 0.5, + x = train$data, + y = factor(train$label, levels = c(0, 1)), + params = list( + max_depth = 4, + num_parallel_tree = 1000, + subsample = 0.5, + colsample_bytree = 0.5 + ), nrounds = 1, - objective = "binary:logistic", nthread = 2 ) #Boosting - 3 rounds bst <- xgboost( - data = train$data, - label = train$label, - max_depth = 4, + x = train$data, + y = factor(train$label, levels = c(0, 1)), + params = list(max_depth = 4), nrounds = 3, - objective = "binary:logistic", nthread = 2 ) ``` -> Note that the parameter `round` is set to `1`. +> Note that the parameter `nrounds` is set to `1`. > [**Random Forests**](https://www.stat.berkeley.edu/~breiman/RandomForests/cc_papers.htm) is a trademark of Leo Breiman and Adele Cutler and is licensed exclusively to Salford Systems for the commercial release of the software. 
diff --git a/R-package/vignettes/xgboostPresentation.Rmd b/R-package/vignettes/xgboostPresentation.Rmd index fc49adc0fcee..d1ca4f2879a7 100644 --- a/R-package/vignettes/xgboostPresentation.Rmd +++ b/R-package/vignettes/xgboostPresentation.Rmd @@ -146,22 +146,19 @@ In a *sparse* matrix, cells containing `0` are not stored in memory. Therefore, We will train decision tree model using the following parameters: -* `objective = "binary:logistic"`: we will train a binary classification model ; +* `objective = "binary:logistic"`: we will train a binary classification model (note that this is set automatically when `y` is a `factor`) ; * `max_depth = 2`: the trees won't be deep, because our case is very simple ; * `nthread = 2`: the number of CPU threads we are going to use; * `nrounds = 2`: there will be two passes on the data, the second one will enhance the model by further reducing the difference between ground truth and prediction. ```{r trainingSparse, message=F, warning=F} bstSparse <- xgboost( - data = train$data - , label = train$label - , params = list( - max_depth = 2 - , eta = 1 - , nthread = 2 - , objective = "binary:logistic" - ) + x = train$data + , y = factor(train$label, levels = c(0, 1)) + , objective = "binary:logistic" + , params = list(max_depth = 2, eta = 1) , nrounds = 2 + , nthread = 2 ) ``` @@ -175,15 +172,11 @@ Alternatively, you can put your dataset in a *dense* matrix, i.e. a basic **R** ```{r trainingDense, message=F, warning=F} bstDense <- xgboost( - data = as.matrix(train$data), - label = train$label, - params = list( - max_depth = 2, - eta = 1, - nthread = 2, - objective = "binary:logistic" - ), - nrounds = 2 + x = as.matrix(train$data), + y = factor(train$label, levels = c(0, 1)), + params = list(max_depth = 2, eta = 1), + nrounds = 2, + nthread = 2 ) ``` @@ -193,7 +186,7 @@ bstDense <- xgboost( ```{r trainingDmatrix, message=F, warning=F} dtrain <- xgb.DMatrix(data = train$data, label = train$label, nthread = 2) -bstDMatrix <- xgboost( +bstDMatrix <- xgb.train( data = dtrain, params = list( max_depth = 2, @@ -213,7 +206,7 @@ One of the simplest way to see the training progress is to set the `verbose` opt ```{r trainingVerbose0, message=T, warning=F} # verbose = 0, no message -bst <- xgboost( +bst <- xgb.train( data = dtrain , params = list( max_depth = 2 @@ -228,7 +221,7 @@ bst <- xgboost( ```{r trainingVerbose1, message=T, warning=F} # verbose = 1, print evaluation metric -bst <- xgboost( +bst <- xgb.train( data = dtrain , params = list( max_depth = 2 @@ -243,7 +236,7 @@ bst <- xgboost( ```{r trainingVerbose2, message=T, warning=F} # verbose = 2, also print information about tree -bst <- xgboost( +bst <- xgb.train( data = dtrain , params = list( max_depth = 2 diff --git a/doc/tutorials/feature_interaction_constraint.rst b/doc/tutorials/feature_interaction_constraint.rst index b3d655584b95..7f26cd437325 100644 --- a/doc/tutorials/feature_interaction_constraint.rst +++ b/doc/tutorials/feature_interaction_constraint.rst @@ -178,9 +178,10 @@ parameter: Using feature name instead ************************** -XGBoost's Python package supports using feature names instead of feature index for +XGBoost's Python and R packages support using feature names instead of feature index for specifying the constraints. Given a data frame with columns ``["f0", "f1", "f2"]``, the -feature interaction constraint can be specified as ``[["f0", "f2"]]``. 
+feature interaction constraint can be specified as ``[["f0", "f2"]]`` (Python) or +``list(c("f0", "f2"))`` (R, when passing them to function ``xgboost()``). ************** Advanced topic diff --git a/doc/tutorials/monotonic.rst b/doc/tutorials/monotonic.rst index e663d1109689..6868e0a56037 100644 --- a/doc/tutorials/monotonic.rst +++ b/doc/tutorials/monotonic.rst @@ -97,7 +97,8 @@ Some other examples: Using feature names ******************* -XGBoost's Python package supports using feature names instead of feature index for +XGBoost's Python and R packages support using feature names instead of feature indices for specifying the constraints. Given a data frame with columns ``["f0", "f1", "f2"]``, the -monotonic constraint can be specified as ``{"f0": 1, "f2": -1}``, and ``"f1"`` will +monotonic constraint can be specified as ``{"f0": 1, "f2": -1}`` (Python) or as +``list(f0=1, f2=-1)`` (R, when using 'xgboost()', but not 'xgb.train'), and ``"f1"`` will default to ``0`` (no constraint). From bbd308595adc996e34ac9604fe52ca62f715e586 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Mon, 15 Jul 2024 20:21:25 +0800 Subject: [PATCH 33/51] [jvm-packages] Bump rapids version. (#10588) --- jvm-packages/pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml index fb6aaf020c3e..6b34a906dd60 100644 --- a/jvm-packages/pom.xml +++ b/jvm-packages/pom.xml @@ -44,8 +44,8 @@ 5 OFF OFF - 24.04.0 - 24.04.1 + 24.06.0 + 24.06.0 cuda12 3.2.18 2.12.0 From fa8fea145a6ace953d1f78b3ef81f2f12544ce44 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 15 Jul 2024 20:22:04 +0800 Subject: [PATCH 34/51] Bump scalatest.version from 3.2.18 to 3.2.19 in /jvm-packages/xgboost4j (#10535) Bumps `scalatest.version` from 3.2.18 to 3.2.19. Updates `org.scalatest:scalatest_2.12` from 3.2.18 to 3.2.19 - [Release notes](https://github.com/scalatest/scalatest/releases) - [Commits](https://github.com/scalatest/scalatest/compare/release-3.2.18...release-3.2.19) Updates `org.scalactic:scalactic_2.12` from 3.2.18 to 3.2.19 - [Release notes](https://github.com/scalatest/scalatest/releases) - [Commits](https://github.com/scalatest/scalatest/compare/release-3.2.18...release-3.2.19) --- updated-dependencies: - dependency-name: org.scalatest:scalatest_2.12 dependency-type: direct:development update-type: version-update:semver-patch - dependency-name: org.scalactic:scalactic_2.12 dependency-type: direct:development update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- jvm-packages/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml index 6b34a906dd60..3770ddba92c8 100644 --- a/jvm-packages/pom.xml +++ b/jvm-packages/pom.xml @@ -47,7 +47,7 @@ 24.06.0 24.06.0 cuda12 - 3.2.18 + 3.2.19 2.12.0 false From 370dce9d57b8ce16947e0b2f772ed20160058391 Mon Sep 17 00:00:00 2001 From: RektPunk <110188257+RektPunk@users.noreply.github.com> Date: Tue, 16 Jul 2024 00:35:42 +0900 Subject: [PATCH 35/51] [Doc] Fix CRAN badge in README [skip ci] (#10587) * Change http to https in Badges * Change all http to https --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index b27cce673585..1126a588d8ae 100644 --- a/README.md +++ b/README.md @@ -4,8 +4,8 @@ [![Build Status](https://badge.buildkite.com/aca47f40a32735c00a8550540c5eeff6a4c1d246a580cae9b0.svg?branch=master)](https://buildkite.com/xgboost/xgboost-ci) [![XGBoost-CI](https://github.com/dmlc/xgboost/workflows/XGBoost-CI/badge.svg?branch=master)](https://github.com/dmlc/xgboost/actions) [![Documentation Status](https://readthedocs.org/projects/xgboost/badge/?version=latest)](https://xgboost.readthedocs.org) -[![GitHub license](http://dmlc.github.io/img/apache2.svg)](./LICENSE) -[![CRAN Status Badge](http://www.r-pkg.org/badges/version/xgboost)](http://cran.r-project.org/web/packages/xgboost) +[![GitHub license](https://dmlc.github.io/img/apache2.svg)](./LICENSE) +[![CRAN Status Badge](https://www.r-pkg.org/badges/version/xgboost)](https://cran.r-project.org/web/packages/xgboost) [![PyPI version](https://badge.fury.io/py/xgboost.svg)](https://pypi.python.org/pypi/xgboost/) [![Conda version](https://img.shields.io/conda/vn/conda-forge/py-xgboost.svg)](https://anaconda.org/conda-forge/py-xgboost) [![Optuna](https://img.shields.io/badge/Optuna-integrated-blue)](https://optuna.org) @@ -35,7 +35,7 @@ Checkout the [Community Page](https://xgboost.ai/community). Reference --------- -- Tianqi Chen and Carlos Guestrin. [XGBoost: A Scalable Tree Boosting System](http://arxiv.org/abs/1603.02754). In 22nd SIGKDD Conference on Knowledge Discovery and Data Mining, 2016 +- Tianqi Chen and Carlos Guestrin. [XGBoost: A Scalable Tree Boosting System](https://arxiv.org/abs/1603.02754). In 22nd SIGKDD Conference on Knowledge Discovery and Data Mining, 2016 - XGBoost originates from research project at University of Washington. Sponsors From 5a92ffe3cab897f82cce9060bd9057eb178a79e9 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 16 Jul 2024 17:41:50 +0800 Subject: [PATCH 36/51] Partial fix for CTK 12.5 (#10574) --- src/tree/updater_gpu_common.cuh | 21 ++++++++------------- src/tree/updater_gpu_hist.cu | 1 + 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/src/tree/updater_gpu_common.cuh b/src/tree/updater_gpu_common.cuh index 1c3e6a55277d..5d999d6d6e01 100644 --- a/src/tree/updater_gpu_common.cuh +++ b/src/tree/updater_gpu_common.cuh @@ -1,18 +1,13 @@ -/*! 
- * Copyright 2017-2019 XGBoost contributors +/** + * Copyright 2017-2024, XGBoost contributors */ #pragma once -#include -#include -#include -#include -#include -#include -#include "../common/categorical.h" -#include "../common/device_helpers.cuh" -#include "../common/random.h" +#include // for numeric_limits +#include // for ostream + #include "gpu_hist/histogram.cuh" #include "param.h" +#include "xgboost/base.h" namespace xgboost::tree { struct GPUTrainingParam { @@ -54,8 +49,8 @@ enum DefaultDirection { }; struct DeviceSplitCandidate { - float loss_chg {-FLT_MAX}; - DefaultDirection dir {kLeftDir}; + float loss_chg{-std::numeric_limits::max()}; + DefaultDirection dir{kLeftDir}; int findex {-1}; float fvalue {0}; // categorical split, either it's the split category for OHE or the threshold for partition-based diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 5278b328acbc..19957857218d 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -19,6 +19,7 @@ #include "../common/cuda_context.cuh" // CUDAContext #include "../common/device_helpers.cuh" #include "../common/hist_util.h" +#include "../common/random.h" // for ColumnSampler, GlobalRandom #include "../common/timer.h" #include "../data/ellpack_page.cuh" #include "../data/ellpack_page.h" From a6a8a55ffab4eb518998c7f1d47bbff99086827c Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 16 Jul 2024 19:03:48 +0800 Subject: [PATCH 37/51] Merge approx tests. (#10583) --- tests/cpp/tree/test_approx.cc | 49 +++++++++++++++++++++++++ tests/cpp/tree/test_column_split.h | 8 +++- tests/cpp/tree/test_histmaker.cc | 59 ------------------------------ 3 files changed, 55 insertions(+), 61 deletions(-) delete mode 100644 tests/cpp/tree/test_histmaker.cc diff --git a/tests/cpp/tree/test_approx.cc b/tests/cpp/tree/test_approx.cc index 8f28bfa218c8..83e9243a2fa4 100644 --- a/tests/cpp/tree/test_approx.cc +++ b/tests/cpp/tree/test_approx.cc @@ -4,10 +4,12 @@ #include #include "../../../src/tree/common_row_partitioner.h" +#include "../../../src/tree/param.h" // for TrainParam #include "../collective/test_worker.h" // for TestDistributedGlobal #include "../helpers.h" #include "test_column_split.h" // for TestColumnSplit #include "test_partitioner.h" +#include "xgboost/tree_model.h" // for RegTree namespace xgboost::tree { namespace { @@ -76,6 +78,53 @@ TEST(Approx, Partitioner) { } } +TEST(Approx, InteractionConstraint) { + auto constexpr kRows = 32; + auto constexpr kCols = 16; + auto p_dmat = GenerateCatDMatrix(kRows, kCols, 0.6f, false); + Context ctx; + + linalg::Matrix gpair({kRows}, ctx.Device()); + gpair.Data()->Copy(GenerateRandomGradients(kRows)); + + ObjInfo task{ObjInfo::kRegression}; + { + // With constraints + RegTree tree{1, kCols}; + + std::unique_ptr updater{TreeUpdater::Create("grow_histmaker", &ctx, &task)}; + TrainParam param; + param.UpdateAllowUnknown( + Args{{"interaction_constraints", "[[0, 1]]"}, {"num_feature", std::to_string(kCols)}}); + std::vector> position(1); + updater->Configure(Args{}); + updater->Update(¶m, &gpair, p_dmat.get(), position, {&tree}); + + ASSERT_EQ(tree.NumExtraNodes(), 4); + ASSERT_EQ(tree[0].SplitIndex(), 1); + + ASSERT_EQ(tree[tree[0].LeftChild()].SplitIndex(), 0); + ASSERT_EQ(tree[tree[0].RightChild()].SplitIndex(), 0); + } + { + // Without constraints + RegTree tree{1u, kCols}; + + std::unique_ptr updater{TreeUpdater::Create("grow_histmaker", &ctx, &task)}; + std::vector> position(1); + TrainParam param; + param.Init(Args{}); + 
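+    // With no constraint configured, the updater is free to split on any
+    // feature at any depth, so more nodes are grown than in the constrained
+    // run above.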
updater->Configure(Args{}); + updater->Update(¶m, &gpair, p_dmat.get(), position, {&tree}); + + ASSERT_EQ(tree.NumExtraNodes(), 10); + ASSERT_EQ(tree[0].SplitIndex(), 1); + + ASSERT_NE(tree[tree[0].LeftChild()].SplitIndex(), 0); + ASSERT_NE(tree[tree[0].RightChild()].SplitIndex(), 0); + } +} + namespace { void TestColumnSplitPartitioner(size_t n_samples, size_t base_rowid, std::shared_ptr Xy, std::vector* hess, float min_value, float mid_value, diff --git a/tests/cpp/tree/test_column_split.h b/tests/cpp/tree/test_column_split.h index b03597f38681..eba452a15a1c 100644 --- a/tests/cpp/tree/test_column_split.h +++ b/tests/cpp/tree/test_column_split.h @@ -23,9 +23,13 @@ inline std::shared_ptr GenerateCatDMatrix(std::size_t rows, std::size_t for (size_t i = 0; i < ft.size(); ++i) { ft[i] = (i % 3 == 0) ? FeatureType::kNumerical : FeatureType::kCategorical; } - return RandomDataGenerator(rows, cols, 0.6f).Seed(3).Type(ft).MaxCategory(17).GenerateDMatrix(); + return RandomDataGenerator(rows, cols, sparsity) + .Seed(3) + .Type(ft) + .MaxCategory(17) + .GenerateDMatrix(); } else { - return RandomDataGenerator{rows, cols, 0.6f}.Seed(3).GenerateDMatrix(); + return RandomDataGenerator{rows, cols, sparsity}.Seed(3).GenerateDMatrix(); } } diff --git a/tests/cpp/tree/test_histmaker.cc b/tests/cpp/tree/test_histmaker.cc deleted file mode 100644 index 888790aa7c3c..000000000000 --- a/tests/cpp/tree/test_histmaker.cc +++ /dev/null @@ -1,59 +0,0 @@ -/** - * Copyright 2019-2024, XGBoost Contributors - */ -#include -#include -#include - -#include "../../../src/tree/param.h" // for TrainParam -#include "../helpers.h" -#include "test_column_split.h" // for GenerateCatDMatrix - -namespace xgboost::tree { -TEST(GrowHistMaker, InteractionConstraint) { - auto constexpr kRows = 32; - auto constexpr kCols = 16; - auto p_dmat = GenerateCatDMatrix(kRows, kCols, 0.0, false); - Context ctx; - - linalg::Matrix gpair({kRows}, ctx.Device()); - gpair.Data()->Copy(GenerateRandomGradients(kRows)); - - ObjInfo task{ObjInfo::kRegression}; - { - // With constraints - RegTree tree{1, kCols}; - - std::unique_ptr updater{TreeUpdater::Create("grow_histmaker", &ctx, &task)}; - TrainParam param; - param.UpdateAllowUnknown( - Args{{"interaction_constraints", "[[0, 1]]"}, {"num_feature", std::to_string(kCols)}}); - std::vector> position(1); - updater->Configure(Args{}); - updater->Update(¶m, &gpair, p_dmat.get(), position, {&tree}); - - ASSERT_EQ(tree.NumExtraNodes(), 4); - ASSERT_EQ(tree[0].SplitIndex(), 1); - - ASSERT_EQ(tree[tree[0].LeftChild()].SplitIndex(), 0); - ASSERT_EQ(tree[tree[0].RightChild()].SplitIndex(), 0); - } - { - // Without constraints - RegTree tree{1u, kCols}; - - std::unique_ptr updater{TreeUpdater::Create("grow_histmaker", &ctx, &task)}; - std::vector> position(1); - TrainParam param; - param.Init(Args{}); - updater->Configure(Args{}); - updater->Update(¶m, &gpair, p_dmat.get(), position, {&tree}); - - ASSERT_EQ(tree.NumExtraNodes(), 10); - ASSERT_EQ(tree[0].SplitIndex(), 1); - - ASSERT_NE(tree[tree[0].LeftChild()].SplitIndex(), 0); - ASSERT_NE(tree[tree[0].RightChild()].SplitIndex(), 0); - } -} -} // namespace xgboost::tree From ee8bb60bf1c581eaceb1c0afdc3861ea10d198fd Mon Sep 17 00:00:00 2001 From: Philip Hyunsu Cho Date: Wed, 17 Jul 2024 06:21:17 -0700 Subject: [PATCH 38/51] [CI] Reduce the frequency of dependabot PRs (#10593) --- .github/dependabot.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 06badec5f2e2..1a8098071ba3 100644 --- 
a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -12,7 +12,7 @@ updates: - package-ecosystem: "maven" directory: "/jvm-packages/xgboost4j" schedule: - interval: "daily" + interval: "monthly" - package-ecosystem: "maven" directory: "/jvm-packages/xgboost4j-gpu" schedule: @@ -24,7 +24,7 @@ updates: - package-ecosystem: "maven" directory: "/jvm-packages/xgboost4j-spark" schedule: - interval: "daily" + interval: "monthly" - package-ecosystem: "maven" directory: "/jvm-packages/xgboost4j-spark-gpu" schedule: From c41a657c4e389e830e8e03e8e88f499faad07efd Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 18 Jul 2024 01:35:21 +0800 Subject: [PATCH 39/51] Bump actions/setup-python from 5.1.0 to 5.1.1 (#10599) Bumps [actions/setup-python](https://github.com/actions/setup-python) from 5.1.0 to 5.1.1. - [Release notes](https://github.com/actions/setup-python/releases) - [Commits](https://github.com/actions/setup-python/compare/82c7e631bb3cdc910f68e0081d67478d79c6982d...39cd14951b08e74b54015e9e001cdefcf80e669f) --- updated-dependencies: - dependency-name: actions/setup-python dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/main.yml | 2 +- .github/workflows/python_tests.yml | 2 +- .github/workflows/r_tests.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 001e17b510a3..0408f358fba5 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -180,7 +180,7 @@ jobs: - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 with: submodules: 'true' - - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 + - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 with: python-version: "3.8" architecture: 'x64' diff --git a/.github/workflows/python_tests.yml b/.github/workflows/python_tests.yml index 83f0ad495fc3..e232cd754f7b 100644 --- a/.github/workflows/python_tests.yml +++ b/.github/workflows/python_tests.yml @@ -319,7 +319,7 @@ jobs: submodules: 'true' - name: Set up Python 3.8 - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 + uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 with: python-version: 3.8 diff --git a/.github/workflows/r_tests.yml b/.github/workflows/r_tests.yml index 9fb9d4684ad1..4298fd9c6a5d 100644 --- a/.github/workflows/r_tests.yml +++ b/.github/workflows/r_tests.yml @@ -84,7 +84,7 @@ jobs: key: ${{ runner.os }}-r-${{ matrix.config.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} - - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 + - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 with: python-version: "3.8" architecture: 'x64' From 919cfd9c8dd634b5ff4811bfd28684744734f13a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 18 Jul 2024 01:36:12 +0800 Subject: [PATCH 40/51] Bump actions/upload-artifact from 4.3.3 to 4.3.4 (#10600) Bumps [actions/upload-artifact](https://github.com/actions/upload-artifact) from 4.3.3 to 4.3.4. 
- [Release notes](https://github.com/actions/upload-artifact/releases) - [Commits](https://github.com/actions/upload-artifact/compare/65462800fd760344b1a7b4382951275a0abb4808...0b2256b8c012f0828dc542b3febcab082c67f72b) --- updated-dependencies: - dependency-name: actions/upload-artifact dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/scorecards.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml index 222700da4a58..a108f282214f 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -41,7 +41,7 @@ jobs: # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF # format to the repository Actions tab. - name: "Upload artifact" - uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # v4.3.3 + uses: actions/upload-artifact@0b2256b8c012f0828dc542b3febcab082c67f72b # v4.3.4 with: name: SARIF file path: results.sarif From 07732e02e55d662226f32190ba9c0abaae78cf61 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 18 Jul 2024 01:36:44 +0800 Subject: [PATCH 41/51] Bump com.fasterxml.jackson.core:jackson-databind (#10590) Bumps [com.fasterxml.jackson.core:jackson-databind](https://github.com/FasterXML/jackson) from 2.15.2 to 2.17.2. - [Commits](https://github.com/FasterXML/jackson/commits) --- updated-dependencies: - dependency-name: com.fasterxml.jackson.core:jackson-databind dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- jvm-packages/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml index 3770ddba92c8..a154f2d489ae 100644 --- a/jvm-packages/pom.xml +++ b/jvm-packages/pom.xml @@ -37,7 +37,7 @@ 4.13.2 3.5.1 3.5.1 - 2.15.2 + 2.17.2 2.12.18 2.12 3.4.0 From e9fbce9791469571481b59743bedf9d7d2a481a5 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 18 Jul 2024 03:33:01 +0800 Subject: [PATCH 42/51] Refactor `DeviceUVector`. (#10595) Create a wrapper instead of using inheritance to avoid inconsistent interface of the class. 
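To make the motivation concrete, here is a minimal, self-contained sketch of the same design in plain C++ (`BackendA`, `BackendB`, and `UVector` are hypothetical stand-ins, not XGBoost types): inheriting from either backend would leak its particular `resize()` signature into the public type, whereas a thin wrapper owns the backend as a member and exposes one uniform interface, hiding the signature difference behind a compile-time switch, much like the RMM/thrust split in the diff below.

```cpp
#include <cstddef>
#include <vector>

// Hypothetical stand-ins for the two real backends: a thrust-like vector
// whose resize() takes only a size, and an rmm-like vector whose resize()
// additionally needs a stream argument.
struct StreamView {};

template <typename T>
class BackendA {
 public:
  void resize(std::size_t n) { data_.resize(n); }
  [[nodiscard]] std::size_t size() const { return data_.size(); }

 private:
  std::vector<T> data_;
};

template <typename T>
class BackendB {
 public:
  void resize(std::size_t n, StreamView) { data_.resize(n); }
  [[nodiscard]] std::size_t size() const { return data_.size(); }

 private:
  std::vector<T> data_;
};

// The wrapper owns one backend as a member instead of inheriting from it, so
// callers always see the same resize(n)/size() interface; which backend is
// used, and how its resize() must be called, is an internal detail.
template <typename T>
class UVector {
 public:
  void resize(std::size_t n) {
#if defined(USE_BACKEND_B)
    data_.resize(n, StreamView{});
#else
    data_.resize(n);
#endif
  }
  [[nodiscard]] std::size_t size() const { return data_.size(); }

 private:
#if defined(USE_BACKEND_B)
  BackendB<T> data_;
#else
  BackendA<T> data_;
#endif
};

int main() {
  UVector<int> vec;
  vec.resize(12);  // compiles unchanged against either backend
  return vec.size() == 12 ? 0 : 1;
}
```

The patch below applies the same idea to `dh::DeviceUVector`, which previously inherited from `rmm::device_uvector` or `thrust::device_vector` depending on the build.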
--- src/common/device_helpers.cuh | 2 +- src/common/device_vector.cuh | 71 ++++++++++++++++---------- src/common/host_device_vector.cu | 27 +++++----- tests/cpp/common/test_device_vector.cu | 2 +- 4 files changed, 58 insertions(+), 44 deletions(-) diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index 1754c9507036..98a76d72a263 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -510,7 +510,7 @@ xgboost::common::Span ToSpan(thrust::device_vector& vec, template xgboost::common::Span ToSpan(DeviceUVector &vec) { - return {thrust::raw_pointer_cast(vec.data()), vec.size()}; + return {vec.data(), vec.size()}; } // thrust begin, similiar to std::begin diff --git a/src/common/device_vector.cuh b/src/common/device_vector.cuh index 35386856cc9c..2587ce719780 100644 --- a/src/common/device_vector.cuh +++ b/src/common/device_vector.cuh @@ -284,47 +284,64 @@ class LoggingResource : public rmm::mr::device_memory_resource { LoggingResource *GlobalLoggingResource(); +#endif // defined(XGBOOST_USE_RMM) + /** - * @brief Container class that doesn't initialize the data. + * @brief Container class that doesn't initialize the data when RMM is used. */ template -class DeviceUVector : public rmm::device_uvector { - using Super = rmm::device_uvector; +class DeviceUVector { + private: +#if defined(XGBOOST_USE_RMM) + rmm::device_uvector data_{0, rmm::cuda_stream_per_thread, GlobalLoggingResource()}; +#else + ::dh::device_vector data_; +#endif // defined(XGBOOST_USE_RMM) public: - DeviceUVector() : Super{0, rmm::cuda_stream_per_thread, GlobalLoggingResource()} {} + using value_type = T; // NOLINT + using pointer = value_type *; // NOLINT + using const_pointer = value_type const *; // NOLINT + using reference = value_type &; // NOLINT + using const_reference = value_type const &; // NOLINT - void Resize(std::size_t n) { Super::resize(n, rmm::cuda_stream_per_thread); } - void Resize(std::size_t n, T const &v) { + public: + DeviceUVector() = default; + DeviceUVector(DeviceUVector const &that) = delete; + DeviceUVector &operator=(DeviceUVector const &that) = delete; + DeviceUVector(DeviceUVector &&that) = default; + DeviceUVector &operator=(DeviceUVector &&that) = default; + + void resize(std::size_t n) { // NOLINT +#if defined(XGBOOST_USE_RMM) + data_.resize(n, rmm::cuda_stream_per_thread); +#else + data_.resize(n); +#endif + } + void resize(std::size_t n, T const &v) { // NOLINT +#if defined(XGBOOST_USE_RMM) auto orig = this->size(); - Super::resize(n, rmm::cuda_stream_per_thread); + data_.resize(n, rmm::cuda_stream_per_thread); if (orig < n) { thrust::fill(rmm::exec_policy_nosync{}, this->begin() + orig, this->end(), v); } +#else + data_.resize(n, v); +#endif } + [[nodiscard]] std::size_t size() const { return data_.size(); } // NOLINT - private: - // undefined private, cannot be accessed. - void resize(std::size_t n, rmm::cuda_stream_view stream); // NOLINT -}; - -#else + [[nodiscard]] auto begin() { return data_.begin(); } // NOLINT + [[nodiscard]] auto end() { return data_.end(); } // NOLINT -/** - * @brief Without RMM, the initialization will happen. 
- */ -template -class DeviceUVector : public thrust::device_vector> { - using Super = thrust::device_vector>; + [[nodiscard]] auto begin() const { return this->cbegin(); } // NOLINT + [[nodiscard]] auto end() const { return this->cend(); } // NOLINT - public: - void Resize(std::size_t n) { Super::resize(n); } - void Resize(std::size_t n, T const &v) { Super::resize(n, v); } + [[nodiscard]] auto cbegin() const { return data_.cbegin(); } // NOLINT + [[nodiscard]] auto cend() const { return data_.cend(); } // NOLINT - private: - // undefined private, cannot be accessed. - void resize(std::size_t n, T const &v = T{}); // NOLINT + [[nodiscard]] auto data() { return thrust::raw_pointer_cast(data_.data()); } // NOLINT + [[nodiscard]] auto data() const { return thrust::raw_pointer_cast(data_.data()); } // NOLINT }; - -#endif // defined(XGBOOST_USE_RMM) } // namespace dh diff --git a/src/common/host_device_vector.cu b/src/common/host_device_vector.cu index 16a1aa027f09..00055ec69a7e 100644 --- a/src/common/host_device_vector.cu +++ b/src/common/host_device_vector.cu @@ -29,7 +29,7 @@ class HostDeviceVectorImpl { if (device.IsCUDA()) { gpu_access_ = GPUAccess::kWrite; SetDevice(); - data_d_->Resize(size, v); + data_d_->resize(size, v); } else { data_h_.resize(size, v); } @@ -67,12 +67,12 @@ class HostDeviceVectorImpl { T* DevicePointer() { LazySyncDevice(GPUAccess::kWrite); - return thrust::raw_pointer_cast(data_d_->data()); + return data_d_->data(); } const T* ConstDevicePointer() { LazySyncDevice(GPUAccess::kRead); - return thrust::raw_pointer_cast(data_d_->data()); + return data_d_->data(); } common::Span DeviceSpan() { @@ -181,7 +181,7 @@ class HostDeviceVectorImpl { gpu_access_ = GPUAccess::kWrite; SetDevice(); auto old_size = data_d_->size(); - data_d_->Resize(new_size, std::forward(args)...); + data_d_->resize(new_size, std::forward(args)...); } else { // resize on host LazySyncHost(GPUAccess::kNone); @@ -200,8 +200,8 @@ class HostDeviceVectorImpl { gpu_access_ = access; if (data_h_.size() != data_d_->size()) { data_h_.resize(data_d_->size()); } SetDevice(); - dh::safe_cuda(cudaMemcpy(data_h_.data(), thrust::raw_pointer_cast(data_d_->data()), - data_d_->size() * sizeof(T), cudaMemcpyDeviceToHost)); + dh::safe_cuda(cudaMemcpy(data_h_.data(), data_d_->data(), data_d_->size() * sizeof(T), + cudaMemcpyDeviceToHost)); } void LazySyncDevice(GPUAccess access) { @@ -214,9 +214,8 @@ class HostDeviceVectorImpl { // data is on the host LazyResizeDevice(data_h_.size()); SetDevice(); - dh::safe_cuda(cudaMemcpyAsync(thrust::raw_pointer_cast(data_d_->data()), data_h_.data(), - data_d_->size() * sizeof(T), cudaMemcpyHostToDevice, - dh::DefaultStream())); + dh::safe_cuda(cudaMemcpyAsync(data_d_->data(), data_h_.data(), data_d_->size() * sizeof(T), + cudaMemcpyHostToDevice, dh::DefaultStream())); gpu_access_ = access; } @@ -241,8 +240,7 @@ class HostDeviceVectorImpl { LazyResizeDevice(Size()); gpu_access_ = GPUAccess::kWrite; SetDevice(); - dh::safe_cuda(cudaMemcpyAsync(thrust::raw_pointer_cast(data_d_->data()), - thrust::raw_pointer_cast(other->data_d_->data()), + dh::safe_cuda(cudaMemcpyAsync(data_d_->data(), other->data_d_->data(), data_d_->size() * sizeof(T), cudaMemcpyDefault, dh::DefaultStream())); } @@ -252,15 +250,14 @@ class HostDeviceVectorImpl { LazyResizeDevice(Size()); gpu_access_ = GPUAccess::kWrite; SetDevice(); - dh::safe_cuda(cudaMemcpyAsync(thrust::raw_pointer_cast(data_d_->data()), begin, - data_d_->size() * sizeof(T), cudaMemcpyDefault, - dh::DefaultStream())); + 
dh::safe_cuda(cudaMemcpyAsync(data_d_->data(), begin, data_d_->size() * sizeof(T), + cudaMemcpyDefault, dh::DefaultStream())); } void LazyResizeDevice(size_t new_size) { if (data_d_ && new_size == data_d_->size()) { return; } SetDevice(); - data_d_->Resize(new_size); + data_d_->resize(new_size); } void SetDevice() { diff --git a/tests/cpp/common/test_device_vector.cu b/tests/cpp/common/test_device_vector.cu index 95da4ef3f167..c6a8c0ab95ce 100644 --- a/tests/cpp/common/test_device_vector.cu +++ b/tests/cpp/common/test_device_vector.cu @@ -12,7 +12,7 @@ TEST(DeviceUVector, Basic) { std::int32_t verbosity{3}; std::swap(verbosity, xgboost::GlobalConfigThreadLocalStore::Get()->verbosity); DeviceUVector uvec; - uvec.Resize(12); + uvec.resize(12); auto peak = GlobalMemoryLogger().PeakMemory(); auto n_bytes = sizeof(decltype(uvec)::value_type) * uvec.size(); ASSERT_EQ(peak, n_bytes); From 292bb677e53358b42114d844975dce5dede25c1a Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 18 Jul 2024 08:20:21 +0800 Subject: [PATCH 43/51] [EM] Support mmap backed ellpack. (#10602) - Support resource view in ellpack. - Define the CUDA version of MMAP resource. - Define the CUDA version of malloc resource. - Refactor cuda runtime API wrappers, and add memory access related wrappers. - gather windows macros into a single header. --- R-package/src/Makevars.in | 1 + R-package/src/Makevars.win | 1 + include/xgboost/base.h | 2 + include/xgboost/collective/poll_utils.h | 7 +- include/xgboost/collective/socket.h | 28 +--- include/xgboost/windefs.h | 33 ++++ src/c_api/c_api_error.h | 5 +- src/cli_main.cc | 17 +- src/collective/tracker.cc | 7 +- src/common/common.cc | 7 +- src/common/common.cu | 40 ++--- src/common/common.h | 38 ++--- src/common/cuda_rt_utils.cc | 86 ++++++++++ src/common/cuda_rt_utils.h | 21 +++ src/common/device_helpers.cuh | 26 +-- src/common/io.cc | 132 ++++++--------- src/common/io.h | 75 ++++++++- src/common/ref_resource_view.cuh | 26 +++ src/common/ref_resource_view.h | 18 +-- src/common/resource.cu | 43 +++++ src/common/resource.cuh | 54 +++++++ src/context.cc | 7 +- src/data/ellpack_page.cu | 150 ++++++++---------- src/data/ellpack_page.cuh | 101 +++++++----- src/data/ellpack_page_raw_format.cu | 97 ++++++----- src/data/ellpack_page_raw_format.h | 7 +- src/data/ellpack_page_source.cu | 56 +++++-- src/data/ellpack_page_source.h | 45 +++++- src/data/gradient_index.cu | 6 +- src/data/histogram_cut_format.h | 49 ------ src/data/iterative_dmatrix.cu | 15 +- src/data/sparse_page_source.h | 2 +- src/gbm/gblinear.cc | 4 +- src/gbm/gbtree.cc | 8 +- src/learner.cc | 2 +- src/predictor/gpu_predictor.cu | 1 + src/tree/fit_stump.cu | 3 - src/tree/fit_stump.h | 16 +- src/tree/gpu_hist/gradient_based_sampler.cu | 22 +-- src/tree/gpu_hist/gradient_based_sampler.cuh | 22 ++- src/tree/updater_gpu_hist.cu | 23 ++- tests/cpp/collective/test_worker.h | 2 +- tests/cpp/common/test_host_device_vector.cu | 3 +- tests/cpp/common/test_ref_resource_view.cc | 11 +- tests/cpp/data/test_ellpack_page.cu | 63 ++++---- .../cpp/data/test_ellpack_page_raw_format.cu | 25 ++- tests/cpp/data/test_iterative_dmatrix.cu | 37 ++--- tests/cpp/data/test_sparse_page_dmatrix.cu | 25 +-- tests/cpp/filesystem.h | 9 +- tests/cpp/helpers.h | 5 +- tests/cpp/histogram_helpers.h | 4 +- tests/cpp/objective/test_aft_obj.cc | 3 +- .../plugin/federated/test_federated_coll.cu | 2 +- .../federated/test_federated_comm_group.cc | 4 +- .../federated/test_federated_comm_group.cu | 3 +- tests/cpp/test_context.cu | 4 +- 
.../gpu_hist/test_gradient_based_sampler.cu | 16 +- tests/cpp/tree/gpu_hist/test_histogram.cu | 3 +- tests/cpp/tree/test_gpu_hist.cu | 9 +- 59 files changed, 887 insertions(+), 644 deletions(-) create mode 100644 include/xgboost/windefs.h create mode 100644 src/common/cuda_rt_utils.cc create mode 100644 src/common/cuda_rt_utils.h create mode 100644 src/common/ref_resource_view.cuh create mode 100644 src/common/resource.cu create mode 100644 src/common/resource.cuh delete mode 100644 src/data/histogram_cut_format.h diff --git a/R-package/src/Makevars.in b/R-package/src/Makevars.in index 5fbf479c5640..ed4b38f99ea7 100644 --- a/R-package/src/Makevars.in +++ b/R-package/src/Makevars.in @@ -113,6 +113,7 @@ OBJECTS= \ $(PKGROOT)/src/common/charconv.o \ $(PKGROOT)/src/common/column_matrix.o \ $(PKGROOT)/src/common/common.o \ + $(PKGROOT)/src/common/cuda_rt_utils.o \ $(PKGROOT)/src/common/error_msg.o \ $(PKGROOT)/src/common/hist_util.o \ $(PKGROOT)/src/common/host_device_vector.o \ diff --git a/R-package/src/Makevars.win b/R-package/src/Makevars.win index a5a5c131edf3..d4330120f890 100644 --- a/R-package/src/Makevars.win +++ b/R-package/src/Makevars.win @@ -113,6 +113,7 @@ OBJECTS= \ $(PKGROOT)/src/common/charconv.o \ $(PKGROOT)/src/common/column_matrix.o \ $(PKGROOT)/src/common/common.o \ + $(PKGROOT)/src/common/cuda_rt_utils.o \ $(PKGROOT)/src/common/error_msg.o \ $(PKGROOT)/src/common/hist_util.o \ $(PKGROOT)/src/common/host_device_vector.o \ diff --git a/include/xgboost/base.h b/include/xgboost/base.h index 9abe72b87859..64aab5c41b0c 100644 --- a/include/xgboost/base.h +++ b/include/xgboost/base.h @@ -7,6 +7,8 @@ #define XGBOOST_BASE_H_ #include // for omp_uint, omp_ulong +// Put the windefs here to guard as many files as possible. +#include #include // for int32_t, uint64_t, int16_t #include // for ostream diff --git a/include/xgboost/collective/poll_utils.h b/include/xgboost/collective/poll_utils.h index a4d2fbacda27..41b674964efc 100644 --- a/include/xgboost/collective/poll_utils.h +++ b/include/xgboost/collective/poll_utils.h @@ -4,13 +4,14 @@ * \author Tianqi Chen */ #pragma once -#include "xgboost/collective/result.h" -#include "xgboost/collective/socket.h" +#include +#include #if defined(_WIN32) +#include +// Socket API #include #include - #else #include diff --git a/include/xgboost/collective/socket.h b/include/xgboost/collective/socket.h index c5dd977f6255..bf5fffdaf155 100644 --- a/include/xgboost/collective/socket.h +++ b/include/xgboost/collective/socket.h @@ -1,12 +1,8 @@ /** - * Copyright (c) 2022-2024, XGBoost Contributors + * Copyright 2022-2024, XGBoost Contributors */ #pragma once -#if !defined(NOMINMAX) && defined(_WIN32) -#define NOMINMAX -#endif // !defined(NOMINMAX) - #include // errno, EINTR, EBADF #include // HOST_NAME_MAX #include // std::size_t @@ -18,18 +14,12 @@ #if defined(__linux__) #include // for TIOCOUTQ, FIONREAD -#endif // defined(__linux__) - -#if !defined(xgboost_IS_MINGW) - -#if defined(__MINGW32__) -#define xgboost_IS_MINGW 1 -#endif // defined(__MINGW32__) - -#endif // xgboost_IS_MINGW +#endif // defined(__linux__) #if defined(_WIN32) - +// Guard the include. 
+#include +// Socket API #include #include @@ -41,9 +31,9 @@ using in_port_t = std::uint16_t; #if !defined(xgboost_IS_MINGW) using ssize_t = int; -#endif // !xgboost_IS_MINGW() +#endif // !xgboost_IS_MINGW() -#else // UNIX +#else // UNIX #include // inet_ntop #include // fcntl, F_GETFL, O_NONBLOCK @@ -839,7 +829,3 @@ Result INetNToP(H const &host, std::string *p_out) { } // namespace xgboost #undef xgboost_CHECK_SYS_CALL - -#if defined(xgboost_IS_MINGW) -#undef xgboost_IS_MINGW -#endif diff --git a/include/xgboost/windefs.h b/include/xgboost/windefs.h new file mode 100644 index 000000000000..e7e743184a17 --- /dev/null +++ b/include/xgboost/windefs.h @@ -0,0 +1,33 @@ +/** + * Copyright 2024, XGBoost Contributors + * + * @brief Macro for Windows. + */ +#pragma once + +#if !defined(xgboost_IS_WIN) + +#if defined(_MSC_VER) || defined(__MINGW32__) +#define xgboost_IS_WIN 1 +#endif // defined(_MSC_VER) || defined(__MINGW32__) + +#endif // !defined(xgboost_IS_WIN) + +#if defined(xgboost_IS_WIN) + +#if !defined(NOMINMAX) +#define NOMINMAX +#endif // !defined(NOMINMAX) + +// A macro used inside `windows.h` to avoid conflicts with `winsock2.h` +#define WIN32_LEAN_AND_MEAN + +#if !defined(xgboost_IS_MINGW) + +#if defined(__MINGW32__) +#define xgboost_IS_MINGW 1 +#endif // defined(__MINGW32__) + +#endif // xgboost_IS_MINGW + +#endif // defined(xgboost_IS_WIN) diff --git a/src/c_api/c_api_error.h b/src/c_api/c_api_error.h index 0ad4ac073dbd..a1928e6b14d7 100644 --- a/src/c_api/c_api_error.h +++ b/src/c_api/c_api_error.h @@ -7,10 +7,9 @@ #define XGBOOST_C_API_C_API_ERROR_H_ #include -#include -#include "c_api_utils.h" -#include "xgboost/collective/result.h" +#include "c_api_utils.h" // for XGBoostAPIGuard +#include "xgboost/logging.h" /*! \brief macro to guard beginning and end section of all functions */ #ifdef LOG_CAPI_INVOCATION diff --git a/src/cli_main.cc b/src/cli_main.cc index 54a3450276f4..1c388cf845c2 100644 --- a/src/cli_main.cc +++ b/src/cli_main.cc @@ -4,29 +4,26 @@ * \brief The command line interface program of xgboost. * This file is not included in dynamic library. 
*/ -#if !defined(NOMINMAX) && defined(_WIN32) -#define NOMINMAX -#endif // !defined(NOMINMAX) - #include - -#include +#include #include #include +#include #include #include -#include -#include -#include #include #include +#include +#include +#include #include + +#include "c_api/c_api_utils.h" #include "common/common.h" #include "common/config.h" #include "common/io.h" #include "common/version.h" -#include "c_api/c_api_utils.h" namespace xgboost { enum CLITask { diff --git a/src/collective/tracker.cc b/src/collective/tracker.cc index 6cb3601db7f4..bbc7a7c5a730 100644 --- a/src/collective/tracker.cc +++ b/src/collective/tracker.cc @@ -7,11 +7,10 @@ #include // socket, AF_INET6, AF_INET, connect, getsockname #endif // defined(__unix__) || defined(__APPLE__) -#if !defined(NOMINMAX) && defined(_WIN32) -#define NOMINMAX -#endif // !defined(NOMINMAX) - #if defined(_WIN32) +// Guard the include +#include +// Socket API #include #include #endif // defined(_WIN32) diff --git a/src/common/common.cc b/src/common/common.cc index 086f4c00d167..10a667070da9 100644 --- a/src/common/common.cc +++ b/src/common/common.cc @@ -1,5 +1,5 @@ /** - * Copyright 2015-2023 by Contributors + * Copyright 2015-2024, XGBoost Contributors */ #include "common.h" @@ -54,9 +54,4 @@ void EscapeU8(std::string const &string, std::string *p_buffer) { } } } - -#if !defined(XGBOOST_USE_CUDA) -int AllVisibleGPUs() { return 0; } -#endif // !defined(XGBOOST_USE_CUDA) - } // namespace xgboost::common diff --git a/src/common/common.cu b/src/common/common.cu index b6965904a2b0..958f93779308 100644 --- a/src/common/common.cu +++ b/src/common/common.cu @@ -1,29 +1,21 @@ -/*! - * Copyright 2018-2022 XGBoost contributors +/** + * Copyright 2018-2024, XGBoost contributors */ -#include "common.h" - -namespace xgboost { -namespace common { +#include +#include -void SetDevice(std::int32_t device) { - if (device >= 0) { - dh::safe_cuda(cudaSetDevice(device)); - } -} +#include "common.h" -int AllVisibleGPUs() { - int n_visgpus = 0; - try { - // When compiled with CUDA but running on CPU only device, - // cudaGetDeviceCount will fail. - dh::safe_cuda(cudaGetDeviceCount(&n_visgpus)); - } catch (const dmlc::Error &) { - cudaGetLastError(); // reset error. - return 0; +namespace dh { +void ThrowOnCudaError(cudaError_t code, const char *file, int line) { + if (code != cudaSuccess) { + std::string f; + if (file != nullptr) { + f = file; + } + LOG(FATAL) << thrust::system_error(code, thrust::cuda_category(), + f + ": " + std::to_string(line)) + .what(); } - return n_visgpus; } - -} // namespace common -} // namespace xgboost +} // namespace dh diff --git a/src/common/common.h b/src/common/common.h index 950dee5210b1..93151670b7be 100644 --- a/src/common/common.h +++ b/src/common/common.h @@ -1,5 +1,5 @@ /** - * Copyright 2015-2023 by XGBoost Contributors + * Copyright 2015-2024, XGBoost Contributors * \file common.h * \brief Common utilities */ @@ -19,9 +19,8 @@ #include "xgboost/base.h" // for XGBOOST_DEVICE #include "xgboost/logging.h" // for LOG, LOG_FATAL, LogMessageFatal +// magic to define functions based on the compiler. 
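+// WITH_CUDA() evaluates to true only when the translation unit is compiled by
+// a CUDA compiler (which defines __CUDACC__), and to false otherwise.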
#if defined(__CUDACC__) -#include -#include #define WITH_CUDA() true @@ -31,23 +30,20 @@ #endif // defined(__CUDACC__) +#if defined(XGBOOST_USE_CUDA) +#include +#endif + namespace dh { -#if defined(__CUDACC__) +#if defined(XGBOOST_USE_CUDA) /* - * Error handling functions + * Error handling functions */ +void ThrowOnCudaError(cudaError_t code, const char *file, int line); + #define safe_cuda(ans) ThrowOnCudaError((ans), __FILE__, __LINE__) -inline cudaError_t ThrowOnCudaError(cudaError_t code, const char *file, - int line) { - if (code != cudaSuccess) { - LOG(FATAL) << thrust::system_error(code, thrust::cuda_category(), - std::string{file} + ": " + // NOLINT - std::to_string(line)).what(); - } - return code; -} -#endif // defined(__CUDACC__) +#endif // defined(XGBOOST_USE_CUDA) } // namespace dh namespace xgboost::common { @@ -167,8 +163,6 @@ class Range { Iterator end_; }; -int AllVisibleGPUs(); - inline void AssertGPUSupport() { #ifndef XGBOOST_USE_CUDA LOG(FATAL) << "XGBoost version not compiled with GPU support."; @@ -187,16 +181,6 @@ inline void AssertSYCLSupport() { #endif // XGBOOST_USE_SYCL } -void SetDevice(std::int32_t device); - -#if !defined(XGBOOST_USE_CUDA) -inline void SetDevice(std::int32_t device) { - if (device >= 0) { - AssertGPUSupport(); - } -} -#endif - /** * @brief Last index of a group in a CSR style of index pointer. */ diff --git a/src/common/cuda_rt_utils.cc b/src/common/cuda_rt_utils.cc new file mode 100644 index 000000000000..d41981d8fb18 --- /dev/null +++ b/src/common/cuda_rt_utils.cc @@ -0,0 +1,86 @@ +/** + * Copyright 2015-2024, XGBoost Contributors + */ +#include "cuda_rt_utils.h" + +#if defined(XGBOOST_USE_CUDA) +#include +#endif // defined(XGBOOST_USE_CUDA) + +#include // for int32_t + +#include "common.h" // for safe_cuda + +namespace xgboost::common { +#if defined(XGBOOST_USE_CUDA) +std::int32_t AllVisibleGPUs() { + int n_visgpus = 0; + try { + // When compiled with CUDA but running on CPU only device, + // cudaGetDeviceCount will fail. + dh::safe_cuda(cudaGetDeviceCount(&n_visgpus)); + } catch (const dmlc::Error &) { + cudaGetLastError(); // reset error. + return 0; + } + return n_visgpus; +} + +std::int32_t CurrentDevice() { + std::int32_t device = 0; + dh::safe_cuda(cudaGetDevice(&device)); + return device; +} + +// alternatively: `nvidia-smi -q | grep Addressing` +bool SupportsPageableMem() { + std::int32_t res{0}; + dh::safe_cuda(cudaDeviceGetAttribute(&res, cudaDevAttrPageableMemoryAccess, CurrentDevice())); + return res == 1; +} + +bool SupportsAts() { + std::int32_t res{0}; + dh::safe_cuda(cudaDeviceGetAttribute(&res, cudaDevAttrPageableMemoryAccessUsesHostPageTables, + CurrentDevice())); + return res == 1; +} + +void CheckComputeCapability() { + for (std::int32_t d_idx = 0; d_idx < AllVisibleGPUs(); ++d_idx) { + cudaDeviceProp prop; + dh::safe_cuda(cudaGetDeviceProperties(&prop, d_idx)); + std::ostringstream oss; + oss << "CUDA Capability Major/Minor version number: " << prop.major << "." << prop.minor + << " is insufficient. 
Need >=3.5"; + int failed = prop.major < 3 || (prop.major == 3 && prop.minor < 5); + if (failed) LOG(WARNING) << oss.str() << " for device: " << d_idx; + } +} + +void SetDevice(std::int32_t device) { + if (device >= 0) { + dh::safe_cuda(cudaSetDevice(device)); + } +} +#else +std::int32_t AllVisibleGPUs() { return 0; } + +std::int32_t CurrentDevice() { + AssertGPUSupport(); + return -1; +} + +bool SupportsPageableMem() { return false; } + +bool SupportsAts() { return false; } + +void CheckComputeCapability() {} + +void SetDevice(std::int32_t device) { + if (device >= 0) { + AssertGPUSupport(); + } +} +#endif // !defined(XGBOOST_USE_CUDA) +} // namespace xgboost::common diff --git a/src/common/cuda_rt_utils.h b/src/common/cuda_rt_utils.h new file mode 100644 index 000000000000..fa14f8434970 --- /dev/null +++ b/src/common/cuda_rt_utils.h @@ -0,0 +1,21 @@ +/** + * Copyright 2024, XGBoost contributors + */ +#pragma once +#include // for int32_t +namespace xgboost::common { +std::int32_t AllVisibleGPUs(); + +std::int32_t CurrentDevice(); + +// Whether the device supports coherently accessing pageable memory without calling +// `cudaHostRegister` on it +bool SupportsPageableMem(); + +// Address Translation Service (ATS) +bool SupportsAts(); + +void CheckComputeCapability(); + +void SetDevice(std::int32_t device); +} // namespace xgboost::common diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index 98a76d72a263..34faa4eb013f 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -157,18 +157,6 @@ inline size_t MaxSharedMemoryOptin(int device_idx) { return static_cast(max_shared_memory); } -inline void CheckComputeCapability() { - for (int d_idx = 0; d_idx < xgboost::common::AllVisibleGPUs(); ++d_idx) { - cudaDeviceProp prop; - safe_cuda(cudaGetDeviceProperties(&prop, d_idx)); - std::ostringstream oss; - oss << "CUDA Capability Major/Minor version number: " << prop.major << "." - << prop.minor << " is insufficient. 
Need >=3.5"; - int failed = prop.major < 3 || (prop.major == 3 && prop.minor < 5); - if (failed) LOG(WARNING) << oss.str() << " for device: " << d_idx; - } -} - XGBOOST_DEV_INLINE void AtomicOrByte(unsigned int *__restrict__ buffer, size_t ibyte, unsigned char b) { atomicOr(&buffer[ibyte / sizeof(unsigned int)], @@ -273,13 +261,15 @@ void Iota(Container array, cudaStream_t stream) { } // dh::DebugSyncDevice(__FILE__, __LINE__); -inline void DebugSyncDevice(std::string file="", int32_t line = -1) { - if (file != "" && line != -1) { - auto rank = xgboost::collective::GetRank(); - LOG(DEBUG) << "R:" << rank << ": " << file << ":" << line; +inline void DebugSyncDevice(char const *file = __builtin_FILE(), int32_t line = __builtin_LINE()) { + { + auto err = cudaDeviceSynchronize(); + ThrowOnCudaError(err, file, line); + } + { + auto err = cudaGetLastError(); + ThrowOnCudaError(err, file, line); } - safe_cuda(cudaDeviceSynchronize()); - safe_cuda(cudaGetLastError()); } // Faster to instantiate than caching_device_vector and invokes no synchronisation diff --git a/src/common/io.cc b/src/common/io.cc index 1715669b091a..4bc8d9de4f53 100644 --- a/src/common/io.cc +++ b/src/common/io.cc @@ -1,26 +1,21 @@ /** - * Copyright 2019-2023, by XGBoost Contributors + * Copyright 2019-2024, by XGBoost Contributors */ -#if !defined(NOMINMAX) && defined(_WIN32) -#define NOMINMAX -#endif // !defined(NOMINMAX) +#if defined(__unix__) || defined(__APPLE__) -#if !defined(xgboost_IS_WIN) +#include // for open, O_RDONLY +#include // for mmap, mmap64, munmap, madvise +#include // for close, getpagesize -#if defined(_MSC_VER) || defined(__MINGW32__) -#define xgboost_IS_WIN 1 -#endif // defined(_MSC_VER) || defined(__MINGW32__) +#else -#endif // !defined(xgboost_IS_WIN) +#include -#if defined(__unix__) || defined(__APPLE__) -#include // for open, O_RDONLY -#include // for mmap, mmap64, munmap -#include // for close, getpagesize -#elif defined(xgboost_IS_WIN) -#define WIN32_LEAN_AND_MEAN +#if defined(xgboost_IS_WIN) #include -#endif // defined(__unix__) +#endif // defined(xgboost_IS_WIN) + +#endif // defined(__unix__) || defined(__APPLE__) #include // for copy, transform #include // for tolower @@ -31,8 +26,7 @@ #include // for filesystem, weakly_canonical #include // for ifstream #include // for distance -#include // for numeric_limits -#include // for unique_ptr +#include // for unique_ptr, make_unique #include // for string #include // for error_code, system_category #include // for move @@ -40,7 +34,12 @@ #include "io.h" #include "xgboost/collective/socket.h" // for LastError -#include "xgboost/logging.h" +#include "xgboost/logging.h" // for CHECK_LE +#include "xgboost/string_view.h" // for StringView + +#if !defined(__linux__) && !defined(__GLIBC__) && !defined(xgboost_IS_WIN) +#include // for numeric_limits +#endif namespace xgboost::common { size_t PeekableInStream::Read(void* dptr, size_t size) { @@ -182,39 +181,9 @@ std::string FileExtension(std::string fname, bool lower) { // NVCC 11.8 doesn't allow `noexcept(false) = default` altogether. 
 ResourceHandler::~ResourceHandler() noexcept(false) {}  // NOLINT
 
-struct MMAPFile {
-#if defined(xgboost_IS_WIN)
-  HANDLE fd{INVALID_HANDLE_VALUE};
-  HANDLE file_map{INVALID_HANDLE_VALUE};
-#else
-  std::int32_t fd{0};
-#endif
-  std::byte* base_ptr{nullptr};
-  std::size_t base_size{0};
-  std::size_t delta{0};
-  std::string path;
-
-  MMAPFile() = default;
-
-#if defined(xgboost_IS_WIN)
-  MMAPFile(HANDLE fd, HANDLE fm, std::byte* base_ptr, std::size_t base_size, std::size_t delta,
-           std::string path)
-      : fd{fd},
-        file_map{fm},
-        base_ptr{base_ptr},
-        base_size{base_size},
-        delta{delta},
-        path{std::move(path)} {}
-#else
-  MMAPFile(std::int32_t fd, std::byte* base_ptr, std::size_t base_size, std::size_t delta,
-           std::string path)
-      : fd{fd}, base_ptr{base_ptr}, base_size{base_size}, delta{delta}, path{std::move(path)} {}
-#endif
-};
-
-std::unique_ptr<MMAPFile> Open(std::string path, std::size_t offset, std::size_t length) {
+MMAPFile* detail::OpenMmap(std::string path, std::size_t offset, std::size_t length) {
   if (length == 0) {
-    return std::make_unique<MMAPFile>();
+    return new MMAPFile{};
   }
 
 #if defined(xgboost_IS_WIN)
@@ -234,10 +203,8 @@ std::unique_ptr<MMAPFile> Open(std::string path, std::size_t offset, std::size_t
 #if defined(__linux__) || defined(__GLIBC__)
   int prot{PROT_READ};
   ptr = reinterpret_cast<std::byte*>(mmap64(nullptr, view_size, prot, MAP_PRIVATE, fd, view_start));
-  madvise(ptr, view_size, MADV_WILLNEED);
   CHECK_NE(ptr, MAP_FAILED) << "Failed to map: " << path << ". " << SystemErrorMsg();
-  auto handle =
-      std::make_unique<MMAPFile>(fd, ptr, view_size, offset - view_start, std::move(path));
+  auto handle = new MMAPFile{fd, ptr, view_size, offset - view_start, std::move(path)};
#elif defined(xgboost_IS_WIN)
   auto file_size = GetFileSize(fd, nullptr);
   DWORD access = PAGE_READONLY;
   auto map_file = CreateFileMapping(fd, nullptr, access, 0, file_size, nullptr);
   CHECK(map_file) << "Failed to map: " << path << ". " << SystemErrorMsg();
   ptr = reinterpret_cast<std::byte*>(MapViewOfFile(map_file, access, hoff, loff, view_size));
   CHECK_NE(ptr, nullptr) << "Failed to map: " << path << ". " << SystemErrorMsg();
-  auto handle = std::make_unique<MMAPFile>(fd, map_file, ptr, view_size, offset - view_start,
-                                           std::move(path));
+  auto handle = new MMAPFile{fd, map_file, ptr, view_size, offset - view_start, std::move(path)};
 #else
   CHECK_LE(offset, std::numeric_limits::max())
       << "File size has exceeded the limit on the current system.";
   int prot{PROT_READ};
   ptr = reinterpret_cast<std::byte*>(mmap(nullptr, view_size, prot, MAP_PRIVATE, fd, view_start));
   CHECK_NE(ptr, MAP_FAILED) << "Failed to map: " << path << ". " << SystemErrorMsg();
" << SystemErrorMsg(); - auto handle = - std::make_unique(fd, ptr, view_size, offset - view_start, std::move(path)); -#endif // defined(__linux__) + auto handle = new MMAPFile{fd, ptr, view_size, offset - view_start, std::move(path)}; +#endif // defined(__linux__) || defined(__GLIBC__) return handle; } -MmapResource::MmapResource(std::string path, std::size_t offset, std::size_t length) - : ResourceHandler{kMmap}, handle_{Open(std::move(path), offset, length)}, n_{length} {} - -MmapResource::~MmapResource() noexcept(false) { - if (!handle_) { +void detail::CloseMmap(MMAPFile* handle) { + if (!handle) { return; } #if defined(xgboost_IS_WIN) - if (handle_->base_ptr) { - CHECK(UnmapViewOfFile(handle_->base_ptr)) "Faled to call munmap: " << SystemErrorMsg(); + if (handle->base_ptr) { + CHECK(UnmapViewOfFile(handle->base_ptr)) "Faled to call munmap: " << SystemErrorMsg(); } - if (handle_->fd != INVALID_HANDLE_VALUE) { - CHECK(CloseHandle(handle_->fd)) << "Failed to close handle: " << SystemErrorMsg(); + if (handle->fd != INVALID_HANDLE_VALUE) { + CHECK(CloseHandle(handle->fd)) << "Failed to close handle: " << SystemErrorMsg(); } - if (handle_->file_map != INVALID_HANDLE_VALUE) { - CHECK(CloseHandle(handle_->file_map)) << "Failed to close mapping object: " << SystemErrorMsg(); + if (handle->file_map != INVALID_HANDLE_VALUE) { + CHECK(CloseHandle(handle->file_map)) << "Failed to close mapping object: " << SystemErrorMsg(); } #else - if (handle_->base_ptr) { - CHECK_NE(munmap(handle_->base_ptr, handle_->base_size), -1) - << "Faled to call munmap: " << handle_->path << ". " << SystemErrorMsg(); + if (handle->base_ptr) { + CHECK_NE(munmap(handle->base_ptr, handle->base_size), -1) + << "Faled to call munmap: `" << handle->path << "`. " << SystemErrorMsg(); } - if (handle_->fd != 0) { - CHECK_NE(close(handle_->fd), -1) - << "Faled to close: " << handle_->path << ". " << SystemErrorMsg(); + if (handle->fd != 0) { + CHECK_NE(close(handle->fd), -1) + << "Faled to close: `" << handle->path << "`. 
" << SystemErrorMsg(); } #endif + delete handle; } +MmapResource::MmapResource(StringView path, std::size_t offset, std::size_t length) + : ResourceHandler{kMmap}, + handle_{detail::OpenMmap(std::string{path}, offset, length), detail::CloseMmap}, + n_{length} { +#if defined(__unix__) || defined(__APPLE__) + madvise(handle_->base_ptr, handle_->base_size, MADV_WILLNEED); +#endif // defined(__unix__) || defined(__APPLE__) +} + +MmapResource::~MmapResource() noexcept(false) = default; + [[nodiscard]] void* MmapResource::Data() { if (!handle_) { return nullptr; } - return handle_->base_ptr + handle_->delta; + return this->handle_->Data(); } [[nodiscard]] std::size_t MmapResource::Size() const { return n_; } @@ -329,7 +303,3 @@ AlignedMemWriteStream::~AlignedMemWriteStream() = default; return this->pimpl_->Tell(); } } // namespace xgboost::common - -#if defined(xgboost_IS_WIN) -#undef xgboost_IS_WIN -#endif // defined(xgboost_IS_WIN) diff --git a/src/common/io.h b/src/common/io.h index 198ce70143be..5f2e2833625a 100644 --- a/src/common/io.h +++ b/src/common/io.h @@ -7,7 +7,11 @@ #ifndef XGBOOST_COMMON_IO_H_ #define XGBOOST_COMMON_IO_H_ -#include +#include + +#if defined(xgboost_IS_WIN) +#include +#endif // defined(xgboost_IS_WIN) #include // for min, fill_n, copy_n #include // for array @@ -15,6 +19,7 @@ #include // for malloc, realloc, free #include // for memcpy #include // for ifstream +#include // for function #include // for numeric_limits #include // for unique_ptr #include // for string @@ -23,6 +28,7 @@ #include // for vector #include "common.h" // for DivRoundUp +#include "dmlc/io.h" // for SeekStream #include "xgboost/string_view.h" // for StringView namespace xgboost::common { @@ -224,7 +230,48 @@ inline std::string ReadAll(std::string const &path) { return content; } -struct MMAPFile; +/** + * @brief A handle to mmap file. + */ +struct MMAPFile { +#if defined(xgboost_IS_WIN) + HANDLE fd{INVALID_HANDLE_VALUE}; + HANDLE file_map{INVALID_HANDLE_VALUE}; +#else + std::int32_t fd{0}; +#endif // defined(xgboost_IS_WIN) + std::byte* base_ptr{nullptr}; + std::size_t base_size{0}; + std::size_t delta{0}; + std::string path; + + MMAPFile() = default; + +#if defined(xgboost_IS_WIN) + MMAPFile(HANDLE fd, HANDLE fm, std::byte* base_ptr, std::size_t base_size, std::size_t delta, + std::string path) + : fd{fd}, + file_map{fm}, + base_ptr{base_ptr}, + base_size{base_size}, + delta{delta}, + path{std::move(path)} {} +#else + MMAPFile(std::int32_t fd, std::byte* base_ptr, std::size_t base_size, std::size_t delta, + std::string path) + : fd{fd}, base_ptr{base_ptr}, base_size{base_size}, delta{delta}, path{std::move(path)} {} +#endif // defined(xgboost_IS_WIN) + + void const* Data() const { return this->base_ptr + this->delta; } + void* Data() { return this->base_ptr + this->delta; } +}; + +namespace detail { +// call mmap +[[nodiscard]] MMAPFile* OpenMmap(std::string path, std::size_t offset, std::size_t length); +// close the mapped file handle. +void CloseMmap(MMAPFile* handle); +} // namespace detail /** * @brief Handler for one-shot resource. 
Unlike `std::pmr::*`, the resource handler is
@@ -237,6 +284,8 @@ class ResourceHandler {
   enum Kind : std::uint8_t {
     kMalloc = 0,
     kMmap = 1,
+    kCudaMalloc = 2,
+    kCudaMmap = 3,
   };
 
  private:
@@ -251,6 +300,20 @@ class ResourceHandler {
   [[nodiscard]] virtual std::size_t Size() const = 0;
   [[nodiscard]] auto Type() const { return kind_; }
+  [[nodiscard]] StringView TypeName() const {
+    switch (this->Type()) {
+      case kMalloc:
+        return "Malloc";
+      case kMmap:
+        return "Mmap";
+      case kCudaMalloc:
+        return "CudaMalloc";
+      case kCudaMmap:
+        return "CudaMmap";
+    }
+    LOG(FATAL) << "Unreachable.";
+    return {};
+  }
   // Allow exceptions for cleaning up resource.
   virtual ~ResourceHandler() noexcept(false);
 
@@ -339,11 +402,11 @@ class MallocResource : public ResourceHandler {
  * @brief A class for wrapping mmap as a resource for RAII.
  */
 class MmapResource : public ResourceHandler {
-  std::unique_ptr<MMAPFile> handle_;
+  std::unique_ptr<MMAPFile, std::function<void(MMAPFile*)>> handle_;
   std::size_t n_;
 
  public:
-  MmapResource(std::string path, std::size_t offset, std::size_t length);
+  MmapResource(StringView path, std::size_t offset, std::size_t length);
   ~MmapResource() noexcept(false) override;
 
   [[nodiscard]] void* Data() override;
@@ -471,9 +534,9 @@ class PrivateMmapConstStream : public AlignedResourceReadStream {
    * @param offset See the `offset` parameter of `mmap` for details.
    * @param length See the `length` parameter of `mmap` for details.
    */
-  explicit PrivateMmapConstStream(std::string path, std::size_t offset, std::size_t length)
+  explicit PrivateMmapConstStream(StringView path, std::size_t offset, std::size_t length)
       : AlignedResourceReadStream{std::shared_ptr<ResourceHandler>{  // NOLINT
-            new MmapResource{std::move(path), offset, length}}} {}
+            new MmapResource{path, offset, length}}} {}
   ~PrivateMmapConstStream() noexcept(false) override;
 };
diff --git a/src/common/ref_resource_view.cuh b/src/common/ref_resource_view.cuh
new file mode 100644
index 000000000000..ff311c1409a7
--- /dev/null
+++ b/src/common/ref_resource_view.cuh
@@ -0,0 +1,26 @@
+/**
+ * Copyright 2024, XGBoost Contributors
+ */
+#pragma once
+
+#include <cstddef>  // for size_t
+#include <memory>   // for make_shared
+
+#include "cuda_context.cuh"     // for CUDAContext
+#include "ref_resource_view.h"  // for RefResourceView
+#include "resource.cuh"         // for CudaMallocResource
+#include "xgboost/context.h"    // for Context
+
+namespace xgboost::common {
+/**
+ * @brief Make a fixed size `RefResourceView` with cudaMalloc resource.
+ */
+template <typename T>
+[[nodiscard]] RefResourceView<T> MakeFixedVecWithCudaMalloc(Context const* ctx,
+                                                            std::size_t n_elements, T const& init) {
+  auto resource = std::make_shared<common::CudaMallocResource>(n_elements * sizeof(T));
+  auto ref = RefResourceView{resource->DataAs<T>(), n_elements, resource};
+  thrust::fill_n(ctx->CUDACtx()->CTP(), ref.data(), ref.size(), init);
+  return ref;
+}
+}  // namespace xgboost::common
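MakeFixedVecWithCudaMalloc mirrors the CPU-side MakeFixedVecWithMalloc below: the resource owns the allocation while the returned view only references it. A usage sketch (values are illustrative, and the context is assumed to be a CUDA one):

    #include "common/ref_resource_view.cuh"

    void Example(xgboost::Context const* ctx) {
      // 256 device-resident floats, filled with 1.0f on the context's stream.
      auto vec = xgboost::common::MakeFixedVecWithCudaMalloc<float>(ctx, 256, 1.0f);
      // The view keeps its CudaMallocResource alive; no manual free is needed.
    }
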
diff --git a/src/common/ref_resource_view.h b/src/common/ref_resource_view.h
index 61adfdb7bea8..81058d923d3b 100644
--- a/src/common/ref_resource_view.h
+++ b/src/common/ref_resource_view.h
@@ -43,24 +43,16 @@ class RefResourceView {
   }
 
  public:
-  RefResourceView(value_type* ptr, size_type n, std::shared_ptr<ResourceHandler> mem)
-      : ptr_{ptr}, size_{n}, mem_{std::move(mem)} {
-    CHECK_GE(mem_->Size(), n);
-  }
   /**
    * @brief Construct a view on ptr with length n. The ptr is held by the mem resource.
    *
    * @param ptr The pointer to view.
    * @param n The length of the view.
    * @param mem The owner of the pointer.
-   * @param init Initialize the view with this value.
    */
-  RefResourceView(value_type* ptr, size_type n, std::shared_ptr<ResourceHandler> mem,
-                  T const& init)
-      : RefResourceView{ptr, n, mem} {
-    if (n != 0) {
-      std::fill_n(ptr_, n, init);
-    }
+  RefResourceView(value_type* ptr, size_type n, std::shared_ptr<ResourceHandler> mem)
+      : ptr_{ptr}, size_{n}, mem_{std::move(mem)} {
+    CHECK_GE(mem_->Size(), n);
   }
 
   ~RefResourceView() = default;
@@ -159,7 +151,9 @@ template <typename T>
 template <typename T>
 [[nodiscard]] RefResourceView<T> MakeFixedVecWithMalloc(std::size_t n_elements, T const& init) {
   auto resource = std::make_shared<MallocResource>(n_elements * sizeof(T));
-  return RefResourceView{resource->DataAs<T>(), n_elements, resource, init};
+  auto ref = RefResourceView{resource->DataAs<T>(), n_elements, resource};
+  std::fill_n(ref.data(), ref.size(), init);
+  return ref;
 }
 
 template <typename T>
diff --git a/src/common/resource.cu b/src/common/resource.cu
new file mode 100644
index 000000000000..ef662e3bd6e0
--- /dev/null
+++ b/src/common/resource.cu
@@ -0,0 +1,43 @@
+/**
+ * Copyright 2024, XGBoost Contributors
+ */
+#include "device_helpers.cuh"     // for CurrentDevice
+#include "resource.cuh"
+#include "xgboost/string_view.h"  // for StringView
+
+namespace xgboost::common {
+CudaMmapResource::CudaMmapResource(StringView path, std::size_t offset, std::size_t length)
+    : ResourceHandler{kCudaMmap},
+      handle_{detail::OpenMmap(std::string{path}, offset, length),
+              [](MMAPFile* handle) {
+                // Don't close the mmap while CUDA kernel is running.
+                if (handle) {
+                  dh::DefaultStream().Sync();
+                }
+                detail::CloseMmap(handle);
+              }},
+      n_{length} {
+  auto device = dh::CurrentDevice();
+  dh::safe_cuda(
+      cudaMemAdvise(handle_->base_ptr, handle_->base_size, cudaMemAdviseSetReadMostly, device));
+  dh::safe_cuda(cudaMemAdvise(handle_->base_ptr, handle_->base_size,
+                              cudaMemAdviseSetPreferredLocation, device));
+  dh::safe_cuda(
+      cudaMemAdvise(handle_->base_ptr, handle_->base_size, cudaMemAdviseSetAccessedBy, device));
+  dh::safe_cuda(
+      cudaMemPrefetchAsync(handle_->base_ptr, handle_->base_size, device, dh::DefaultStream()));
+}
+
+[[nodiscard]] void* CudaMmapResource::Data() {
+  if (!handle_) {
+    return nullptr;
+  }
+  return this->handle_->Data();
+}
+
+[[nodiscard]] std::size_t CudaMmapResource::Size() const { return n_; }
+
+CudaMmapResource::~CudaMmapResource() noexcept(false) = default;
+
+PrivateCudaMmapConstStream::~PrivateCudaMmapConstStream() noexcept(false) = default;
+}  // namespace xgboost::common
diff --git a/src/common/resource.cuh b/src/common/resource.cuh
new file mode 100644
index 000000000000..90b9756a9fc2
--- /dev/null
+++ b/src/common/resource.cuh
@@ -0,0 +1,54 @@
+/**
+ * Copyright 2024, XGBoost Contributors
+ */
+#pragma once
+#include <cstddef>     // for size_t
+#include <functional>  // for function
+
+#include "device_vector.cuh"      // for DeviceUVector
+#include "io.h"                   // for ResourceHandler, MMAPFile
+#include "xgboost/string_view.h"  // for StringView
+
+namespace xgboost::common {
+/**
+ * @brief Resource backed by `cudaMalloc`.
+ */ +class CudaMallocResource : public ResourceHandler { + dh::DeviceUVector storage_; + + void Clear() noexcept(true) { this->Resize(0); } + + public: + explicit CudaMallocResource(std::size_t n_bytes) : ResourceHandler{kCudaMalloc} { + this->Resize(n_bytes); + } + ~CudaMallocResource() noexcept(true) override { this->Clear(); } + + void* Data() override { return storage_.data(); } + [[nodiscard]] std::size_t Size() const override { return storage_.size(); } + void Resize(std::size_t n_bytes, std::byte init = std::byte{0}) { + this->storage_.resize(n_bytes, init); + } +}; + +class CudaMmapResource : public ResourceHandler { + std::unique_ptr> handle_; + std::size_t n_; + + public: + CudaMmapResource() : ResourceHandler{kCudaMmap} {} + CudaMmapResource(StringView path, std::size_t offset, std::size_t length); + ~CudaMmapResource() noexcept(false) override; + + [[nodiscard]] void* Data() override; + [[nodiscard]] std::size_t Size() const override; +}; + +class PrivateCudaMmapConstStream : public AlignedResourceReadStream { + public: + explicit PrivateCudaMmapConstStream(StringView path, std::size_t offset, std::size_t length) + : AlignedResourceReadStream{ + std::shared_ptr{new CudaMmapResource{path, offset, length}}} {} + ~PrivateCudaMmapConstStream() noexcept(false) override; +}; +} // namespace xgboost::common diff --git a/src/context.cc b/src/context.cc index ef7110e7ce19..19060d5fc830 100644 --- a/src/context.cc +++ b/src/context.cc @@ -1,5 +1,5 @@ /** - * Copyright 2014-2023 by XGBoost Contributors + * Copyright 2014-2024, XGBoost Contributors * * \brief Context object used for controlling runtime parameters. */ @@ -11,8 +11,9 @@ #include // for optional #include // for regex_replace, regex_match -#include "common/common.h" // AssertGPUSupport -#include "common/error_msg.h" // WarnDeprecatedGPUId +#include "common/common.h" // AssertGPUSupport +#include "common/cuda_rt_utils.h" // for AllVisibleGPUs +#include "common/error_msg.h" // WarnDeprecatedGPUId #include "common/threading_utils.h" #include "xgboost/string_view.h" diff --git a/src/data/ellpack_page.cu b/src/data/ellpack_page.cu index 81656284eb49..7d3f4c820a22 100644 --- a/src/data/ellpack_page.cu +++ b/src/data/ellpack_page.cu @@ -11,8 +11,9 @@ #include "../common/categorical.h" #include "../common/cuda_context.cuh" #include "../common/hist_util.cuh" -#include "../common/transform_iterator.h" // MakeIndexTransformIter -#include "device_adapter.cuh" // for NoInfInData +#include "../common/ref_resource_view.cuh" // for MakeFixedVecWithCudaMalloc +#include "../common/transform_iterator.h" // MakeIndexTransformIter +#include "device_adapter.cuh" // for NoInfInData #include "ellpack_page.cuh" #include "ellpack_page.h" #include "gradient_index.h" @@ -43,21 +44,19 @@ __global__ void CompressBinEllpackKernel( common::CompressedBufferWriter wr, common::CompressedByteT* __restrict__ buffer, // gidx_buffer const size_t* __restrict__ row_ptrs, // row offset of input data - const Entry* __restrict__ entries, // One batch of input data - const float* __restrict__ cuts, // HistogramCuts::cut_values_ - const uint32_t* __restrict__ cut_ptrs, // HistogramCuts::cut_ptrs_ + const Entry* __restrict__ entries, // One batch of input data + const float* __restrict__ cuts, // HistogramCuts::cut_values_ + const uint32_t* __restrict__ cut_ptrs, // HistogramCuts::cut_ptrs_ common::Span feature_types, - size_t base_row, // batch_row_begin - size_t n_rows, - size_t row_stride, - unsigned int null_gidx_value) { + size_t base_row, // batch_row_begin + size_t 
n_rows, size_t row_stride, std::uint32_t null_gidx_value) { size_t irow = threadIdx.x + blockIdx.x * blockDim.x; int ifeature = threadIdx.y + blockIdx.y * blockDim.y; if (irow >= n_rows || ifeature >= row_stride) { return; } int row_length = static_cast(row_ptrs[irow + 1] - row_ptrs[irow]); - unsigned int bin = null_gidx_value; + std::uint32_t bin = null_gidx_value; if (ifeature < row_length) { Entry entry = entries[row_ptrs[irow] - row_ptrs[0] + ifeature]; int feature = entry.index; @@ -89,25 +88,23 @@ __global__ void CompressBinEllpackKernel( } // Construct an ELLPACK matrix with the given number of empty rows. -EllpackPageImpl::EllpackPageImpl(DeviceOrd device, +EllpackPageImpl::EllpackPageImpl(Context const* ctx, std::shared_ptr cuts, bool is_dense, bst_idx_t row_stride, bst_idx_t n_rows) : is_dense(is_dense), cuts_(std::move(cuts)), row_stride{row_stride}, n_rows{n_rows} { monitor_.Init("ellpack_page"); - dh::safe_cuda(cudaSetDevice(device.ordinal)); + dh::safe_cuda(cudaSetDevice(ctx->Ordinal())); - monitor_.Start("InitCompressedData"); - this->InitCompressedData(device); - monitor_.Stop("InitCompressedData"); + this->InitCompressedData(ctx); } -EllpackPageImpl::EllpackPageImpl(DeviceOrd device, +EllpackPageImpl::EllpackPageImpl(Context const* ctx, std::shared_ptr cuts, const SparsePage& page, bool is_dense, size_t row_stride, common::Span feature_types) : cuts_(std::move(cuts)), is_dense(is_dense), n_rows(page.Size()), row_stride(row_stride) { - this->InitCompressedData(device); - this->CreateHistIndices(device, page, feature_types); + this->InitCompressedData(ctx); + this->CreateHistIndices(ctx->Device(), page, feature_types); } // Construct an ELLPACK matrix in memory. @@ -129,9 +126,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchP } monitor_.Stop("Quantiles"); - monitor_.Start("InitCompressedData"); - this->InitCompressedData(ctx->Device()); - monitor_.Stop("InitCompressedData"); + this->InitCompressedData(ctx); dmat->Info().feature_types.SetDevice(ctx->Device()); auto ft = dmat->Info().feature_types.ConstDeviceSpan(); @@ -234,7 +229,7 @@ void CopyDataToEllpack(const AdapterBatchT& batch, common::SpanGetDeviceAccessor(device); common::CompressedBufferWriter writer(device_accessor.NumSymbols()); - auto d_compressed_buffer = dst->gidx_buffer.DevicePointer(); + auto d_compressed_buffer = dst->gidx_buffer.data(); // We redirect the scan output into this functor to do the actual writing WriteCompressedEllpackFunctor functor( @@ -275,7 +270,7 @@ void WriteNullValues(EllpackPageImpl* dst, DeviceOrd device, common::SpanGetDeviceAccessor(device); common::CompressedBufferWriter writer(device_accessor.NumSymbols()); - auto d_compressed_buffer = dst->gidx_buffer.DevicePointer(); + auto d_compressed_buffer = dst->gidx_buffer.data(); auto row_stride = dst->row_stride; dh::LaunchN(row_stride * dst->n_rows, [=] __device__(size_t idx) { // For some reason this variable got captured as const @@ -290,20 +285,20 @@ void WriteNullValues(EllpackPageImpl* dst, DeviceOrd device, common::Span -EllpackPageImpl::EllpackPageImpl(AdapterBatch batch, float missing, DeviceOrd device, bool is_dense, - common::Span row_counts_span, +EllpackPageImpl::EllpackPageImpl(Context const* ctx, AdapterBatch batch, float missing, + bool is_dense, common::Span row_counts_span, common::Span feature_types, size_t row_stride, size_t n_rows, std::shared_ptr cuts) { - dh::safe_cuda(cudaSetDevice(device.ordinal)); + dh::safe_cuda(cudaSetDevice(ctx->Ordinal())); - *this = EllpackPageImpl(device, 
cuts, is_dense, row_stride, n_rows); - CopyDataToEllpack(batch, feature_types, this, device, missing); - WriteNullValues(this, device, row_counts_span); + *this = EllpackPageImpl(ctx, cuts, is_dense, row_stride, n_rows); + CopyDataToEllpack(batch, feature_types, this, ctx->Device(), missing); + WriteNullValues(this, ctx->Device(), row_counts_span); } #define ELLPACK_BATCH_SPECIALIZE(__BATCH_T) \ template EllpackPageImpl::EllpackPageImpl( \ - __BATCH_T batch, float missing, DeviceOrd device, bool is_dense, \ + Context const* ctx, __BATCH_T batch, float missing, bool is_dense, \ common::Span row_counts_span, common::Span feature_types, \ size_t row_stride, size_t n_rows, std::shared_ptr cuts); @@ -365,12 +360,10 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag row_stride = *std::max_element(it, it + page.Size()); CHECK(ctx->IsCUDA()); - monitor_.Start("InitCompressedData"); - InitCompressedData(ctx->Device()); - monitor_.Stop("InitCompressedData"); + InitCompressedData(ctx); // copy gidx - common::CompressedByteT* d_compressed_buffer = gidx_buffer.DevicePointer(); + common::CompressedByteT* d_compressed_buffer = gidx_buffer.data(); dh::device_vector row_ptr(page.row_ptr.size()); auto d_row_ptr = dh::ToSpan(row_ptr); dh::safe_cuda(cudaMemcpyAsync(d_row_ptr.data(), page.row_ptr.data(), d_row_ptr.size_bytes(), @@ -389,20 +382,20 @@ struct CopyPage { // The number of elements to skip. size_t offset; - CopyPage(EllpackPageImpl *dst, EllpackPageImpl const *src, size_t offset) - : cbw{dst->NumSymbols()}, dst_data_d{dst->gidx_buffer.DevicePointer()}, - src_iterator_d{src->gidx_buffer.DevicePointer(), src->NumSymbols()}, + CopyPage(EllpackPageImpl* dst, EllpackPageImpl const* src, size_t offset) + : cbw{dst->NumSymbols()}, + dst_data_d{dst->gidx_buffer.data()}, + src_iterator_d{src->gidx_buffer.data(), src->NumSymbols()}, offset(offset) {} __device__ void operator()(size_t element_id) { - cbw.AtomicWriteSymbol(dst_data_d, src_iterator_d[element_id], - element_id + offset); + cbw.AtomicWriteSymbol(dst_data_d, src_iterator_d[element_id], element_id + offset); } }; // Copy the data from the given EllpackPage to the current page. -size_t EllpackPageImpl::Copy(DeviceOrd device, EllpackPageImpl const* page, size_t offset) { - monitor_.Start("Copy"); +size_t EllpackPageImpl::Copy(Context const* ctx, EllpackPageImpl const* page, bst_idx_t offset) { + monitor_.Start(__func__); bst_idx_t num_elements = page->n_rows * page->row_stride; CHECK_EQ(row_stride, page->row_stride); CHECK_EQ(NumSymbols(), page->NumSymbols()); @@ -411,10 +404,8 @@ size_t EllpackPageImpl::Copy(DeviceOrd device, EllpackPageImpl const* page, size LOG(FATAL) << "Concatenating the same Ellpack."; return this->n_rows * this->row_stride; } - gidx_buffer.SetDevice(device); - page->gidx_buffer.SetDevice(device); - dh::LaunchN(num_elements, CopyPage(this, page, offset)); - monitor_.Stop("Copy"); + dh::LaunchN(num_elements, CopyPage{this, page, offset}); + monitor_.Stop(__func__); return num_elements; } @@ -423,8 +414,8 @@ struct CompactPage { common::CompressedBufferWriter cbw; common::CompressedByteT* dst_data_d; common::CompressedIterator src_iterator_d; - /*! \brief An array that maps the rows from the full DMatrix to the compacted - * page. + /** + * @brief An array that maps the rows from the full DMatrix to the compacted page. * * The total size is the number of rows in the original, uncompacted DMatrix. * Elements are the row ids in the compacted page. 
Rows not needed are set to @@ -438,24 +429,24 @@ struct CompactPage { size_t base_rowid; size_t row_stride; - CompactPage(EllpackPageImpl* dst, EllpackPageImpl const* src, - common::Span row_indexes) + CompactPage(EllpackPageImpl* dst, EllpackPageImpl const* src, common::Span row_indexes) : cbw{dst->NumSymbols()}, - dst_data_d{dst->gidx_buffer.DevicePointer()}, - src_iterator_d{src->gidx_buffer.DevicePointer(), src->NumSymbols()}, + dst_data_d{dst->gidx_buffer.data()}, + src_iterator_d{src->gidx_buffer.data(), src->NumSymbols()}, row_indexes(row_indexes), base_rowid{src->base_rowid}, row_stride{src->row_stride} {} - __device__ void operator()(size_t row_id) { + __device__ void operator()(bst_idx_t row_id) { size_t src_row = base_rowid + row_id; size_t dst_row = row_indexes[src_row]; - if (dst_row == SIZE_MAX) return; + if (dst_row == SIZE_MAX) { + return; + } size_t dst_offset = dst_row * row_stride; size_t src_offset = row_id * row_stride; for (size_t j = 0; j < row_stride; j++) { - cbw.AtomicWriteSymbol(dst_data_d, src_iterator_d[src_offset + j], - dst_offset + j); + cbw.AtomicWriteSymbol(dst_data_d, src_iterator_d[src_offset + j], dst_offset + j); } } }; @@ -467,28 +458,22 @@ void EllpackPageImpl::Compact(Context const* ctx, EllpackPageImpl const* page, CHECK_EQ(row_stride, page->row_stride); CHECK_EQ(NumSymbols(), page->NumSymbols()); CHECK_LE(page->base_rowid + page->n_rows, row_indexes.size()); - gidx_buffer.SetDevice(ctx->Device()); - page->gidx_buffer.SetDevice(ctx->Device()); auto cuctx = ctx->CUDACtx(); - dh::LaunchN(page->n_rows, cuctx->Stream(), CompactPage(this, page, row_indexes)); + dh::LaunchN(page->n_rows, cuctx->Stream(), CompactPage{this, page, row_indexes}); monitor_.Stop(__func__); } // Initialize the buffer to stored compressed features. -void EllpackPageImpl::InitCompressedData(DeviceOrd device) { - size_t num_symbols = NumSymbols(); +void EllpackPageImpl::InitCompressedData(Context const* ctx) { + monitor_.Start(__func__); + auto num_symbols = NumSymbols(); // Required buffer size for storing data matrix in ELLPack format. - size_t compressed_size_bytes = + std::size_t compressed_size_bytes = common::CompressedBufferWriter::CalculateBufferSize(row_stride * n_rows, num_symbols); - gidx_buffer.SetDevice(device); - // Don't call fill unnecessarily - if (gidx_buffer.Size() == 0) { - gidx_buffer.Resize(compressed_size_bytes, 0); - } else { - gidx_buffer.Resize(compressed_size_bytes, 0); - thrust::fill(dh::tbegin(gidx_buffer), dh::tend(gidx_buffer), 0); - } + auto init = static_cast(0); + gidx_buffer = common::MakeFixedVecWithCudaMalloc(ctx, compressed_size_bytes, init); + monitor_.Stop(__func__); } // Compress a CSR page into ELLPACK. 
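As a sanity check on the sizes involved: the buffer allocated in InitCompressedData above is bit-packed, with each entry taking ceil(log2(num_symbols)) bits, where the symbol count is the total number of bins plus one null marker. Illustrative numbers only; the exact padding comes from CompressedBufferWriter::CalculateBufferSize:

    #include "common/compressed_iterator.h"  // for CompressedBufferWriter

    // 1000 rows with row_stride 16 and 256 bins gives 257 symbols, so 9 bits
    // per symbol: 1000 * 16 * 9 = 144000 bits = 18000 bytes, plus whatever
    // alignment slack CalculateBufferSize adds for safe word-wise writes.
    std::size_t n_bytes = xgboost::common::CompressedBufferWriter::CalculateBufferSize(
        /*num_elements=*/1000 * 16, /*num_symbols=*/257);
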
@@ -496,7 +481,7 @@ void EllpackPageImpl::CreateHistIndices(DeviceOrd device, const SparsePage& row_batch, common::Span feature_types) { if (row_batch.Size() == 0) return; - unsigned int null_gidx_value = NumSymbols() - 1; + std::uint32_t null_gidx_value = NumSymbols() - 1; const auto& offset_vec = row_batch.offset.ConstHostVector(); @@ -541,13 +526,11 @@ void EllpackPageImpl::CreateHistIndices(DeviceOrd device, const dim3 grid3(common::DivRoundUp(batch_nrows, block3.x), common::DivRoundUp(row_stride, block3.y), 1); auto device_accessor = GetDeviceAccessor(device); - dh::LaunchKernel {grid3, block3}( - CompressBinEllpackKernel, common::CompressedBufferWriter(NumSymbols()), - gidx_buffer.DevicePointer(), row_ptrs.data().get(), - entries_d.data().get(), device_accessor.gidx_fvalue_map.data(), - device_accessor.feature_segments.data(), feature_types, - batch_row_begin, batch_nrows, row_stride, - null_gidx_value); + dh::LaunchKernel{grid3, block3}( // NOLINT + CompressBinEllpackKernel, common::CompressedBufferWriter(NumSymbols()), gidx_buffer.data(), + row_ptrs.data().get(), entries_d.data().get(), device_accessor.gidx_fvalue_map.data(), + device_accessor.feature_segments.data(), feature_types, batch_row_begin, batch_nrows, + row_stride, null_gidx_value); } } @@ -566,26 +549,31 @@ size_t EllpackPageImpl::MemCostBytes(size_t num_rows, size_t row_stride, EllpackDeviceAccessor EllpackPageImpl::GetDeviceAccessor( DeviceOrd device, common::Span feature_types) const { - gidx_buffer.SetDevice(device); return {device, cuts_, is_dense, row_stride, base_rowid, n_rows, - common::CompressedIterator(gidx_buffer.ConstDevicePointer(), - NumSymbols()), + common::CompressedIterator(gidx_buffer.data(), NumSymbols()), feature_types}; } + EllpackDeviceAccessor EllpackPageImpl::GetHostAccessor( + Context const* ctx, std::vector* h_gidx_buffer, common::Span feature_types) const { + h_gidx_buffer->resize(gidx_buffer.size()); + CHECK_EQ(h_gidx_buffer->size(), gidx_buffer.size()); + CHECK_NE(gidx_buffer.size(), 0); + dh::safe_cuda(cudaMemcpyAsync(h_gidx_buffer->data(), gidx_buffer.data(), gidx_buffer.size_bytes(), + cudaMemcpyDefault, dh::DefaultStream())); return {DeviceOrd::CPU(), cuts_, is_dense, row_stride, base_rowid, n_rows, - common::CompressedIterator(gidx_buffer.ConstHostPointer(), NumSymbols()), + common::CompressedIterator(h_gidx_buffer->data(), NumSymbols()), feature_types}; } } // namespace xgboost diff --git a/src/data/ellpack_page.cuh b/src/data/ellpack_page.cuh index d1f9472df4c4..18b9384afbd7 100644 --- a/src/data/ellpack_page.cuh +++ b/src/data/ellpack_page.cuh @@ -1,23 +1,25 @@ /** - * Copyright 2019-2023, XGBoost Contributors + * Copyright 2019-2024, XGBoost Contributors */ - #ifndef XGBOOST_DATA_ELLPACK_PAGE_CUH_ #define XGBOOST_DATA_ELLPACK_PAGE_CUH_ #include -#include #include "../common/categorical.h" #include "../common/compressed_iterator.h" #include "../common/device_helpers.cuh" #include "../common/hist_util.h" +#include "../common/ref_resource_view.h" // for RefResourceView #include "ellpack_page.h" +#include "xgboost/data.h" namespace xgboost { -/** \brief Struct for accessing and manipulating an ELLPACK matrix on the - * device. Does not own underlying memory and may be trivially copied into - * kernels.*/ +/** + * @brief Struct for accessing and manipulating an ELLPACK matrix on the device. + * + * Does not own underlying memory and may be trivially copied into kernels. + */ struct EllpackDeviceAccessor { /*! \brief Whether or not if the matrix is dense. 
*/ bool is_dense; @@ -128,31 +130,31 @@ class GHistIndexMatrix; class EllpackPageImpl { public: - /*! - * \brief Default constructor. + /** + * @brief Default constructor. * * This is used in the external memory case. An empty ELLPACK page is constructed with its content * set later by the reader. */ EllpackPageImpl() = default; - /*! - * \brief Constructor from an existing EllpackInfo. + /** + * @brief Constructor from an existing EllpackInfo. * - * This is used in the sampling case. The ELLPACK page is constructed from an existing EllpackInfo - * and the given number of rows. + * This is used in the sampling case. The ELLPACK page is constructed from an existing + * Ellpack page and the given number of rows. */ - EllpackPageImpl(DeviceOrd device, std::shared_ptr cuts, + EllpackPageImpl(Context const* ctx, std::shared_ptr cuts, bool is_dense, bst_idx_t row_stride, bst_idx_t n_rows); - /*! - * \brief Constructor used for external memory. + /** + * @brief Constructor used for external memory. */ - EllpackPageImpl(DeviceOrd device, std::shared_ptr cuts, + EllpackPageImpl(Context const* ctx, std::shared_ptr cuts, const SparsePage& page, bool is_dense, size_t row_stride, common::Span feature_types); - /*! - * \brief Constructor from an existing DMatrix. + /** + * @brief Constructor from an existing DMatrix. * * This is used in the in-memory case. The ELLPACK page is constructed from an existing DMatrix * in CSR format. @@ -160,37 +162,39 @@ class EllpackPageImpl { explicit EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchParam& parm); template - explicit EllpackPageImpl(AdapterBatch batch, float missing, DeviceOrd device, bool is_dense, + explicit EllpackPageImpl(Context const* ctx, AdapterBatch batch, float missing, bool is_dense, common::Span row_counts_span, common::Span feature_types, size_t row_stride, size_t n_rows, std::shared_ptr cuts); /** - * \brief Constructor from an existing CPU gradient index. + * @brief Constructor from an existing CPU gradient index. */ explicit EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& page, common::Span ft); - /*! \brief Copy the elements of the given ELLPACK page into this page. + /** + * @brief Copy the elements of the given ELLPACK page into this page. * - * @param device The GPU device to use. + * @param ctx The GPU context. * @param page The ELLPACK page to copy from. * @param offset The number of elements to skip before copying. * @returns The number of elements copied. */ - size_t Copy(DeviceOrd device, EllpackPageImpl const *page, size_t offset); + bst_idx_t Copy(Context const* ctx, EllpackPageImpl const* page, bst_idx_t offset); - /*! \brief Compact the given ELLPACK page into the current page. + /** + * @brief Compact the given ELLPACK page into the current page. * - * @param context The GPU context. + * @param ctx The GPU context. * @param page The ELLPACK page to compact from. * @param row_indexes Row indexes for the compacted page. */ void Compact(Context const* ctx, EllpackPageImpl const* page, common::Span row_indexes); - /*! \return Number of instances in the page. */ + /** @return Number of instances in the page. */ [[nodiscard]] bst_idx_t Size() const; - /*! \brief Set the base row id for this page. */ + /** @brief Set the base row id for this page. */ void SetBaseRowId(std::size_t row_id) { base_rowid = row_id; } @@ -199,43 +203,54 @@ class EllpackPageImpl { [[nodiscard]] std::shared_ptr CutsShared() const { return cuts_; } void SetCuts(std::shared_ptr cuts) { cuts_ = cuts; } - /*! 
\return Estimation of memory cost of this page. */
+  /** @return Estimation of memory cost of this page. */
   static size_t MemCostBytes(size_t num_rows, size_t row_stride, const common::HistogramCuts&cuts) ;
 
-  /*! \brief Return the total number of symbols (total number of bins plus 1 for
-   * not found). */
+  /**
+   * @brief Return the total number of symbols (total number of bins plus 1 for not
+   * found).
+   */
   [[nodiscard]] std::size_t NumSymbols() const { return cuts_->TotalBins() + 1; }
-
+  /**
+   * @brief Get an accessor that can be passed into CUDA kernels.
+   */
   [[nodiscard]] EllpackDeviceAccessor GetDeviceAccessor(
       DeviceOrd device, common::Span<FeatureType const> feature_types = {}) const;
+  /**
+   * @brief Get an accessor for host code.
+   */
   [[nodiscard]] EllpackDeviceAccessor GetHostAccessor(
+      Context const* ctx, std::vector<common::CompressedByteT>* h_gidx_buffer,
       common::Span<FeatureType const> feature_types = {}) const;
 
  private:
-  /*!
-   * \brief Compress a single page of CSR data into ELLPACK.
+  /**
+   * @brief Compress a single page of CSR data into ELLPACK.
    *
    * @param device The GPU device to use.
    * @param row_batch The CSR page.
    */
-  void CreateHistIndices(DeviceOrd device,
-                         const SparsePage& row_batch,
+  void CreateHistIndices(DeviceOrd device, const SparsePage& row_batch,
                          common::Span<FeatureType const> feature_types);
-  /*!
-   * \brief Initialize the buffer to store compressed features.
+  /**
+   * @brief Initialize the buffer to store compressed features.
    */
-  void InitCompressedData(DeviceOrd device);
+  void InitCompressedData(Context const* ctx);
 
  public:
-  /*! \brief Whether or not if the matrix is dense. */
+  /** @brief Whether or not the matrix is dense. */
   bool is_dense;
-  /*! \brief Row length for ELLPACK. */
+  /** @brief Row length for ELLPACK. */
   bst_idx_t row_stride;
   bst_idx_t base_rowid{0};
-  bst_idx_t n_rows{};
-  /*! \brief global index of histogram, which is stored in ELLPACK format. */
-  HostDeviceVector<common::CompressedByteT> gidx_buffer;
+  bst_idx_t n_rows{0};
+  /**
+   * @brief Index of the gradient histogram, which is stored in ELLPACK format.
+   *
+   * This can be backed by various storage types.
+ */ + common::RefResourceView gidx_buffer; private: std::shared_ptr cuts_; diff --git a/src/data/ellpack_page_raw_format.cu b/src/data/ellpack_page_raw_format.cu index 059dd9f213a5..3f23c5d8d3d6 100644 --- a/src/data/ellpack_page_raw_format.cu +++ b/src/data/ellpack_page_raw_format.cu @@ -4,11 +4,12 @@ #include #include // for size_t -#include // for uint64_t +#include // for vector -#include "../common/io.h" // for AlignedResourceReadStream, AlignedFileWriteStream -#include "../common/ref_resource_view.h" // for ReadVec, WriteVec -#include "ellpack_page.cuh" // for EllpackPage +#include "../common/io.h" // for AlignedResourceReadStream, AlignedFileWriteStream +#include "../common/ref_resource_view.cuh" // for MakeFixedVecWithCudaMalloc +#include "../common/ref_resource_view.h" // for ReadVec, WriteVec +#include "ellpack_page.cuh" // for EllpackPage #include "ellpack_page_raw_format.h" #include "ellpack_page_source.h" @@ -16,8 +17,10 @@ namespace xgboost::data { DMLC_REGISTRY_FILE_TAG(ellpack_page_raw_format); namespace { +// Function to support system without HMM or ATS template -[[nodiscard]] bool ReadDeviceVec(common::AlignedResourceReadStream* fi, HostDeviceVector* vec) { +[[nodiscard]] bool ReadDeviceVec(common::AlignedResourceReadStream* fi, + common::RefResourceView* vec) { std::uint64_t n{0}; if (!fi->Read(&n)) { return false; @@ -33,34 +36,34 @@ template return false; } - vec->Resize(n); - auto d_vec = vec->DeviceSpan(); - dh::safe_cuda( - cudaMemcpyAsync(d_vec.data(), ptr, n_bytes, cudaMemcpyDefault, dh::DefaultStream())); + auto ctx = Context{}.MakeCUDA(common::CurrentDevice()); + *vec = common::MakeFixedVecWithCudaMalloc(&ctx, n, static_cast(0)); + dh::safe_cuda(cudaMemcpyAsync(vec->data(), ptr, n_bytes, cudaMemcpyDefault, dh::DefaultStream())); return true; } } // namespace +#define RET_IF_NOT(expr) \ + if (!(expr)) { \ + return false; \ + } + [[nodiscard]] bool EllpackPageRawFormat::Read(EllpackPage* page, common::AlignedResourceReadStream* fi) { auto* impl = page->Impl(); + impl->SetCuts(this->cuts_); - if (!fi->Read(&impl->n_rows)) { - return false; - } - if (!fi->Read(&impl->is_dense)) { - return false; - } - if (!fi->Read(&impl->row_stride)) { - return false; - } - impl->gidx_buffer.SetDevice(device_); - if (!ReadDeviceVec(fi, &impl->gidx_buffer)) { - return false; - } - if (!fi->Read(&impl->base_rowid)) { - return false; + RET_IF_NOT(fi->Read(&impl->n_rows)); + RET_IF_NOT(fi->Read(&impl->is_dense)); + RET_IF_NOT(fi->Read(&impl->row_stride)); + + if (has_hmm_ats_) { + RET_IF_NOT(common::ReadVec(fi, &impl->gidx_buffer)); + } else { + RET_IF_NOT(ReadDeviceVec(fi, &impl->gidx_buffer)); } + RET_IF_NOT(fi->Read(&impl->base_rowid)); + dh::DefaultStream().Sync(); return true; } @@ -71,8 +74,10 @@ template bytes += fo->Write(impl->n_rows); bytes += fo->Write(impl->is_dense); bytes += fo->Write(impl->row_stride); - CHECK(!impl->gidx_buffer.ConstHostVector().empty()); - bytes += common::WriteVec(fo, impl->gidx_buffer.HostVector()); + std::vector h_gidx_buffer; + Context ctx = Context{}.MakeCUDA(common::CurrentDevice()); + [[maybe_unused]] auto h_accessor = impl->GetHostAccessor(&ctx, &h_gidx_buffer); + bytes += common::WriteVec(fo, h_gidx_buffer); bytes += fo->Write(impl->base_rowid); dh::DefaultStream().Sync(); return bytes; @@ -82,33 +87,20 @@ template auto* impl = page->Impl(); CHECK(this->cuts_->cut_values_.DeviceCanRead()); impl->SetCuts(this->cuts_); - if (!fi->Read(&impl->n_rows)) { - return false; - } - if (!fi->Read(&impl->is_dense)) { - return false; - } - if 
(!fi->Read(&impl->row_stride)) { - return false; - } + RET_IF_NOT(fi->Read(&impl->n_rows)); + RET_IF_NOT(fi->Read(&impl->is_dense)); + RET_IF_NOT(fi->Read(&impl->row_stride)); // Read vec + Context ctx = Context{}.MakeCUDA(common::CurrentDevice()); bst_idx_t n{0}; - if (!fi->Read(&n)) { - return false; - } + RET_IF_NOT(fi->Read(&n)); if (n != 0) { - impl->gidx_buffer.SetDevice(device_); - impl->gidx_buffer.Resize(n); - auto span = impl->gidx_buffer.DeviceSpan(); - if (!fi->Read(span.data(), span.size_bytes())) { - return false; - } - } - - if (!fi->Read(&impl->base_rowid)) { - return false; + impl->gidx_buffer = + common::MakeFixedVecWithCudaMalloc(&ctx, n, static_cast(0)); + RET_IF_NOT(fi->Read(impl->gidx_buffer.data(), impl->gidx_buffer.size_bytes())); } + RET_IF_NOT(fi->Read(&impl->base_rowid)); dh::DefaultStream().Sync(); return true; @@ -123,16 +115,17 @@ template bytes += fo->Write(impl->row_stride); // Write vector - bst_idx_t n = impl->gidx_buffer.Size(); + bst_idx_t n = impl->gidx_buffer.size(); bytes += fo->Write(n); - if (!impl->gidx_buffer.Empty()) { - auto span = impl->gidx_buffer.ConstDeviceSpan(); - bytes += fo->Write(span.data(), span.size_bytes()); + if (!impl->gidx_buffer.empty()) { + bytes += fo->Write(impl->gidx_buffer.data(), impl->gidx_buffer.size_bytes()); } bytes += fo->Write(impl->base_rowid); dh::DefaultStream().Sync(); return bytes; } + +#undef RET_IF_NOT } // namespace xgboost::data diff --git a/src/data/ellpack_page_raw_format.h b/src/data/ellpack_page_raw_format.h index 8c3f89f0c0b3..e2761c73f27c 100644 --- a/src/data/ellpack_page_raw_format.h +++ b/src/data/ellpack_page_raw_format.h @@ -26,10 +26,13 @@ class EllpackHostCacheStream; class EllpackPageRawFormat : public SparsePageFormat { std::shared_ptr cuts_; DeviceOrd device_; + // Supports CUDA HMM or ATS + bool has_hmm_ats_{false}; public: - explicit EllpackPageRawFormat(std::shared_ptr cuts, DeviceOrd device) - : cuts_{std::move(cuts)}, device_{device} {} + explicit EllpackPageRawFormat(std::shared_ptr cuts, DeviceOrd device, + bool has_hmm_ats) + : cuts_{std::move(cuts)}, device_{device}, has_hmm_ats_{has_hmm_ats} {} [[nodiscard]] bool Read(EllpackPage* page, common::AlignedResourceReadStream* fi) override; [[nodiscard]] std::size_t Write(const EllpackPage& page, common::AlignedFileWriteStream* fo) override; diff --git a/src/data/ellpack_page_source.cu b/src/data/ellpack_page_source.cu index f53ae3ef1e2f..a70d9150ca47 100644 --- a/src/data/ellpack_page_source.cu +++ b/src/data/ellpack_page_source.cu @@ -11,6 +11,7 @@ #include "../common/common.h" // for safe_cuda #include "../common/cuda_pinned_allocator.h" // for pinned_allocator #include "../common/device_helpers.cuh" // for CUDAStreamView, DefaultStream +#include "../common/resource.cuh" // for PrivateCudaMmapConstStream #include "ellpack_page.cuh" // for EllpackPageImpl #include "ellpack_page.h" // for EllpackPage #include "ellpack_page_source.h" @@ -86,16 +87,16 @@ void EllpackHostCacheStream::Seek(bst_idx_t offset_bytes) { this->p_impl_->Seek( void EllpackHostCacheStream::Bound(bst_idx_t offset_bytes) { this->p_impl_->Bound(offset_bytes); } /** - * EllpackFormatType + * EllpackCacheStreamPolicy */ template typename F> -EllpackFormatStreamPolicy::EllpackFormatStreamPolicy() +EllpackCacheStreamPolicy::EllpackCacheStreamPolicy() : p_cache_{std::make_shared()} {} template typename F> -[[nodiscard]] std::unique_ptr::WriterT> -EllpackFormatStreamPolicy::CreateWriter(StringView, std::uint32_t iter) { +[[nodiscard]] std::unique_ptr::WriterT> 
+EllpackCacheStreamPolicy::CreateWriter(StringView, std::uint32_t iter) { auto fo = std::make_unique(this->p_cache_); if (iter == 0) { CHECK(this->p_cache_->cache.empty()); @@ -106,9 +107,8 @@ EllpackFormatStreamPolicy::CreateWriter(StringView, std::uint32_t iter) { } template typename F> -[[nodiscard]] std::unique_ptr::ReaderT> -EllpackFormatStreamPolicy::CreateReader(StringView, bst_idx_t offset, - bst_idx_t length) const { +[[nodiscard]] std::unique_ptr::ReaderT> +EllpackCacheStreamPolicy::CreateReader(StringView, bst_idx_t offset, bst_idx_t length) const { auto fi = std::make_unique(this->p_cache_); fi->Seek(offset); fi->Bound(offset + length); @@ -117,18 +117,40 @@ EllpackFormatStreamPolicy::CreateReader(StringView, bst_idx_t offset, } // Instantiation -template EllpackFormatStreamPolicy::EllpackFormatStreamPolicy(); +template EllpackCacheStreamPolicy::EllpackCacheStreamPolicy(); template std::unique_ptr< - typename EllpackFormatStreamPolicy::WriterT> -EllpackFormatStreamPolicy::CreateWriter(StringView name, - std::uint32_t iter); + typename EllpackCacheStreamPolicy::WriterT> +EllpackCacheStreamPolicy::CreateWriter(StringView name, + std::uint32_t iter); template std::unique_ptr< - typename EllpackFormatStreamPolicy::ReaderT> -EllpackFormatStreamPolicy::CreateReader( + typename EllpackCacheStreamPolicy::ReaderT> +EllpackCacheStreamPolicy::CreateReader( StringView name, std::uint64_t offset, std::uint64_t length) const; +/** + * EllpackMmapStreamPolicy + */ + +template typename F> +[[nodiscard]] std::unique_ptr::ReaderT> +EllpackMmapStreamPolicy::CreateReader(StringView name, bst_idx_t offset, + bst_idx_t length) const { + if (has_hmm_) { + return std::make_unique(name, offset, length); + } else { + return std::make_unique(name, offset, length); + } +} + +// Instantiation +template std::unique_ptr< + typename EllpackMmapStreamPolicy::ReaderT> +EllpackMmapStreamPolicy::CreateReader(StringView name, + bst_idx_t offset, + bst_idx_t length) const; + /** * EllpackPageSourceImpl */ @@ -146,8 +168,8 @@ void EllpackPageSourceImpl::Fetch() { auto const& csr = this->source_->Page(); this->page_.reset(new EllpackPage{}); auto* impl = this->page_->Impl(); - *impl = EllpackPageImpl{this->Device(), this->GetCuts(), *csr, - is_dense_, row_stride_, feature_types_}; + Context ctx = Context{}.MakeCUDA(this->Device().ordinal); + *impl = EllpackPageImpl{&ctx, this->GetCuts(), *csr, is_dense_, row_stride_, feature_types_}; this->page_->SetBaseRowId(csr->base_rowid); this->WriteCache(); } @@ -157,5 +179,7 @@ void EllpackPageSourceImpl::Fetch() { template void EllpackPageSourceImpl>::Fetch(); template void -EllpackPageSourceImpl>::Fetch(); +EllpackPageSourceImpl>::Fetch(); +template void +EllpackPageSourceImpl>::Fetch(); } // namespace xgboost::data diff --git a/src/data/ellpack_page_source.h b/src/data/ellpack_page_source.h index 7f50899b974d..1436f9151cf1 100644 --- a/src/data/ellpack_page_source.h +++ b/src/data/ellpack_page_source.h @@ -9,6 +9,7 @@ #include // for shared_ptr #include // for move +#include "../common/cuda_rt_utils.h" // for SupportsPageableMem #include "../common/hist_util.h" // for HistogramCuts #include "ellpack_page.h" // for EllpackPage #include "ellpack_page_raw_format.h" // for EllpackPageRawFormat @@ -59,14 +60,19 @@ template class EllpackFormatPolicy { std::shared_ptr cuts_{nullptr}; DeviceOrd device_; + bool has_hmm_{common::SupportsPageableMem()}; public: using FormatT = EllpackPageRawFormat; public: + EllpackFormatPolicy() = default; + // For testing with the HMM flag. 
+ explicit EllpackFormatPolicy(bool has_hmm) : has_hmm_{has_hmm} {} + [[nodiscard]] auto CreatePageFormat() const { CHECK_EQ(cuts_->cut_values_.Device(), device_); - std::unique_ptr fmt{new EllpackPageRawFormat{cuts_, device_}}; + std::unique_ptr fmt{new EllpackPageRawFormat{cuts_, device_, has_hmm_}}; return fmt; } @@ -83,7 +89,7 @@ class EllpackFormatPolicy { }; template typename F> -class EllpackFormatStreamPolicy : public F { +class EllpackCacheStreamPolicy : public F { std::shared_ptr p_cache_; public: @@ -91,13 +97,42 @@ class EllpackFormatStreamPolicy : public F { using ReaderT = EllpackHostCacheStream; public: - EllpackFormatStreamPolicy(); + EllpackCacheStreamPolicy(); [[nodiscard]] std::unique_ptr CreateWriter(StringView name, std::uint32_t iter); [[nodiscard]] std::unique_ptr CreateReader(StringView name, bst_idx_t offset, bst_idx_t length) const; }; +template typename F> +class EllpackMmapStreamPolicy : public F { + bool has_hmm_{common::SupportsPageableMem()}; + + public: + using WriterT = common::AlignedFileWriteStream; + using ReaderT = common::AlignedResourceReadStream; + + public: + EllpackMmapStreamPolicy() = default; + // For testing with the HMM flag. + template < + typename std::enable_if_t, EllpackFormatPolicy>>* = nullptr> + explicit EllpackMmapStreamPolicy(bool has_hmm) : F{has_hmm}, has_hmm_{has_hmm} {} + + [[nodiscard]] std::unique_ptr CreateWriter(StringView name, std::uint32_t iter) { + std::unique_ptr fo; + if (iter == 0) { + fo = std::make_unique(name, "wb"); + } else { + fo = std::make_unique(name, "ab"); + } + return fo; + } + + [[nodiscard]] std::unique_ptr CreateReader(StringView name, bst_idx_t offset, + bst_idx_t length) const; +}; + template class EllpackPageSourceImpl : public PageSourceIncMixIn { using Super = PageSourceIncMixIn; @@ -128,11 +163,11 @@ class EllpackPageSourceImpl : public PageSourceIncMixIn { // Cache to host using EllpackPageHostSource = - EllpackPageSourceImpl>; + EllpackPageSourceImpl>; // Cache to disk using EllpackPageSource = - EllpackPageSourceImpl>; + EllpackPageSourceImpl>; #if !defined(XGBOOST_USE_CUDA) template diff --git a/src/data/gradient_index.cu b/src/data/gradient_index.cu index 42018eab47e3..f8c8f8d48970 100644 --- a/src/data/gradient_index.cu +++ b/src/data/gradient_index.cu @@ -16,7 +16,8 @@ template void SetIndexData(Context const* ctx, EllpackPageImpl const* page, std::vector* p_hit_count_tloc, CompressOffset&& get_offset, GHistIndexMatrix* out) { - auto accessor = page->GetHostAccessor(); + std::vector h_gidx_buffer; + auto accessor = page->GetHostAccessor(ctx, &h_gidx_buffer); auto const kNull = static_cast(accessor.NullValue()); common::Span index_data_span = {out->index.data(), out->index.Size()}; @@ -47,7 +48,8 @@ void GetRowPtrFromEllpack(Context const* ctx, EllpackPageImpl const* page, if (page->is_dense) { std::fill(row_ptr.begin() + 1, row_ptr.end(), page->row_stride); } else { - auto accessor = page->GetHostAccessor(); + std::vector h_gidx_buffer; + auto accessor = page->GetHostAccessor(ctx, &h_gidx_buffer); auto const kNull = static_cast(accessor.NullValue()); common::ParallelFor(page->Size(), ctx->Threads(), [&](auto i) { diff --git a/src/data/histogram_cut_format.h b/src/data/histogram_cut_format.h deleted file mode 100644 index d4eb81ad2849..000000000000 --- a/src/data/histogram_cut_format.h +++ /dev/null @@ -1,49 +0,0 @@ -/** - * Copyright 2021-2024, XGBoost contributors - */ -#ifndef XGBOOST_DATA_HISTOGRAM_CUT_FORMAT_H_ -#define XGBOOST_DATA_HISTOGRAM_CUT_FORMAT_H_ - -#include // for Stream - 
-#include // for size_t - -#include "../common/hist_util.h" // for HistogramCuts -#include "../common/io.h" // for AlignedResourceReadStream, AlignedFileWriteStream -#include "../common/ref_resource_view.h" // for WriteVec, ReadVec - -namespace xgboost::data { -inline bool ReadHistogramCuts(common::HistogramCuts *cuts, common::AlignedResourceReadStream *fi) { - if (!common::ReadVec(fi, &cuts->cut_values_.HostVector())) { - return false; - } - if (!common::ReadVec(fi, &cuts->cut_ptrs_.HostVector())) { - return false; - } - if (!common::ReadVec(fi, &cuts->min_vals_.HostVector())) { - return false; - } - bool has_cat{false}; - if (!fi->Read(&has_cat)) { - return false; - } - decltype(cuts->MaxCategory()) max_cat{0}; - if (!fi->Read(&max_cat)) { - return false; - } - cuts->SetCategorical(has_cat, max_cat); - return true; -} - -inline std::size_t WriteHistogramCuts(common::HistogramCuts const &cuts, - common::AlignedFileWriteStream *fo) { - std::size_t bytes = 0; - bytes += common::WriteVec(fo, cuts.Values()); - bytes += common::WriteVec(fo, cuts.Ptrs()); - bytes += common::WriteVec(fo, cuts.MinValues()); - bytes += fo->Write(cuts.HasCategorical()); - bytes += fo->Write(cuts.MaxCategory()); - return bytes; -} -} // namespace xgboost::data -#endif // XGBOOST_DATA_HISTOGRAM_CUT_FORMAT_H_ diff --git a/src/data/iterative_dmatrix.cu b/src/data/iterative_dmatrix.cu index 2e8da2c7e7ed..0cb32c5aa107 100644 --- a/src/data/iterative_dmatrix.cu +++ b/src/data/iterative_dmatrix.cu @@ -5,6 +5,7 @@ #include #include "../collective/allreduce.h" +#include "../common/cuda_rt_utils.h" // for AllVisibleGPUs #include "../common/hist_util.cuh" #include "batch_utils.h" // for RegenGHist #include "device_adapter.cuh" @@ -45,11 +46,17 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p, int32_t current_device; dh::safe_cuda(cudaGetDevice(¤t_device)); + auto get_ctx = [&]() { + Context d_ctx = (ctx->IsCUDA()) ? *ctx : Context{}.MakeCUDA(current_device); + CHECK(!d_ctx.IsCPU()); + return d_ctx; + }; auto get_device = [&]() { auto d = (ctx->IsCUDA()) ? ctx->Device() : DeviceOrd::CUDA(current_device); CHECK(!d.IsCPU()); return d; }; + fmat_ctx_ = get_ctx(); /** * Generate quantiles @@ -118,7 +125,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p, // that case device id is invalid. 
ellpack_.reset(new EllpackPage); *(ellpack_->Impl()) = - EllpackPageImpl(get_device(), cuts, this->IsDense(), row_stride, accumulated_rows); + EllpackPageImpl(&fmat_ctx_, cuts, this->IsDense(), row_stride, accumulated_rows); } }; @@ -142,10 +149,10 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p, proxy->Info().feature_types.SetDevice(get_device()); auto d_feature_types = proxy->Info().feature_types.ConstDeviceSpan(); auto new_impl = cuda_impl::Dispatch(proxy, [&](auto const& value) { - return EllpackPageImpl(value, missing, get_device(), is_dense, row_counts_span, - d_feature_types, row_stride, rows, cuts); + return EllpackPageImpl(&fmat_ctx_, value, missing, is_dense, row_counts_span, d_feature_types, + row_stride, rows, cuts); }); - size_t num_elements = ellpack_->Impl()->Copy(get_device(), &new_impl, offset); + std::size_t num_elements = ellpack_->Impl()->Copy(&fmat_ctx_, &new_impl, offset); offset += num_elements; proxy->Info().num_row_ = num_rows(); diff --git a/src/data/sparse_page_source.h b/src/data/sparse_page_source.h index 550631b72dc5..62b39886ed62 100644 --- a/src/data/sparse_page_source.h +++ b/src/data/sparse_page_source.h @@ -226,7 +226,7 @@ class SparsePageSourceImpl : public BatchIteratorImpl, public FormatStreamPol } // An heuristic for number of pre-fetched batches. We can make it part of BatchParam // to let user adjust number of pre-fetched batches when needed. - std::int32_t kPrefetches = 3; + std::int32_t constexpr kPrefetches = 3; std::int32_t n_prefetches = std::min(nthreads_, kPrefetches); n_prefetches = std::max(n_prefetches, 1); std::int32_t n_prefetch_batches = std::min(static_cast(n_prefetches), n_batches_); diff --git a/src/gbm/gblinear.cc b/src/gbm/gblinear.cc index 71905debcb3a..2d288fa9d025 100644 --- a/src/gbm/gblinear.cc +++ b/src/gbm/gblinear.cc @@ -10,12 +10,12 @@ #include #include -#include #include #include #include "../common/common.h" -#include "../common/error_msg.h" // NoCategorical, DeprecatedFunc +#include "../common/cuda_rt_utils.h" // for AllVisibleGPUs +#include "../common/error_msg.h" // NoCategorical, DeprecatedFunc #include "../common/threading_utils.h" #include "../common/timer.h" #include "gblinear_model.h" diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc index 9ff4abb4d00a..26c768fafea7 100644 --- a/src/gbm/gbtree.cc +++ b/src/gbm/gbtree.cc @@ -1,5 +1,5 @@ /** - * Copyright 2014-2023 by Contributors + * Copyright 2014-2024, XGBoost Contributors * \file gbtree.cc * \brief gradient boosted tree implementation. * \author Tianqi Chen @@ -10,14 +10,14 @@ #include #include // for equal -#include // for uint32_t -#include +#include // for uint32_t #include #include #include #include #include "../common/common.h" +#include "../common/cuda_rt_utils.h" // for AllVisibleGPUs #include "../common/error_msg.h" // for UnknownDevice, WarnOldSerialization, InplacePredictProxy #include "../common/random.h" #include "../common/threading_utils.h" @@ -347,7 +347,7 @@ void GBTree::LoadConfig(Json const& in) { // This would cause all trees to be pushed to trees_to_update // e.g. 
updating a model, then saving and loading it would result in an empty model tparam_.process_type = TreeProcessType::kDefault; - std::int32_t const n_gpus = xgboost::common::AllVisibleGPUs(); + std::int32_t const n_gpus = common::AllVisibleGPUs(); auto msg = StringView{ R"( diff --git a/src/learner.cc b/src/learner.cc index 93db7f801407..542bf1dc6279 100644 --- a/src/learner.cc +++ b/src/learner.cc @@ -1321,7 +1321,7 @@ class LearnerImpl : public LearnerIO { std::ostringstream os; os.precision(std::numeric_limits::max_digits10); os << '[' << iter << ']' << std::setiosflags(std::ios::fixed); - if (metrics_.empty() && tparam_.disable_default_eval_metric <= 0) { + if (metrics_.empty() && !tparam_.disable_default_eval_metric) { metrics_.emplace_back(Metric::Create(obj_->DefaultEvalMetric(), &ctx_)); auto config = obj_->DefaultMetricConfig(); if (!IsA(config)) { diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu index 29fb6bb6a162..fe46e19ec63b 100644 --- a/src/predictor/gpu_predictor.cu +++ b/src/predictor/gpu_predictor.cu @@ -16,6 +16,7 @@ #include "../common/categorical.h" #include "../common/common.h" #include "../common/cuda_context.cuh" // for CUDAContext +#include "../common/cuda_rt_utils.h" // for AllVisibleGPUs #include "../common/device_helpers.cuh" #include "../common/error_msg.h" // for InplacePredictProxy #include "../data/device_adapter.cuh" diff --git a/src/tree/fit_stump.cu b/src/tree/fit_stump.cu index dd71465df1cc..4f1f994a6f38 100644 --- a/src/tree/fit_stump.cu +++ b/src/tree/fit_stump.cu @@ -3,9 +3,6 @@ * * @brief Utilities for estimating initial score. */ -#if !defined(NOMINMAX) && defined(_WIN32) -#define NOMINMAX -#endif // !defined(NOMINMAX) #include // cuda::par #include // thrust::make_counting_iterator diff --git a/src/tree/fit_stump.h b/src/tree/fit_stump.h index 2af779f77c46..ab947a659464 100644 --- a/src/tree/fit_stump.h +++ b/src/tree/fit_stump.h @@ -1,5 +1,5 @@ /** - * Copyright 2022 by XGBoost Contributors + * Copyright 2022-2024, XGBoost Contributors * * \brief Utilities for estimating initial score. */ @@ -7,18 +7,12 @@ #ifndef XGBOOST_TREE_FIT_STUMP_H_ #define XGBOOST_TREE_FIT_STUMP_H_ -#if !defined(NOMINMAX) && defined(_WIN32) -#define NOMINMAX -#endif // !defined(NOMINMAX) - #include // std::max -#include "../common/common.h" // AssertGPUSupport -#include "xgboost/base.h" // GradientPair -#include "xgboost/context.h" // Context -#include "xgboost/data.h" // MetaInfo -#include "xgboost/host_device_vector.h" // HostDeviceVector -#include "xgboost/linalg.h" // TensorView +#include "xgboost/base.h" // GradientPair +#include "xgboost/context.h" // Context +#include "xgboost/data.h" // MetaInfo +#include "xgboost/linalg.h" // TensorView namespace xgboost { namespace tree { diff --git a/src/tree/gpu_hist/gradient_based_sampler.cu b/src/tree/gpu_hist/gradient_based_sampler.cu index d2031ca21898..3235e9ec3ec1 100644 --- a/src/tree/gpu_hist/gradient_based_sampler.cu +++ b/src/tree/gpu_hist/gradient_based_sampler.cu @@ -163,14 +163,14 @@ GradientBasedSample ExternalMemoryNoSampling::Sample(Context const* ctx, if (!page_concatenated_) { // Concatenate all the external memory ELLPACK pages into a single in-memory page. 
page_.reset(nullptr); - size_t offset = 0; + bst_idx_t offset = 0; for (auto& batch : dmat->GetBatches(ctx, batch_param_)) { auto page = batch.Impl(); if (!page_) { - page_ = std::make_unique(ctx->Device(), page->CutsShared(), page->is_dense, + page_ = std::make_unique(ctx, page->CutsShared(), page->is_dense, page->row_stride, dmat->Info().num_row_); } - size_t num_elements = page_->Copy(ctx->Device(), page, offset); + bst_idx_t num_elements = page_->Copy(ctx, page, offset); offset += num_elements; } page_concatenated_ = true; @@ -228,11 +228,11 @@ GradientBasedSample ExternalMemoryUniformSampling::Sample(Context const* ctx, auto first_page = (*batch_iterator.begin()).Impl(); // Create a new ELLPACK page with empty rows. page_.reset(); // Release the device memory first before reallocating - page_.reset(new EllpackPageImpl(ctx->Device(), first_page->CutsShared(), first_page->is_dense, + page_.reset(new EllpackPageImpl(ctx, first_page->CutsShared(), first_page->is_dense, first_page->row_stride, sample_rows)); // Compact the ELLPACK pages into the single sample page. - thrust::fill(cuctx->CTP(), dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0); + thrust::fill(cuctx->CTP(), page_->gidx_buffer.begin(), page_->gidx_buffer.end(), 0); for (auto& batch : batch_iterator) { page_->Compact(ctx, batch.Impl(), dh::ToSpan(sample_row_index_)); } @@ -283,10 +283,10 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(Context const* c // Perform Poisson sampling in place. thrust::transform(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair), thrust::counting_iterator(0), dh::tbegin(gpair), - PoissonSampling(dh::ToSpan(threshold_), threshold_index, - RandomWeight(common::GlobalRandom()()))); + PoissonSampling{dh::ToSpan(threshold_), threshold_index, + RandomWeight(common::GlobalRandom()())}); // Count the sampled rows. - size_t sample_rows = + bst_idx_t sample_rows = thrust::count_if(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair), IsNonZero()); // Compact gradient pairs. gpair_.resize(sample_rows); @@ -302,10 +302,10 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(Context const* c auto first_page = (*batch_iterator.begin()).Impl(); // Create a new ELLPACK page with empty rows. page_.reset(); // Release the device memory first before reallocating - page_.reset(new EllpackPageImpl(ctx->Device(), first_page->CutsShared(), dmat->IsDense(), - first_page->row_stride, sample_rows)); + page_.reset(new EllpackPageImpl{ctx, first_page->CutsShared(), dmat->IsDense(), + first_page->row_stride, sample_rows}); // Compact the ELLPACK pages into the single sample page. 
- thrust::fill(cuctx->CTP(), dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0); + thrust::fill(cuctx->CTP(), page_->gidx_buffer.begin(), page_->gidx_buffer.end(), 0); for (auto& batch : batch_iterator) { page_->Compact(ctx, batch.Impl(), dh::ToSpan(sample_row_index_)); } diff --git a/src/tree/gpu_hist/gradient_based_sampler.cuh b/src/tree/gpu_hist/gradient_based_sampler.cuh index 5a57e2ae8ef8..79008b1ae572 100644 --- a/src/tree/gpu_hist/gradient_based_sampler.cuh +++ b/src/tree/gpu_hist/gradient_based_sampler.cuh @@ -1,20 +1,19 @@ /** - * Copyright 2019-2023, XGBoost Contributors + * Copyright 2019-2024, XGBoost Contributors */ #pragma once -#include -#include -#include +#include // for size_t -#include "../../common/device_helpers.cuh" -#include "../../data/ellpack_page.cuh" - -namespace xgboost { -namespace tree { +#include "../../common/device_vector.cuh" // for device_vector, caching_device_vector +#include "../../data/ellpack_page.cuh" // for EllpackPageImpl +#include "xgboost/base.h" // for GradientPair +#include "xgboost/data.h" // for BatchParam +#include "xgboost/span.h" // for Span +namespace xgboost::tree { struct GradientBasedSample { /*!\brief Number of sampled rows. */ - size_t sample_rows; + std::size_t sample_rows; /*!\brief Sampled rows in ELLPACK format. */ EllpackPageImpl const* page; /*!\brief Gradient pairs for the sampled rows. */ @@ -137,5 +136,4 @@ class GradientBasedSampler { common::Monitor monitor_; std::unique_ptr strategy_; }; -}; // namespace tree -}; // namespace xgboost +}; // namespace xgboost::tree diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 19957857218d..7d566c3b40ae 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -16,7 +16,8 @@ #include "../collective/broadcast.h" #include "../common/bitfield.h" #include "../common/categorical.h" -#include "../common/cuda_context.cuh" // CUDAContext +#include "../common/cuda_context.cuh" // for CUDAContext +#include "../common/cuda_rt_utils.h" // for CheckComputeCapability #include "../common/device_helpers.cuh" #include "../common/hist_util.h" #include "../common/random.h" // for ColumnSampler, GlobalRandom @@ -826,7 +827,7 @@ class GPUHistMaker : public TreeUpdater { // Used in test to count how many configurations are performed LOG(DEBUG) << "[GPU Hist]: Configure"; hist_maker_param_.UpdateAllowUnknown(args); - dh::CheckComputeCapability(); + common::CheckComputeCapability(); initialised_ = false; monitor_.Init("updater_gpu_hist"); @@ -852,17 +853,13 @@ class GPUHistMaker : public TreeUpdater { CHECK_EQ(gpair->Shape(1), 1) << MTNotImplemented(); auto gpair_hdv = gpair->Data(); // build tree - try { - std::size_t t_idx{0}; - for (xgboost::RegTree* tree : trees) { - this->UpdateTree(param, gpair_hdv, dmat, tree, &out_position[t_idx]); - this->hist_maker_param_.CheckTreesSynchronized(ctx_, tree); - ++t_idx; - } - dh::safe_cuda(cudaGetLastError()); - } catch (const std::exception& e) { - LOG(FATAL) << "Exception in gpu_hist: " << e.what() << std::endl; + std::size_t t_idx{0}; + for (xgboost::RegTree* tree : trees) { + this->UpdateTree(param, gpair_hdv, dmat, tree, &out_position[t_idx]); + this->hist_maker_param_.CheckTreesSynchronized(ctx_, tree); + ++t_idx; } + dh::safe_cuda(cudaGetLastError()); monitor_.Stop("Update"); } @@ -958,7 +955,7 @@ class GPUGlobalApproxMaker : public TreeUpdater { if (hist_maker_param_.max_cached_hist_node != HistMakerTrainParam::DefaultNodes()) { LOG(WARNING) << "The `max_cached_hist_node` is ignored in GPU."; } 
- dh::CheckComputeCapability(); + common::CheckComputeCapability(); initialised_ = false; monitor_.Init(this->Name()); diff --git a/tests/cpp/collective/test_worker.h b/tests/cpp/collective/test_worker.h index 66c6ce9bf24e..2430911904e8 100644 --- a/tests/cpp/collective/test_worker.h +++ b/tests/cpp/collective/test_worker.h @@ -15,7 +15,7 @@ #include "../../../src/collective/comm.h" #include "../../../src/collective/communicator-inl.h" // for Init, Finalize #include "../../../src/collective/tracker.h" // for GetHostAddress -#include "../../../src/common/common.h" // for AllVisibleGPUs +#include "../../../src/common/cuda_rt_utils.h" // for AllVisibleGPUs #include "../helpers.h" // for FileExists #if defined(XGBOOST_USE_FEDERATED) diff --git a/tests/cpp/common/test_host_device_vector.cu b/tests/cpp/common/test_host_device_vector.cu index a0aa5fa11fce..c730390c37d8 100644 --- a/tests/cpp/common/test_host_device_vector.cu +++ b/tests/cpp/common/test_host_device_vector.cu @@ -4,9 +4,10 @@ #include #include #include +#include +#include "../../../src/common/cuda_rt_utils.h" // for SetDevice #include "../../../src/common/device_helpers.cuh" -#include namespace xgboost::common { namespace { diff --git a/tests/cpp/common/test_ref_resource_view.cc b/tests/cpp/common/test_ref_resource_view.cc index 9ae55fdec7f4..b201f69139d3 100644 --- a/tests/cpp/common/test_ref_resource_view.cc +++ b/tests/cpp/common/test_ref_resource_view.cc @@ -1,5 +1,5 @@ /** - * Copyright 2023, XGBoost Contributors + * Copyright 2023-2024, XGBoost Contributors */ #include @@ -16,17 +16,16 @@ TEST(RefResourceView, Basic) { std::size_t n_bytes = 1024; auto mem = std::make_shared(n_bytes); { - RefResourceView view{reinterpret_cast(mem->Data()), mem->Size() / sizeof(float), mem}; + RefResourceView view{static_cast(mem->Data()), mem->Size() / sizeof(float), mem}; - RefResourceView kview{reinterpret_cast(mem->Data()), mem->Size() / sizeof(float), - mem}; + RefResourceView kview{static_cast(mem->Data()), mem->Size() / sizeof(float), mem}; ASSERT_EQ(mem.use_count(), 3); ASSERT_EQ(view.size(), n_bytes / sizeof(1024)); ASSERT_EQ(kview.size(), n_bytes / sizeof(1024)); } { - RefResourceView view{reinterpret_cast(mem->Data()), mem->Size() / sizeof(float), mem, - 1.5f}; + RefResourceView view{static_cast(mem->Data()), mem->Size() / sizeof(float), mem}; + std::fill_n(static_cast(mem->Data()), mem->Size() / sizeof(float), 1.5f); for (auto v : view) { ASSERT_EQ(v, 1.5f); } diff --git a/tests/cpp/data/test_ellpack_page.cu b/tests/cpp/data/test_ellpack_page.cu index 9d9687dda81b..8aab51b7202e 100644 --- a/tests/cpp/data/test_ellpack_page.cu +++ b/tests/cpp/data/test_ellpack_page.cu @@ -27,15 +27,15 @@ TEST(EllpackPage, EmptyDMatrix) { auto impl = page.Impl(); ASSERT_EQ(impl->row_stride, 0); ASSERT_EQ(impl->Cuts().TotalBins(), 0); - ASSERT_EQ(impl->gidx_buffer.Size(), 4); + ASSERT_EQ(impl->gidx_buffer.size(), 4); } TEST(EllpackPage, BuildGidxDense) { int constexpr kNRows = 16, kNCols = 8; - auto page = BuildEllpackPage(kNRows, kNCols); - - std::vector h_gidx_buffer(page->gidx_buffer.HostVector()); - common::CompressedIterator gidx(h_gidx_buffer.data(), page->NumSymbols()); + auto ctx = MakeCUDACtx(0); + auto page = BuildEllpackPage(&ctx, kNRows, kNCols); + std::vector h_gidx_buffer; + auto h_accessor = page->GetHostAccessor(&ctx, &h_gidx_buffer); ASSERT_EQ(page->row_stride, kNCols); @@ -58,16 +58,17 @@ TEST(EllpackPage, BuildGidxDense) { 1, 4, 7, 10, 14, 16, 19, 21, }; for (size_t i = 0; i < kNRows * kNCols; ++i) { - ASSERT_EQ(solution[i], 
gidx[i]); + ASSERT_EQ(solution[i], h_accessor.gidx_iter[i]); } } TEST(EllpackPage, BuildGidxSparse) { int constexpr kNRows = 16, kNCols = 8; - auto page = BuildEllpackPage(kNRows, kNCols, 0.9f); + auto ctx = MakeCUDACtx(0); + auto page = BuildEllpackPage(&ctx, kNRows, kNCols, 0.9f); - std::vector h_gidx_buffer(page->gidx_buffer.HostVector()); - common::CompressedIterator gidx(h_gidx_buffer.data(), 25); + std::vector h_gidx_buffer; + auto h_accessor = page->GetHostAccessor(&ctx, &h_gidx_buffer); ASSERT_LE(page->row_stride, 3); @@ -78,7 +79,7 @@ TEST(EllpackPage, BuildGidxSparse) { 24, 7, 14, 16, 4, 24, 24, 24, 24, 24, 9, 24, 24, 1, 24, 24 }; for (size_t i = 0; i < kNRows * page->row_stride; ++i) { - ASSERT_EQ(solution[i], gidx[i]); + ASSERT_EQ(solution[i], h_accessor.gidx_iter[i]); } } @@ -94,7 +95,7 @@ TEST(EllpackPage, FromCategoricalBasic) { Context ctx{MakeCUDACtx(0)}; auto p = BatchParam{max_bins, tree::TrainParam::DftSparseThreshold()}; auto ellpack = EllpackPage(&ctx, m.get(), p); - auto accessor = ellpack.Impl()->GetDeviceAccessor(FstCU()); + auto accessor = ellpack.Impl()->GetDeviceAccessor(ctx.Device()); ASSERT_EQ(kCats, accessor.NumBins()); auto x_copy = x; @@ -110,13 +111,11 @@ TEST(EllpackPage, FromCategoricalBasic) { ASSERT_EQ(h_cuts_ptr.size(), 2); ASSERT_EQ(h_cuts_values.size(), kCats); - std::vector const &h_gidx_buffer = - ellpack.Impl()->gidx_buffer.HostVector(); - auto h_gidx_iter = common::CompressedIterator( - h_gidx_buffer.data(), accessor.NumSymbols()); + std::vector h_gidx_buffer; + auto h_accessor = ellpack.Impl()->GetHostAccessor(&ctx, &h_gidx_buffer); for (size_t i = 0; i < x.size(); ++i) { - auto bin = h_gidx_iter[i]; + auto bin = h_accessor.gidx_iter[i]; auto bin_value = h_cuts_values.at(bin); ASSERT_EQ(AsCat(x[i]), AsCat(bin_value)); } @@ -152,12 +151,12 @@ TEST(EllpackPage, Copy) { auto page = (*dmat->GetBatches(&ctx, param).begin()).Impl(); // Create an empty result page. - EllpackPageImpl result(FstCU(), page->CutsShared(), page->is_dense, page->row_stride, kRows); + EllpackPageImpl result(&ctx, page->CutsShared(), page->is_dense, page->row_stride, kRows); // Copy batch pages into the result page. size_t offset = 0; for (auto& batch : dmat->GetBatches(&ctx, param)) { - size_t num_elements = result.Copy(FstCU(), batch.Impl(), offset); + size_t num_elements = result.Copy(&ctx, batch.Impl(), offset); offset += num_elements; } @@ -171,11 +170,11 @@ TEST(EllpackPage, Copy) { EXPECT_EQ(impl->base_rowid, current_row); for (size_t i = 0; i < impl->Size(); i++) { - dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(FstCU()), current_row, + dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(ctx.Device()), current_row, row_d.data().get())); thrust::copy(row_d.begin(), row_d.end(), row.begin()); - dh::LaunchN(kCols, ReadRowFunction(result.GetDeviceAccessor(FstCU()), current_row, + dh::LaunchN(kCols, ReadRowFunction(result.GetDeviceAccessor(ctx.Device()), current_row, row_result_d.data().get())); thrust::copy(row_result_d.begin(), row_result_d.end(), row_result.begin()); @@ -200,7 +199,7 @@ TEST(EllpackPage, Compact) { auto page = (*dmat->GetBatches(&ctx, param).begin()).Impl(); // Create an empty result page. - EllpackPageImpl result(ctx.Device(), page->CutsShared(), page->is_dense, page->row_stride, + EllpackPageImpl result(&ctx, page->CutsShared(), page->is_dense, page->row_stride, kCompactedRows); // Compact batch pages into the result page. 
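The interface change running through these hunks replaces the DeviceOrd argument of the EllpackPageImpl constructor and Copy() with a Context pointer. For reference, a minimal sketch of the concatenation idiom the surrounding tests exercise, assembled from calls that appear in this series (ConcatEllpack is a hypothetical helper name; dmat and param stand for any DMatrix with external-memory pages and a valid BatchParam; not a verbatim excerpt from the patches):

    // Sketch only: build one in-memory ELLPACK page from all external-memory
    // batches by appending each batch at a running offset. Assumes xgboost's
    // internal headers (e.g. data/ellpack_page.cuh) and the MakeCUDACtx test helper.
    void ConcatEllpack(DMatrix* dmat, BatchParam const& param) {
      Context ctx{MakeCUDACtx(0)};
      auto first = (*dmat->GetBatches<EllpackPage>(&ctx, param).begin()).Impl();
      // Empty page sized for every row; cuts and row stride come from the first batch.
      EllpackPageImpl result(&ctx, first->CutsShared(), first->is_dense,
                             first->row_stride, dmat->Info().num_row_);
      bst_idx_t offset = 0;
      for (auto& batch : dmat->GetBatches<EllpackPage>(&ctx, param)) {
        offset += result.Copy(&ctx, batch.Impl(), offset);  // Copy returns elements written
      }
    }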
@@ -229,14 +228,13 @@ TEST(EllpackPage, Compact) { continue; } - dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(FstCU()), - current_row, row_d.data().get())); + dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(ctx.Device()), current_row, + row_d.data().get())); dh::safe_cuda(cudaDeviceSynchronize()); thrust::copy(row_d.begin(), row_d.end(), row.begin()); - dh::LaunchN(kCols, - ReadRowFunction(result.GetDeviceAccessor(FstCU()), compacted_row, - row_result_d.data().get())); + dh::LaunchN(kCols, ReadRowFunction(result.GetDeviceAccessor(ctx.Device()), compacted_row, + row_result_d.data().get())); thrust::copy(row_result_d.begin(), row_result_d.end(), row_result.begin()); EXPECT_EQ(row, row_result); @@ -269,16 +267,13 @@ class EllpackPageTest : public testing::TestWithParam { ASSERT_EQ(from_sparse_page->base_rowid, 0); ASSERT_EQ(from_sparse_page->base_rowid, from_ghist->base_rowid); ASSERT_EQ(from_sparse_page->n_rows, from_ghist->n_rows); - ASSERT_EQ(from_sparse_page->gidx_buffer.Size(), from_ghist->gidx_buffer.Size()); - auto const& h_gidx_from_sparse = from_sparse_page->gidx_buffer.HostVector(); - auto const& h_gidx_from_ghist = from_ghist->gidx_buffer.HostVector(); + ASSERT_EQ(from_sparse_page->gidx_buffer.size(), from_ghist->gidx_buffer.size()); + std::vector h_gidx_from_sparse, h_gidx_from_ghist; + auto from_ghist_acc = from_ghist->GetHostAccessor(&gpu_ctx, &h_gidx_from_ghist); + auto from_sparse_acc = from_sparse_page->GetHostAccessor(&gpu_ctx, &h_gidx_from_sparse); ASSERT_EQ(from_sparse_page->NumSymbols(), from_ghist->NumSymbols()); - common::CompressedIterator from_ghist_it(h_gidx_from_ghist.data(), - from_ghist->NumSymbols()); - common::CompressedIterator from_sparse_it(h_gidx_from_sparse.data(), - from_sparse_page->NumSymbols()); for (size_t i = 0; i < from_ghist->n_rows * from_ghist->row_stride; ++i) { - EXPECT_EQ(from_ghist_it[i], from_sparse_it[i]); + EXPECT_EQ(from_ghist_acc.gidx_iter[i], from_sparse_acc.gidx_iter[i]); } } } diff --git a/tests/cpp/data/test_ellpack_page_raw_format.cu b/tests/cpp/data/test_ellpack_page_raw_format.cu index d5ff721f869a..b7bb5f902c6c 100644 --- a/tests/cpp/data/test_ellpack_page_raw_format.cu +++ b/tests/cpp/data/test_ellpack_page_raw_format.cu @@ -14,9 +14,8 @@ namespace xgboost::data { namespace { template -void TestEllpackPageRawFormat() { - FormatStreamPolicy policy; - +void TestEllpackPageRawFormat(FormatStreamPolicy *p_policy) { + auto &policy = *p_policy; Context ctx{MakeCUDACtx(0)}; auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()}; @@ -55,16 +54,30 @@ void TestEllpackPageRawFormat() { ASSERT_EQ(loaded->Cuts().Values(), orig->Cuts().Values()); ASSERT_EQ(loaded->base_rowid, orig->base_rowid); ASSERT_EQ(loaded->row_stride, orig->row_stride); - ASSERT_EQ(loaded->gidx_buffer.HostVector(), orig->gidx_buffer.HostVector()); + std::vector h_loaded, h_orig; + [[maybe_unused]] auto h_loaded_acc = loaded->GetHostAccessor(&ctx, &h_loaded); + [[maybe_unused]] auto h_orig_acc = orig->GetHostAccessor(&ctx, &h_orig); + ASSERT_EQ(h_loaded, h_orig); } } } // anonymous namespace TEST(EllpackPageRawFormat, DiskIO) { - TestEllpackPageRawFormat>(); + EllpackMmapStreamPolicy policy{false}; + TestEllpackPageRawFormat(&policy); +} + +TEST(EllpackPageRawFormat, DiskIOHmm) { + if (common::SupportsPageableMem()) { + EllpackMmapStreamPolicy policy{true}; + TestEllpackPageRawFormat(&policy); + } else { + GTEST_SKIP_("HMM is not supported."); + } } TEST(EllpackPageRawFormat, HostIO) { - TestEllpackPageRawFormat>(); + 
EllpackCacheStreamPolicy policy; + TestEllpackPageRawFormat(&policy); } } // namespace xgboost::data diff --git a/tests/cpp/data/test_iterative_dmatrix.cu b/tests/cpp/data/test_iterative_dmatrix.cu index 503cb76965e6..5fb90a5c1526 100644 --- a/tests/cpp/data/test_iterative_dmatrix.cu +++ b/tests/cpp/data/test_iterative_dmatrix.cu @@ -1,5 +1,5 @@ /** - * Copyright 2020-2023, XGBoost contributors + * Copyright 2020-2024, XGBoost contributors */ #include @@ -21,10 +21,10 @@ void TestEquivalent(float sparsity) { std::size_t offset = 0; auto first = (*m.GetEllpackBatches(&ctx, {}).begin()).Impl(); std::unique_ptr page_concatenated{new EllpackPageImpl( - ctx.Device(), first->CutsShared(), first->is_dense, first->row_stride, 1000 * 100)}; + &ctx, first->CutsShared(), first->is_dense, first->row_stride, 1000 * 100)}; for (auto& batch : m.GetBatches(&ctx, {})) { auto page = batch.Impl(); - size_t num_elements = page_concatenated->Copy(ctx.Device(), page, offset); + size_t num_elements = page_concatenated->Copy(&ctx, page, offset); offset += num_elements; } auto from_iter = page_concatenated->GetDeviceAccessor(ctx.Device()); @@ -66,18 +66,15 @@ void TestEquivalent(float sparsity) { ASSERT_EQ(cut_ptrs_iter[i], cut_ptrs_data[i]); } - auto const& buffer_from_iter = page_concatenated->gidx_buffer; - auto const& buffer_from_data = ellpack.Impl()->gidx_buffer; - ASSERT_NE(buffer_from_data.Size(), 0); - - common::CompressedIterator data_buf{ - buffer_from_data.ConstHostPointer(), from_data.NumSymbols()}; - common::CompressedIterator data_iter{ - buffer_from_iter.ConstHostPointer(), from_iter.NumSymbols()}; + std::vector buffer_from_iter, buffer_from_data; + auto data_iter = page_concatenated->GetHostAccessor(&ctx, &buffer_from_iter); + auto data_buf = ellpack.Impl()->GetHostAccessor(&ctx, &buffer_from_data); + ASSERT_NE(buffer_from_data.size(), 0); + ASSERT_NE(buffer_from_iter.size(), 0); CHECK_EQ(from_data.NumSymbols(), from_iter.NumSymbols()); CHECK_EQ(from_data.n_rows * from_data.row_stride, from_data.n_rows * from_iter.row_stride); for (size_t i = 0; i < from_data.n_rows * from_data.row_stride; ++i) { - CHECK_EQ(data_buf[i], data_iter[i]); + CHECK_EQ(data_buf.gidx_iter[i], data_iter.gidx_iter[i]); } } } @@ -97,8 +94,8 @@ TEST(IterativeDeviceDMatrix, RowMajor) { for (auto& ellpack : m.GetBatches(&ctx, {})) { n_batches ++; auto impl = ellpack.Impl(); - common::CompressedIterator iterator( - impl->gidx_buffer.HostVector().data(), impl->NumSymbols()); + std::vector h_gidx; + auto h_accessor = impl->GetHostAccessor(&ctx, &h_gidx); auto cols = CudaArrayIterForTest::Cols(); auto rows = CudaArrayIterForTest::Rows(); @@ -111,7 +108,7 @@ TEST(IterativeDeviceDMatrix, RowMajor) { for(auto i = 0ull; i < rows * cols; i++) { int column_idx = i % cols; - EXPECT_EQ(impl->Cuts().SearchBin(h_data[i], column_idx), iterator[i]); + EXPECT_EQ(impl->Cuts().SearchBin(h_data[i], column_idx), h_accessor.gidx_iter[i]); } EXPECT_EQ(m.Info().num_col_, cols); EXPECT_EQ(m.Info().num_row_, rows); @@ -147,12 +144,12 @@ TEST(IterativeDeviceDMatrix, RowMajorMissing) { *m.GetBatches(&ctx, BatchParam{256, tree::TrainParam::DftSparseThreshold()}) .begin(); auto impl = ellpack.Impl(); - common::CompressedIterator iterator( - impl->gidx_buffer.HostVector().data(), impl->NumSymbols()); - EXPECT_EQ(iterator[1], impl->GetDeviceAccessor(ctx.Device()).NullValue()); - EXPECT_EQ(iterator[5], impl->GetDeviceAccessor(ctx.Device()).NullValue()); + std::vector h_gidx; + auto h_accessor = impl->GetHostAccessor(&ctx, &h_gidx); + 
EXPECT_EQ(h_accessor.gidx_iter[1], impl->GetDeviceAccessor(ctx.Device()).NullValue()); + EXPECT_EQ(h_accessor.gidx_iter[5], impl->GetDeviceAccessor(ctx.Device()).NullValue()); // null values get placed after valid values in a row - EXPECT_EQ(iterator[7], impl->GetDeviceAccessor(ctx.Device()).NullValue()); + EXPECT_EQ(h_accessor.gidx_iter[7], impl->GetDeviceAccessor(ctx.Device()).NullValue()); EXPECT_EQ(m.Info().num_col_, cols); EXPECT_EQ(m.Info().num_row_, rows); EXPECT_EQ(m.Info().num_nonzero_, rows* cols - 3); diff --git a/tests/cpp/data/test_sparse_page_dmatrix.cu b/tests/cpp/data/test_sparse_page_dmatrix.cu index 327f2ba635fd..046c4eed4d80 100644 --- a/tests/cpp/data/test_sparse_page_dmatrix.cu +++ b/tests/cpp/data/test_sparse_page_dmatrix.cu @@ -154,13 +154,18 @@ TEST(SparsePageDMatrix, RetainEllpackPage) { for (auto it = begin; it != end; ++it) { iterators.push_back(it.Page()); gidx_buffers.emplace_back(); - gidx_buffers.back().Resize((*it).Impl()->gidx_buffer.Size()); - gidx_buffers.back().Copy((*it).Impl()->gidx_buffer); + gidx_buffers.back().SetDevice(ctx.Device()); + gidx_buffers.back().Resize((*it).Impl()->gidx_buffer.size()); + auto d_dst = gidx_buffers.back().DevicePointer(); + auto const& d_src = (*it).Impl()->gidx_buffer; + dh::safe_cuda(cudaMemcpyAsync(d_dst, d_src.data(), d_src.size_bytes(), cudaMemcpyDefault)); } ASSERT_GE(iterators.size(), 2); for (size_t i = 0; i < iterators.size(); ++i) { - ASSERT_EQ((*iterators[i]).Impl()->gidx_buffer.HostVector(), gidx_buffers.at(i).HostVector()); + std::vector h_buf; + [[maybe_unused]] auto h_acc = (*iterators[i]).Impl()->GetHostAccessor(&ctx, &h_buf); + ASSERT_EQ(h_buf, gidx_buffers.at(i).HostVector()); ASSERT_EQ(iterators[i].use_count(), 1); } @@ -210,11 +215,11 @@ class TestEllpackPageExt : public ::testing::TestWithParamGetBatches(&ctx, param)) { if (!impl_ext) { - impl_ext = std::make_unique( - batch.Impl()->gidx_buffer.Device(), batch.Impl()->CutsShared(), batch.Impl()->is_dense, - batch.Impl()->row_stride, kRows); + impl_ext = std::make_unique(&ctx, batch.Impl()->CutsShared(), + batch.Impl()->is_dense, + batch.Impl()->row_stride, kRows); } - auto n_elems = impl_ext->Copy(ctx.Device(), batch.Impl(), offset); + auto n_elems = impl_ext->Copy(&ctx, batch.Impl(), offset); offset += n_elems; } ASSERT_EQ(impl_ext->base_rowid, 0); @@ -223,8 +228,10 @@ class TestEllpackPageExt : public ::testing::TestWithParamrow_stride, 2); ASSERT_EQ(impl_ext->Cuts().TotalBins(), 4); - std::vector buffer(impl->gidx_buffer.HostVector()); - std::vector buffer_ext(impl_ext->gidx_buffer.HostVector()); + std::vector buffer; + [[maybe_unused]] auto h_acc = impl->GetHostAccessor(&ctx, &buffer); + std::vector buffer_ext; + [[maybe_unused]] auto h_ext_acc = impl_ext->GetHostAccessor(&ctx, &buffer_ext); ASSERT_EQ(buffer, buffer_ext); } }; diff --git a/tests/cpp/filesystem.h b/tests/cpp/filesystem.h index c8d144291b0f..fafc8c7d1bf9 100644 --- a/tests/cpp/filesystem.h +++ b/tests/cpp/filesystem.h @@ -1,13 +1,10 @@ -/*! 
- * Copyright (c) 2022 by XGBoost Contributors +/** + * Copyright 2022-2024, XGBoost Contributors */ #ifndef XGBOOST_TESTS_CPP_FILESYSTEM_H #define XGBOOST_TESTS_CPP_FILESYSTEM_H -// A macro used inside `windows.h` to avoid conflicts with `winsock2.h` -#ifndef WIN32_LEAN_AND_MEAN -#define WIN32_LEAN_AND_MEAN -#endif // WIN32_LEAN_AND_MEAN +#include #include "dmlc/filesystem.h" diff --git a/tests/cpp/helpers.h b/tests/cpp/helpers.h index 2211b2d00cb2..b2e9e08cd80c 100644 --- a/tests/cpp/helpers.h +++ b/tests/cpp/helpers.h @@ -21,14 +21,11 @@ #if defined(__CUDACC__) #include "../../src/collective/communicator-inl.h" // for GetRank -#include "../../src/common/common.h" // for AllVisibleGPUs +#include "../../src/common/cuda_rt_utils.h" // for AllVisibleGPUs #endif // defined(__CUDACC__) #include "filesystem.h" // dmlc::TemporaryDirectory #include "xgboost/linalg.h" -#if !defined(_OPENMP) -#include -#endif #if defined(__CUDACC__) #define DeclareUnifiedTest(name) GPU ## name diff --git a/tests/cpp/histogram_helpers.h b/tests/cpp/histogram_helpers.h index a33d6958ae3f..ff021e819821 100644 --- a/tests/cpp/histogram_helpers.h +++ b/tests/cpp/histogram_helpers.h @@ -23,7 +23,7 @@ class HistogramCutsWrapper : public common::HistogramCuts { }; } // namespace detail -inline std::unique_ptr BuildEllpackPage(int n_rows, int n_cols, +inline std::unique_ptr BuildEllpackPage(Context const* ctx, int n_rows, int n_cols, bst_float sparsity = 0) { auto dmat = RandomDataGenerator(n_rows, n_cols, sparsity).Seed(3).GenerateDMatrix(); const SparsePage& batch = *dmat->GetBatches().begin(); @@ -48,7 +48,7 @@ inline std::unique_ptr BuildEllpackPage(int n_rows, int n_cols, } auto page = std::unique_ptr( - new EllpackPageImpl(DeviceOrd::CUDA(0), cmat, batch, dmat->IsDense(), row_stride, {})); + new EllpackPageImpl(ctx, cmat, batch, dmat->IsDense(), row_stride, {})); return page; } diff --git a/tests/cpp/objective/test_aft_obj.cc b/tests/cpp/objective/test_aft_obj.cc index 972dfc53f58e..f31debb21af9 100644 --- a/tests/cpp/objective/test_aft_obj.cc +++ b/tests/cpp/objective/test_aft_obj.cc @@ -1,5 +1,5 @@ /** - * Copyright 2020-2023, XGBoost Contributors + * Copyright 2020-2024, XGBoost Contributors */ #include #include @@ -10,7 +10,6 @@ #include "xgboost/objective.h" #include "xgboost/logging.h" #include "../helpers.h" -#include "../../../src/common/survival_util.h" namespace xgboost::common { TEST(Objective, DeclareUnifiedTest(AFTObjConfiguration)) { diff --git a/tests/cpp/plugin/federated/test_federated_coll.cu b/tests/cpp/plugin/federated/test_federated_coll.cu index f3b9066133cc..31760a97f1fe 100644 --- a/tests/cpp/plugin/federated/test_federated_coll.cu +++ b/tests/cpp/plugin/federated/test_federated_coll.cu @@ -6,7 +6,7 @@ #include // for Result #include "../../../../src/collective/allreduce.h" -#include "../../../../src/common/common.h" // for AllVisibleGPUs +#include "../../../../src/common/cuda_rt_utils.h" // for AllVisibleGPUs #include "../../../../src/common/device_helpers.cuh" // for device_vector #include "../../../../src/common/type.h" // for EraseType #include "../../collective/test_worker.h" // for SocketTest diff --git a/tests/cpp/plugin/federated/test_federated_comm_group.cc b/tests/cpp/plugin/federated/test_federated_comm_group.cc index 9bfbdd3ae1b4..511b3d8d11a8 100644 --- a/tests/cpp/plugin/federated/test_federated_comm_group.cc +++ b/tests/cpp/plugin/federated/test_federated_comm_group.cc @@ -1,11 +1,11 @@ /** - * Copyright 2023, XGBoost Contributors + * Copyright 2023-2024, XGBoost Contributors */ 
#include #include // for Json #include "../../../../src/collective/comm_group.h" -#include "../../helpers.h" +#include "../../../../src/common/cuda_rt_utils.h" // for AllVisibleGPUs #include "test_worker.h" namespace xgboost::collective { diff --git a/tests/cpp/plugin/federated/test_federated_comm_group.cu b/tests/cpp/plugin/federated/test_federated_comm_group.cu index 747adb6fd87e..c6fd8921c0bb 100644 --- a/tests/cpp/plugin/federated/test_federated_comm_group.cu +++ b/tests/cpp/plugin/federated/test_federated_comm_group.cu @@ -1,10 +1,11 @@ /** - * Copyright 2023, XGBoost Contributors + * Copyright 2023-2024, XGBoost Contributors */ #include #include // for Json #include "../../../../src/collective/comm_group.h" +#include "../../../../src/common/cuda_rt_utils.h" // for AllVisibleGPUs #include "../../helpers.h" #include "test_worker.h" diff --git a/tests/cpp/test_context.cu b/tests/cpp/test_context.cu index 7684ff4672cd..0776980353f1 100644 --- a/tests/cpp/test_context.cu +++ b/tests/cpp/test_context.cu @@ -1,5 +1,5 @@ /** - * Copyright 2023, XGBoost Contributors + * Copyright 2023-2024, XGBoost Contributors */ #include #include // for Args @@ -8,7 +8,7 @@ #include // for string, to_string -#include "../../src/common/common.h" // for AllVisibleGPUs +#include "../../src/common/cuda_rt_utils.h" // for AllVisibleGPUs namespace xgboost { namespace { diff --git a/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu b/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu index 9a0304f87d58..85bea39c5f5c 100644 --- a/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu +++ b/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu @@ -1,5 +1,5 @@ /** - * Copyright 2020-2023, XGBoost Contributors + * Copyright 2020-2024, XGBoost Contributors */ #include @@ -102,19 +102,17 @@ TEST(GradientBasedSampler, NoSamplingExternalMemory) { EXPECT_EQ(sample.gpair.data(), gpair.DevicePointer()); EXPECT_EQ(sampled_page->n_rows, kRows); - std::vector buffer(sampled_page->gidx_buffer.HostVector()); - common::CompressedIterator - ci(buffer.data(), sampled_page->NumSymbols()); + std::vector h_gidx_buffer; + auto h_accessor = sampled_page->GetHostAccessor(&ctx, &h_gidx_buffer); - size_t offset = 0; + std::size_t offset = 0; for (auto& batch : dmat->GetBatches(&ctx, param)) { auto page = batch.Impl(); - std::vector page_buffer(page->gidx_buffer.HostVector()); - common::CompressedIterator - page_ci(page_buffer.data(), page->NumSymbols()); + std::vector h_page_gidx_buffer; + auto page_accessor = page->GetHostAccessor(&ctx, &h_page_gidx_buffer); size_t num_elements = page->n_rows * page->row_stride; for (size_t i = 0; i < num_elements; i++) { - EXPECT_EQ(ci[i + offset], page_ci[i]); + EXPECT_EQ(h_accessor.gidx_iter[i + offset], page_accessor.gidx_iter[i]); } offset += num_elements; } diff --git a/tests/cpp/tree/gpu_hist/test_histogram.cu b/tests/cpp/tree/gpu_hist/test_histogram.cu index d1128446617b..860e4bfd4ea0 100644 --- a/tests/cpp/tree/gpu_hist/test_histogram.cu +++ b/tests/cpp/tree/gpu_hist/test_histogram.cu @@ -328,8 +328,7 @@ class HistogramExternalMemoryTest : public ::testing::TestWithParamGetBatches()) { concat.Push(page); } - EllpackPageImpl page{ - ctx.Device(), cuts, concat, p_fmat->IsDense(), p_fmat->Info().num_col_, {}}; + EllpackPageImpl page{&ctx, cuts, concat, p_fmat->IsDense(), p_fmat->Info().num_col_, {}}; auto ridx = partitioner.GetRows(0); auto d_histogram = dh::ToSpan(single_hist); DeviceHistogramBuilder builder; diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu 
index 200fb39fb4e9..291b46edea36 100644 --- a/tests/cpp/tree/test_gpu_hist.cu +++ b/tests/cpp/tree/test_gpu_hist.cu @@ -81,6 +81,7 @@ std::vector GetHostHistGpair() { template void TestBuildHist(bool use_shared_memory_histograms) { int const kNRows = 16, kNCols = 8; + Context ctx{MakeCUDACtx(0)}; TrainParam param; Args args{ @@ -89,9 +90,8 @@ void TestBuildHist(bool use_shared_memory_histograms) { }; param.Init(args); - auto page = BuildEllpackPage(kNRows, kNCols); + auto page = BuildEllpackPage(&ctx, kNRows, kNCols); BatchParam batch_param{}; - Context ctx{MakeCUDACtx(0)}; auto cs = std::make_shared(0); GPUHistMakerDevice maker(&ctx, /*is_external_memory=*/false, {}, kNRows, param, cs, kNCols, batch_param, MetaInfo()); @@ -105,7 +105,6 @@ void TestBuildHist(bool use_shared_memory_histograms) { } gpair.SetDevice(ctx.Device()); - thrust::host_vector h_gidx_buffer(page->gidx_buffer.HostVector()); maker.row_partitioner = std::make_unique(&ctx, kNRows, 0); maker.hist.Init(ctx.Device(), page->Cuts().TotalBins()); @@ -198,14 +197,12 @@ void TestHistogramIndexImpl() { auto grad = GenerateRandomGradients(kNRows); grad.SetDevice(DeviceOrd::CUDA(0)); maker->Reset(&grad, hist_maker_dmat.get(), kNCols); - std::vector h_gidx_buffer(maker->page->gidx_buffer.HostVector()); const auto &maker_ext = hist_maker_ext.maker; maker_ext->Reset(&grad, hist_maker_ext_dmat.get(), kNCols); - std::vector h_gidx_buffer_ext(maker_ext->page->gidx_buffer.HostVector()); ASSERT_EQ(maker->page->Cuts().TotalBins(), maker_ext->page->Cuts().TotalBins()); - ASSERT_EQ(maker->page->gidx_buffer.Size(), maker_ext->page->gidx_buffer.Size()); + ASSERT_EQ(maker->page->gidx_buffer.size(), maker_ext->page->gidx_buffer.size()); } TEST(GpuHist, TestHistogramIndex) { From 7ab93f3ce3edea5a409bc593857ea094447776ca Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Fri, 19 Jul 2024 01:04:17 +0800 Subject: [PATCH 44/51] [CI] Fix test environment. (#10609) * [CI] Fix test environment. * Remove shell. * Remove. 
* Update Dockerfile.i386 --- .github/workflows/i386.yml | 2 +- tests/ci_build/Dockerfile.i386 | 2 +- tests/ci_build/conda_env/macos_cpu_test.yml | 1 - 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/i386.yml b/.github/workflows/i386.yml index a7c71a590cb8..78dc92bffff6 100644 --- a/.github/workflows/i386.yml +++ b/.github/workflows/i386.yml @@ -23,7 +23,7 @@ jobs: with: submodules: 'true' - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 + uses: docker/setup-buildx-action@v3.4.0 with: driver-opts: network=host - name: Build and push container diff --git a/tests/ci_build/Dockerfile.i386 b/tests/ci_build/Dockerfile.i386 index d7c133e2aee4..a582a54020bb 100644 --- a/tests/ci_build/Dockerfile.i386 +++ b/tests/ci_build/Dockerfile.i386 @@ -1,7 +1,7 @@ FROM i386/debian:sid ENV DEBIAN_FRONTEND noninteractive -SHELL ["/bin/bash", "-c"] # Use Bash as shell +SHELL ["/bin/bash", "-c"] RUN \ apt-get update && \ diff --git a/tests/ci_build/conda_env/macos_cpu_test.yml b/tests/ci_build/conda_env/macos_cpu_test.yml index ce9ca4b1ba7c..e2e377e2145d 100644 --- a/tests/ci_build/conda_env/macos_cpu_test.yml +++ b/tests/ci_build/conda_env/macos_cpu_test.yml @@ -38,4 +38,3 @@ dependencies: - cloudpickle - pip: - sphinx_rtd_theme - - py-ubjson From 326921dbe49d1ad4cdcdc967e91713dac7dc6356 Mon Sep 17 00:00:00 2001 From: Philip Hyunsu Cho Date: Fri, 19 Jul 2024 10:51:08 -0700 Subject: [PATCH 45/51] [CI] Build a CPU-only wheel under name `xgboost-cpu` (#10603) --- dev/release-artifacts.py | 14 +++++ doc/install.rst | 13 ++++ python-package/pyproject.toml | 6 +- .../buildkite/build-manylinux2014-aarch64.sh | 33 ---------- tests/buildkite/build-manylinux2014-x86_64.sh | 33 ---------- tests/buildkite/build-manylinux2014.sh | 63 +++++++++++++++++++ tests/buildkite/cpu_only_pypkg.patch | 55 ++++++++++++++++ tests/buildkite/manylinux2014_warning.patch | 14 ----- tests/buildkite/pipeline.yml | 4 +- tests/buildkite/remove_nccl_dep.patch | 14 +++++ 10 files changed, 164 insertions(+), 85 deletions(-) delete mode 100644 tests/buildkite/build-manylinux2014-aarch64.sh delete mode 100644 tests/buildkite/build-manylinux2014-x86_64.sh create mode 100755 tests/buildkite/build-manylinux2014.sh create mode 100644 tests/buildkite/cpu_only_pypkg.patch create mode 100644 tests/buildkite/remove_nccl_dep.patch diff --git a/dev/release-artifacts.py b/dev/release-artifacts.py index 1e0b5723e89b..f53ef134630b 100644 --- a/dev/release-artifacts.py +++ b/dev/release-artifacts.py @@ -2,6 +2,7 @@ tqdm, sh are required to run this script. 
""" + import argparse import os import shutil @@ -106,6 +107,15 @@ def make_pysrc_wheel( if not os.path.exists(dist): os.mkdir(dist) + # Apply patch to remove NCCL dependency + # Save the original content of pyproject.toml so that we can restore it later + with DirectoryExcursion(ROOT): + with open("python-package/pyproject.toml", "r") as f: + orig_pyproj_lines = f.read() + with open("tests/buildkite/remove_nccl_dep.patch", "r") as f: + patch_lines = f.read() + subprocess.run(["patch", "-p0"], input=patch_lines, text=True) + with DirectoryExcursion(os.path.join(ROOT, "python-package")): subprocess.check_call(["python", "-m", "build", "--sdist"]) if rc is not None: @@ -117,6 +127,10 @@ def make_pysrc_wheel( target = os.path.join(dist, name) shutil.move(src, target) + with DirectoryExcursion(ROOT): + with open("python-package/pyproject.toml", "w") as f: + print(orig_pyproj_lines, file=f, end="") + def download_py_packages( branch: str, major: int, minor: int, commit_hash: str, outdir: str diff --git a/doc/install.rst b/doc/install.rst index e5229702e16c..79082a7ed581 100644 --- a/doc/install.rst +++ b/doc/install.rst @@ -76,6 +76,19 @@ Capabilities of binary wheels for each platform: | Windows | |tick| | |cross| | +---------------------+---------+----------------------+ +Minimal installation (CPU-only) +******************************* +The default installation with ``pip`` will install the full XGBoost package, including the support for the GPU algorithms and federated learning. + +You may choose to reduce the size of the installed package and save the disk space, by opting to install ``xgboost-cpu`` instead: + +.. code-block:: bash + + pip install xgboost-cpu + +The ``xgboost-cpu`` variant will have drastically smaller disk footprint, but does not provide some features, such as the GPU algorithms and +federated learning. + Conda ***** diff --git a/python-package/pyproject.toml b/python-package/pyproject.toml index a273d8c135d8..8835def25858 100644 --- a/python-package/pyproject.toml +++ b/python-package/pyproject.toml @@ -7,13 +7,13 @@ build-backend = "packager.pep517" [project] name = "xgboost" -version = "2.2.0-dev" +description = "XGBoost Python Package" +readme = { file = "README.rst", content-type = "text/x-rst" } authors = [ { name = "Hyunsu Cho", email = "chohyu01@cs.washington.edu" }, { name = "Jiaming Yuan", email = "jm.yuan@outlook.com" } ] -description = "XGBoost Python Package" -readme = { file = "README.rst", content-type = "text/x-rst" } +version = "2.2.0-dev" requires-python = ">=3.8" license = { text = "Apache-2.0" } classifiers = [ diff --git a/tests/buildkite/build-manylinux2014-aarch64.sh b/tests/buildkite/build-manylinux2014-aarch64.sh deleted file mode 100644 index 802db3f66aaf..000000000000 --- a/tests/buildkite/build-manylinux2014-aarch64.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -source tests/buildkite/conftest.sh - -WHEEL_TAG=manylinux2014_aarch64 -command_wrapper="tests/ci_build/ci_build.sh manylinux2014_aarch64" -python_bin="/opt/python/cp310-cp310/bin/python" - -echo "--- Build binary wheel for ${WHEEL_TAG}" -# Patch to add warning about manylinux2014 variant -patch -p0 < tests/buildkite/manylinux2014_warning.patch -$command_wrapper bash -c \ - "cd python-package && ${python_bin} -m pip wheel --no-deps -vvv . 
--wheel-dir dist/" -git checkout python-package/xgboost/core.py # discard the patch - -$command_wrapper auditwheel repair --plat ${WHEEL_TAG} python-package/dist/*.whl -$command_wrapper ${python_bin} tests/ci_build/rename_whl.py \ - --wheel-path wheelhouse/*.whl \ - --commit-hash ${BUILDKITE_COMMIT} \ - --platform-tag ${WHEEL_TAG} -rm -rf python-package/dist/ -mkdir python-package/dist/ -mv -v wheelhouse/*.whl python-package/dist/ - -echo "--- Upload Python wheel" -buildkite-agent artifact upload python-package/dist/*.whl -if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] -then - aws s3 cp python-package/dist/*.whl s3://xgboost-nightly-builds/${BRANCH_NAME}/ \ - --acl public-read --no-progress -fi diff --git a/tests/buildkite/build-manylinux2014-x86_64.sh b/tests/buildkite/build-manylinux2014-x86_64.sh deleted file mode 100644 index b00616315b8d..000000000000 --- a/tests/buildkite/build-manylinux2014-x86_64.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -source tests/buildkite/conftest.sh - -WHEEL_TAG=manylinux2014_x86_64 -command_wrapper="tests/ci_build/ci_build.sh manylinux2014_x86_64" -python_bin="/opt/python/cp310-cp310/bin/python" - -echo "--- Build binary wheel for ${WHEEL_TAG}" -# Patch to add warning about manylinux2014 variant -patch -p0 < tests/buildkite/manylinux2014_warning.patch -$command_wrapper bash -c \ - "cd python-package && ${python_bin} -m pip wheel --no-deps -vvv . --wheel-dir dist/" -git checkout python-package/xgboost/core.py # discard the patch - -$command_wrapper auditwheel repair --plat ${WHEEL_TAG} python-package/dist/*.whl -$command_wrapper ${python_bin} tests/ci_build/rename_whl.py \ - --wheel-path wheelhouse/*.whl \ - --commit-hash ${BUILDKITE_COMMIT} \ - --platform-tag ${WHEEL_TAG} -rm -rf python-package/dist/ -mkdir python-package/dist/ -mv -v wheelhouse/*.whl python-package/dist/ - -echo "--- Upload Python wheel" -buildkite-agent artifact upload python-package/dist/*.whl -if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] -then - aws s3 cp python-package/dist/*.whl s3://xgboost-nightly-builds/${BRANCH_NAME}/ \ - --acl public-read --no-progress -fi diff --git a/tests/buildkite/build-manylinux2014.sh b/tests/buildkite/build-manylinux2014.sh new file mode 100755 index 000000000000..426d32b5c361 --- /dev/null +++ b/tests/buildkite/build-manylinux2014.sh @@ -0,0 +1,63 @@ +#!/bin/bash + +set -euo pipefail + +if [ $# -ne 1 ]; then + echo "Usage: $0 {x86_64,aarch64}" + exit 1 +fi + +arch=$1 + +source tests/buildkite/conftest.sh + +WHEEL_TAG="manylinux2014_${arch}" +command_wrapper="tests/ci_build/ci_build.sh ${WHEEL_TAG}" +python_bin="/opt/python/cp310-cp310/bin/python" + +echo "--- Build binary wheel for ${WHEEL_TAG}" +# Patch to add warning about manylinux2014 variant +patch -p0 < tests/buildkite/remove_nccl_dep.patch +patch -p0 < tests/buildkite/manylinux2014_warning.patch +$command_wrapper bash -c \ + "cd python-package && ${python_bin} -m pip wheel --no-deps -v . 
--wheel-dir dist/" +git checkout python-package/pyproject.toml python-package/xgboost/core.py # discard the patch + +$command_wrapper auditwheel repair --plat ${WHEEL_TAG} python-package/dist/*.whl +$command_wrapper ${python_bin} tests/ci_build/rename_whl.py \ + --wheel-path wheelhouse/*.whl \ + --commit-hash ${BUILDKITE_COMMIT} \ + --platform-tag ${WHEEL_TAG} +rm -rf python-package/dist/ +mkdir python-package/dist/ +mv -v wheelhouse/*.whl python-package/dist/ + +echo "--- Build binary wheel for ${WHEEL_TAG} (CPU only)" +# Patch to rename pkg to xgboost-cpu +patch -p0 < tests/buildkite/remove_nccl_dep.patch +patch -p0 < tests/buildkite/cpu_only_pypkg.patch +$command_wrapper bash -c \ + "cd python-package && ${python_bin} -m pip wheel --no-deps -v . --wheel-dir dist/" +git checkout python-package/pyproject.toml # discard the patch + +$command_wrapper auditwheel repair --plat ${WHEEL_TAG} python-package/dist/xgboost_cpu-*.whl +$command_wrapper ${python_bin} tests/ci_build/rename_whl.py \ + --wheel-path wheelhouse/xgboost_cpu-*.whl \ + --commit-hash ${BUILDKITE_COMMIT} \ + --platform-tag ${WHEEL_TAG} +rm -v python-package/dist/xgboost_cpu-*.whl +mv -v wheelhouse/xgboost_cpu-*.whl python-package/dist/ + +echo "--- Upload Python wheel" +for wheel in python-package/dist/*.whl +do + buildkite-agent artifact upload "${wheel}" +done +if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] +then + for wheel in python-package/dist/*.whl + do + aws s3 cp "${wheel}" s3://xgboost-nightly-builds/${BRANCH_NAME}/ \ + --acl public-read --no-progress + done +fi diff --git a/tests/buildkite/cpu_only_pypkg.patch b/tests/buildkite/cpu_only_pypkg.patch new file mode 100644 index 000000000000..765ac5c098d0 --- /dev/null +++ b/tests/buildkite/cpu_only_pypkg.patch @@ -0,0 +1,55 @@ +diff --git python-package/README.rst python-package/README.rst +index 1fc0bb5a0..f1c68470b 100644 +--- python-package/README.rst ++++ python-package/README.rst +@@ -1,20 +1,15 @@ +-====================== +-XGBoost Python Package +-====================== ++================================= ++XGBoost Python Package (CPU only) ++================================= + + |PyPI version| + +-Installation +-============ ++The ``xgboost-cpu`` package provides for a minimal installation, with no support for the GPU algorithms ++or federated learning. It is provided to allow XGBoost to be installed in a space-constrained ++environments. + +-From `PyPI `_ +---------------------------------------------------- ++Note. ``xgboost-cpu`` package is only provided for x86_64 (amd64) Linux and Windows platforms. ++For other platforms, please install ``xgboost`` from https://pypi.org/project/xgboost/. + +-For a stable version, install using ``pip``:: +- +- pip install xgboost +- +-.. |PyPI version| image:: https://badge.fury.io/py/xgboost.svg +- :target: http://badge.fury.io/py/xgboost +- +-For building from source, see `build `_. ++Note. ``xgboost-cpu`` does not provide an sdist (source distribution). You may install sdist ++from https://pypi.org/project/xgboost/. 
+diff --git python-package/pyproject.toml python-package/pyproject.toml +index 46c1451c2..c5dc908d9 100644 +--- python-package/pyproject.toml ++++ python-package/pyproject.toml +@@ -6,7 +6,7 @@ backend-path = ["."] + build-backend = "packager.pep517" + + [project] +-name = "xgboost" ++name = "xgboost-cpu" + description = "XGBoost Python Package" + readme = { file = "README.rst", content-type = "text/x-rst" } + authors = [ +@@ -82,3 +82,6 @@ class-attribute-naming-style = "snake_case" + + # Allow single-letter variables + variable-rgx = "[a-zA-Z_][a-z0-9_]{0,30}$" ++ ++[tool.hatch.build.targets.wheel] ++packages = ["xgboost/"] diff --git a/tests/buildkite/manylinux2014_warning.patch b/tests/buildkite/manylinux2014_warning.patch index 692a92672d2f..679205988b7a 100644 --- a/tests/buildkite/manylinux2014_warning.patch +++ b/tests/buildkite/manylinux2014_warning.patch @@ -1,17 +1,3 @@ -diff --git python-package/pyproject.toml python-package/pyproject.toml -index a273d8c13..dee49686a 100644 ---- python-package/pyproject.toml -+++ python-package/pyproject.toml -@@ -30,8 +30,7 @@ classifiers = [ - ] - dependencies = [ - "numpy", -- "scipy", -- "nvidia-nccl-cu12 ; platform_system == 'Linux' and platform_machine != 'aarch64'" -+ "scipy" - ] - - [project.urls] diff --git python-package/xgboost/core.py python-package/xgboost/core.py index e8bc735e6..030972ef2 100644 --- python-package/xgboost/core.py diff --git a/tests/buildkite/pipeline.yml b/tests/buildkite/pipeline.yml index acdb71dba529..ee9637b8bd25 100644 --- a/tests/buildkite/pipeline.yml +++ b/tests/buildkite/pipeline.yml @@ -72,12 +72,12 @@ steps: agents: queue: linux-amd64-cpu - label: ":console: Build manylinux2014_x86_64 wheel" - command: "tests/buildkite/build-manylinux2014-x86_64.sh" + command: "tests/buildkite/build-manylinux2014.sh x86_64" key: build-manylinux2014-x86_64 agents: queue: linux-amd64-cpu - label: ":console: Build manylinux2014_aarch64 wheel" - command: "tests/buildkite/build-manylinux2014-aarch64.sh" + command: "tests/buildkite/build-manylinux2014.sh aarch64" key: build-manylinux2014-aarch64 agents: queue: linux-arm64-cpu diff --git a/tests/buildkite/remove_nccl_dep.patch b/tests/buildkite/remove_nccl_dep.patch new file mode 100644 index 000000000000..a2a4a5c88289 --- /dev/null +++ b/tests/buildkite/remove_nccl_dep.patch @@ -0,0 +1,14 @@ +diff --git python-package/pyproject.toml python-package/pyproject.toml +index 8835def25..46c1451c2 100644 +--- python-package/pyproject.toml ++++ python-package/pyproject.toml +@@ -30,8 +30,7 @@ classifiers = [ + ] + dependencies = [ + "numpy", +- "scipy", +- "nvidia-nccl-cu12 ; platform_system == 'Linux' and platform_machine != 'aarch64'" ++ "scipy" + ] + + [project.urls] From 344ddeb9ca8fbf5e546250c3e4c52c012a977ee3 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sat, 20 Jul 2024 06:14:56 +0800 Subject: [PATCH 46/51] Drop support for CUDA legacy stream. (#10607) --- CMakeLists.txt | 1 - cmake/Utils.cmake | 8 ++------ 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f7cf8a6cfa87..034d52164ad2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -71,7 +71,6 @@ option(HIDE_CXX_SYMBOLS "Build shared library and hide all C++ symbols" OFF) option(KEEP_BUILD_ARTIFACTS_IN_BINARY_DIR "Output build artifacts in CMake binary dir" OFF) ## CUDA option(USE_CUDA "Build with GPU acceleration" OFF) -option(USE_PER_THREAD_DEFAULT_STREAM "Build with per-thread default stream" ON) option(USE_NCCL "Build with NCCL to enable distributed GPU support." 
OFF) # This is specifically designed for PyPI binary release and should be disabled for most of the cases. option(USE_DLOPEN_NCCL "Whether to load nccl dynamically." OFF) diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake index 9563ec3076b2..266cf29b38b2 100644 --- a/cmake/Utils.cmake +++ b/cmake/Utils.cmake @@ -80,12 +80,8 @@ function(xgboost_set_cuda_flags target) $<$:--expt-extended-lambda> $<$:--expt-relaxed-constexpr> $<$:-Xcompiler=${OpenMP_CXX_FLAGS}> - $<$:-Xfatbin=-compress-all>) - - if(USE_PER_THREAD_DEFAULT_STREAM) - target_compile_options(${target} PRIVATE - $<$:--default-stream per-thread>) - endif() + $<$:-Xfatbin=-compress-all> + $<$:--default-stream per-thread>) if(FORCE_COLORED_OUTPUT) if(FORCE_COLORED_OUTPUT AND (CMAKE_GENERATOR STREQUAL "Ninja") AND From 0846ad860cec25d536f82a8448e2ac0eff487846 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sat, 20 Jul 2024 22:12:12 +0800 Subject: [PATCH 47/51] Optionally skip cupy on windows. (#10611) --- python-package/xgboost/testing/__init__.py | 41 ++++++---------------- python-package/xgboost/testing/data.py | 31 ++++++++++++++++ 2 files changed, 42 insertions(+), 30 deletions(-) diff --git a/python-package/xgboost/testing/__init__.py b/python-package/xgboost/testing/__init__.py index e0096c89c9a8..b934e99e7981 100644 --- a/python-package/xgboost/testing/__init__.py +++ b/python-package/xgboost/testing/__init__.py @@ -45,6 +45,7 @@ get_cancer, get_digits, get_sparse, + make_batches, memory, ) @@ -161,7 +162,16 @@ def no_cudf() -> PytestSkip: def no_cupy() -> PytestSkip: - return no_mod("cupy") + skip_cupy = no_mod("cupy") + if not skip_cupy["condition"] and system() == "Windows": + import cupy as cp + + # Cupy might run into issue on Windows due to missing compiler + try: + cp.array([1, 2, 3]).sum() + except Exception: # pylint: disable=broad-except + skip_cupy["condition"] = True + return skip_cupy def no_dask_cudf() -> PytestSkip: @@ -248,35 +258,6 @@ def as_arrays( return X, y, w -def make_batches( # pylint: disable=too-many-arguments,too-many-locals - n_samples_per_batch: int, - n_features: int, - n_batches: int, - use_cupy: bool = False, - *, - vary_size: bool = False, - random_state: int = 1994, -) -> Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray]]: - X = [] - y = [] - w = [] - if use_cupy: - import cupy - - rng = cupy.random.RandomState(random_state) - else: - rng = np.random.RandomState(random_state) - for i in range(n_batches): - n_samples = n_samples_per_batch + i * 10 if vary_size else n_samples_per_batch - _X = rng.randn(n_samples, n_features) - _y = rng.randn(n_samples) - _w = rng.uniform(low=0, high=1, size=n_samples) - X.append(_X) - y.append(_y) - w.append(_w) - return X, y, w - - def make_regression( n_samples: int, n_features: int, use_cupy: bool ) -> Tuple[ArrayLike, ArrayLike, ArrayLike]: diff --git a/python-package/xgboost/testing/data.py b/python-package/xgboost/testing/data.py index f4e97e59d363..4071219c44ef 100644 --- a/python-package/xgboost/testing/data.py +++ b/python-package/xgboost/testing/data.py @@ -9,6 +9,7 @@ Callable, Dict, Generator, + List, NamedTuple, Optional, Tuple, @@ -506,6 +507,36 @@ def get_mq2008( ) +def make_batches( # pylint: disable=too-many-arguments,too-many-locals + n_samples_per_batch: int, + n_features: int, + n_batches: int, + use_cupy: bool = False, + *, + vary_size: bool = False, + random_state: int = 1994, +) -> Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray]]: + """Make batches of dense data.""" + X = [] + y = [] + w = [] + if use_cupy: + import 
+
+        rng = cupy.random.RandomState(random_state)
+    else:
+        rng = np.random.RandomState(random_state)
+    for i in range(n_batches):
+        n_samples = n_samples_per_batch + i * 10 if vary_size else n_samples_per_batch
+        _X = rng.randn(n_samples, n_features)
+        _y = rng.randn(n_samples)
+        _w = rng.uniform(low=0, high=1, size=n_samples)
+        X.append(_X)
+        y.append(_y)
+        w.append(_w)
+    return X, y, w
+
+
 RelData = Tuple[sparse.csr_matrix, npt.NDArray[np.int32], npt.NDArray[np.int32]]

From cb62f9e73bbdf0aafa414938e5b692e70525f0ee Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Sun, 21 Jul 2024 05:08:29 +0800
Subject: [PATCH 48/51] [EM] Prevent init with CUDA malloc resource. (#10606)

---
 src/common/cuda_rt_utils.h                  | 37 +++++++++++++
 src/common/ref_resource_view.cuh            | 11 +++-
 src/common/resource.cuh                     |  6 +--
 src/common/timer.cc                         | 29 +++++-----
 src/data/ellpack_page.cu                    |  2 +-
 src/data/ellpack_page_raw_format.cu         | 60 ++++++++++++++-------
 tests/ci_build/conda_env/macos_cpu_test.yml |  1 +
 7 files changed, 105 insertions(+), 41 deletions(-)

diff --git a/src/common/cuda_rt_utils.h b/src/common/cuda_rt_utils.h
index fa14f8434970..210f1e07d7f8 100644
--- a/src/common/cuda_rt_utils.h
+++ b/src/common/cuda_rt_utils.h
@@ -3,6 +3,11 @@
  */
 #pragma once
 #include <cstdint>  // for int32_t
+
+#if defined(XGBOOST_USE_NVTX)
+#include <nvtx3/nvtx3.hpp>
+#endif  // defined(XGBOOST_USE_NVTX)
+
 namespace xgboost::common {
 std::int32_t AllVisibleGPUs();
 
@@ -18,4 +23,36 @@ bool SupportsAts();
 void CheckComputeCapability();
 
 void SetDevice(std::int32_t device);
+
+struct NvtxDomain {
+  static constexpr char const *name{"libxgboost"};  // NOLINT
+};
+
+#if defined(XGBOOST_USE_NVTX)
+using NvtxScopedRange = ::nvtx3::scoped_range_in<NvtxDomain>;
+using NvtxEventAttr = ::nvtx3::event_attributes;
+using NvtxRgb = ::nvtx3::rgb;
+#else
+class NvtxScopedRange {
+ public:
+  template <typename... Args>
+  explicit NvtxScopedRange(Args &&...) {}
+};
+class NvtxEventAttr {
+ public:
+  template <typename... Args>
+  explicit NvtxEventAttr(Args &&...) {}
+};
+class NvtxRgb {
+ public:
+  template <typename... Args>
+  explicit NvtxRgb(Args &&...) {}
+};
+#endif  // defined(XGBOOST_USE_NVTX)
 }  // namespace xgboost::common
+
+#if defined(XGBOOST_USE_NVTX)
+#define xgboost_NVTX_FN_RANGE() NVTX3_FUNC_RANGE_IN(::xgboost::common::NvtxDomain)
+#else
+#define xgboost_NVTX_FN_RANGE()
+#endif  // defined(XGBOOST_USE_NVTX)
diff --git a/src/common/ref_resource_view.cuh b/src/common/ref_resource_view.cuh
index ff311c1409a7..d48b221a305d 100644
--- a/src/common/ref_resource_view.cuh
+++ b/src/common/ref_resource_view.cuh
@@ -16,10 +16,17 @@ namespace xgboost::common {
 * @brief Make a fixed size `RefResourceView` with cudaMalloc resource.
 */
 template <typename T>
-[[nodiscard]] RefResourceView<T> MakeFixedVecWithCudaMalloc(Context const* ctx,
-                                                            std::size_t n_elements, T const& init) {
+[[nodiscard]] RefResourceView<T> MakeFixedVecWithCudaMalloc(Context const*,
+                                                            std::size_t n_elements) {
   auto resource = std::make_shared<CudaMallocResource>(n_elements * sizeof(T));
   auto ref = RefResourceView<T>{resource->DataAs<T>(), n_elements, resource};
+  return ref;
+}
+
+template <typename T>
+[[nodiscard]] RefResourceView<T> MakeFixedVecWithCudaMalloc(Context const* ctx,
+                                                            std::size_t n_elements, T const& init) {
+  auto ref = MakeFixedVecWithCudaMalloc<T>(ctx, n_elements);
   thrust::fill_n(ctx->CUDACtx()->CTP(), ref.data(), ref.size(), init);
   return ref;
 }
diff --git a/src/common/resource.cuh b/src/common/resource.cuh
index 90b9756a9fc2..e950a8d90695 100644
--- a/src/common/resource.cuh
+++ b/src/common/resource.cuh
@@ -24,11 +24,9 @@ class CudaMallocResource : public ResourceHandler {
   }
   ~CudaMallocResource() noexcept(true) override { this->Clear(); }
 
-  void* Data() override { return storage_.data(); }
+  [[nodiscard]] void* Data() override { return storage_.data(); }
   [[nodiscard]] std::size_t Size() const override { return storage_.size(); }
-  void Resize(std::size_t n_bytes, std::byte init = std::byte{0}) {
-    this->storage_.resize(n_bytes, init);
-  }
+  void Resize(std::size_t n_bytes) { this->storage_.resize(n_bytes); }
 };
 
 class CudaMmapResource : public ResourceHandler {
diff --git a/src/common/timer.cc b/src/common/timer.cc
index 9b1f49fbd5c8..0b55d1623dbc 100644
--- a/src/common/timer.cc
+++ b/src/common/timer.cc
@@ -6,9 +6,10 @@
 #include
 
 #include "../collective/communicator-inl.h"
+#include "cuda_rt_utils.h"
 
 #if defined(XGBOOST_USE_NVTX)
-#include <nvToolsExt.h>
+#include <nvtx3/nvtx3.hpp>
 #endif  // defined(XGBOOST_USE_NVTX)
 
 namespace xgboost::common {
@@ -17,8 +18,8 @@ void Monitor::Start(std::string const &name) {
     auto &stats = statistics_map_[name];
     stats.timer.Start();
 #if defined(XGBOOST_USE_NVTX)
-    std::string nvtx_name = "xgboost::" + label_ + "::" + name;
-    stats.nvtx_id = nvtxRangeStartA(nvtx_name.c_str());
+    auto range_handle = nvtx3::start_range_in<NvtxDomain>(label_ + "::" + name);
+    stats.nvtx_id = range_handle.get_value();
 #endif  // defined(XGBOOST_USE_NVTX)
   }
 }
@@ -29,34 +30,32 @@ void Monitor::Stop(const std::string &name) {
     stats.timer.Stop();
     stats.count++;
 #if defined(XGBOOST_USE_NVTX)
-    nvtxRangeEnd(stats.nvtx_id);
+    nvtx3::end_range_in<NvtxDomain>(nvtx3::range_handle{stats.nvtx_id});
 #endif  // defined(XGBOOST_USE_NVTX)
   }
 }
 
-void Monitor::PrintStatistics(StatMap const& statistics) const {
+void Monitor::PrintStatistics(StatMap const &statistics) const {
   for (auto &kv : statistics) {
     if (kv.second.first == 0) {
-      LOG(WARNING) <<
-          "Timer for " << kv.first << " did not get stopped properly.";
+      LOG(WARNING) << "Timer for " << kv.first << " did not get stopped properly.";
       continue;
     }
-    LOG(CONSOLE) << kv.first << ": " << static_cast<double>(kv.second.second) / 1e+6
-                 << "s, " << kv.second.first << " calls @ "
-                 << kv.second.second
-                 << "us" << std::endl;
+    LOG(CONSOLE) << kv.first << ": " << static_cast<double>(kv.second.second) / 1e+6 << "s, "
+                 << kv.second.first << " calls @ " << kv.second.second << "us" << std::endl;
   }
 }
 
 void Monitor::Print() const {
-  if (!ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) { return; }
+  if (!ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) {
+    return;
+  }
   auto rank = collective::GetRank();
   StatMap stat_map;
   for (auto const &kv : statistics_map_) {
     stat_map[kv.first] = std::make_pair(
-        kv.second.count, std::chrono::duration_cast<std::chrono::microseconds>(
-                             kv.second.timer.elapsed)
-                             .count());
+        kv.second.count,
+        std::chrono::duration_cast<std::chrono::microseconds>(kv.second.timer.elapsed).count());
   }
   if (stat_map.empty()) {
     return;
diff --git a/src/data/ellpack_page.cu b/src/data/ellpack_page.cu
index 7d3f4c820a22..fc28b7c56f12 100644
--- a/src/data/ellpack_page.cu
+++ b/src/data/ellpack_page.cu
@@ -404,7 +404,7 @@ size_t EllpackPageImpl::Copy(Context const* ctx, EllpackPageImpl const* page, bs
     LOG(FATAL) << "Concatenating the same Ellpack.";
     return this->n_rows * this->row_stride;
   }
-  dh::LaunchN(num_elements, CopyPage{this, page, offset});
+  dh::LaunchN(num_elements, ctx->CUDACtx()->Stream(), CopyPage{this, page, offset});
   monitor_.Stop(__func__);
   return num_elements;
 }
diff --git a/src/data/ellpack_page_raw_format.cu b/src/data/ellpack_page_raw_format.cu
index 3f23c5d8d3d6..86d1ac6da7eb 100644
--- a/src/data/ellpack_page_raw_format.cu
+++ b/src/data/ellpack_page_raw_format.cu
@@ -6,6 +6,7 @@
 #include <cstddef>  // for size_t
 #include <vector>   // for vector
 
+#include "../common/cuda_rt_utils.h"
 #include "../common/io.h"  // for AlignedResourceReadStream, AlignedFileWriteStream
 #include "../common/ref_resource_view.cuh"  // for MakeFixedVecWithCudaMalloc
 #include "../common/ref_resource_view.h"  // for ReadVec, WriteVec
@@ -21,6 +22,8 @@ namespace {
 template <typename T>
 [[nodiscard]] bool ReadDeviceVec(common::AlignedResourceReadStream* fi,
                                  common::RefResourceView<T>* vec) {
+  xgboost_NVTX_FN_RANGE();
+
   std::uint64_t n{0};
   if (!fi->Read(&n)) {
     return false;
@@ -37,7 +40,7 @@ template <typename T>
   }
 
   auto ctx = Context{}.MakeCUDA(common::CurrentDevice());
-  *vec = common::MakeFixedVecWithCudaMalloc<T>(&ctx, n, static_cast<T>(0));
+  *vec = common::MakeFixedVecWithCudaMalloc<T>(&ctx, n);
   dh::safe_cuda(cudaMemcpyAsync(vec->data(), ptr, n_bytes, cudaMemcpyDefault, dh::DefaultStream()));
   return true;
 }
@@ -50,6 +53,7 @@ template <typename T>
 [[nodiscard]] bool EllpackPageRawFormat::Read(EllpackPage* page,
                                               common::AlignedResourceReadStream* fi) {
+  xgboost_NVTX_FN_RANGE();
   auto* impl = page->Impl();
   impl->SetCuts(this->cuts_);
 
@@ -69,6 +73,8 @@ template <typename T>
 [[nodiscard]] std::size_t EllpackPageRawFormat::Write(const EllpackPage& page,
                                                       common::AlignedFileWriteStream* fo) {
+  xgboost_NVTX_FN_RANGE();
+
   std::size_t bytes{0};
   auto* impl = page.Impl();
   bytes += fo->Write(impl->n_rows);
@@ -84,22 +90,30 @@ template <typename T>
 }
 
 [[nodiscard]] bool EllpackPageRawFormat::Read(EllpackPage* page, EllpackHostCacheStream* fi) const {
+  xgboost_NVTX_FN_RANGE();
+
   auto* impl = page->Impl();
   CHECK(this->cuts_->cut_values_.DeviceCanRead());
   impl->SetCuts(this->cuts_);
 
-  RET_IF_NOT(fi->Read(&impl->n_rows));
-  RET_IF_NOT(fi->Read(&impl->is_dense));
-  RET_IF_NOT(fi->Read(&impl->row_stride));
 
-  // Read vec
+  // Read vector
   Context ctx = Context{}.MakeCUDA(common::CurrentDevice());
-  bst_idx_t n{0};
-  RET_IF_NOT(fi->Read(&n));
-  if (n != 0) {
-    impl->gidx_buffer = common::MakeFixedVecWithCudaMalloc<common::CompressedByteT>(
-        &ctx, n, static_cast<common::CompressedByteT>(0));
+  auto read_vec = [&] {
+    common::NvtxScopedRange range{common::NvtxEventAttr{"read-vec", common::NvtxRgb{127, 255, 0}}};
+    bst_idx_t n{0};
+    RET_IF_NOT(fi->Read(&n));
+    if (n == 0) {
+      return true;
+    }
+    impl->gidx_buffer = common::MakeFixedVecWithCudaMalloc<common::CompressedByteT>(&ctx, n);
     RET_IF_NOT(fi->Read(impl->gidx_buffer.data(), impl->gidx_buffer.size_bytes()));
-  }
+    return true;
+  };
+  RET_IF_NOT(read_vec());
+
+  RET_IF_NOT(fi->Read(&impl->n_rows));
+  RET_IF_NOT(fi->Read(&impl->is_dense));
+  RET_IF_NOT(fi->Read(&impl->row_stride));
   RET_IF_NOT(fi->Read(&impl->base_rowid));
   dh::DefaultStream().Sync();
 
@@ -108,19 +122,27 @@ template <typename T>
 [[nodiscard]] std::size_t EllpackPageRawFormat::Write(const EllpackPage& page,
                                                      EllpackHostCacheStream* fo) const {
+  xgboost_NVTX_FN_RANGE();
+
   bst_idx_t bytes{0};
   auto* impl = page.Impl();
-  bytes += fo->Write(impl->n_rows);
-  bytes += fo->Write(impl->is_dense);
-  bytes += fo->Write(impl->row_stride);
 
   // Write vector
-  bst_idx_t n = impl->gidx_buffer.size();
-  bytes += fo->Write(n);
+  auto write_vec = [&] {
+    common::NvtxScopedRange range{common::NvtxEventAttr{"write-vec", common::NvtxRgb{127, 255, 0}}};
+    bst_idx_t n = impl->gidx_buffer.size();
+    bytes += fo->Write(n);
 
-  if (!impl->gidx_buffer.empty()) {
-    bytes += fo->Write(impl->gidx_buffer.data(), impl->gidx_buffer.size_bytes());
-  }
+    if (!impl->gidx_buffer.empty()) {
+      bytes += fo->Write(impl->gidx_buffer.data(), impl->gidx_buffer.size_bytes());
+    }
+  };
+
+  write_vec();
+
+  bytes += fo->Write(impl->n_rows);
+  bytes += fo->Write(impl->is_dense);
+  bytes += fo->Write(impl->row_stride);
   bytes += fo->Write(impl->base_rowid);
   dh::DefaultStream().Sync();
 
diff --git a/tests/ci_build/conda_env/macos_cpu_test.yml b/tests/ci_build/conda_env/macos_cpu_test.yml
index e2e377e2145d..5bca323af5f4 100644
--- a/tests/ci_build/conda_env/macos_cpu_test.yml
+++ b/tests/ci_build/conda_env/macos_cpu_test.yml
@@ -37,4 +37,5 @@ dependencies:
   - pyspark>=3.4.0
   - cloudpickle
   - pip:
+      - setuptools
       - sphinx_rtd_theme

From 6d9fcb771e1988d04c511c95b77350b05f71427c Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Sun, 21 Jul 2024 14:10:13 +0800
Subject: [PATCH 49/51] Move device histogram storage into `histogram.cuh`.
 (#10608)

---
 src/tree/gpu_hist/histogram.cuh           | 119 ++++++++++++++++++++--
 src/tree/updater_gpu_hist.cu              | 113 +-------------------
 tests/cpp/helpers.cc                      |   2 +
 tests/cpp/helpers.h                       |   3 +
 tests/cpp/tree/gpu_hist/test_histogram.cu |  40 ++++++++
 tests/cpp/tree/test_gpu_hist.cu           |  61 ++---------
 6 files changed, 171 insertions(+), 167 deletions(-)

diff --git a/src/tree/gpu_hist/histogram.cuh b/src/tree/gpu_hist/histogram.cuh
index 862821b00b63..87c60a8bfdbc 100644
--- a/src/tree/gpu_hist/histogram.cuh
+++ b/src/tree/gpu_hist/histogram.cuh
@@ -5,12 +5,14 @@
 #define HISTOGRAM_CUH_
 #include <memory>  // for unique_ptr
 
-#include "../../common/cuda_context.cuh"  // for CUDAContext
-#include "../../data/ellpack_page.cuh"    // for EllpackDeviceAccessor
-#include "feature_groups.cuh"             // for FeatureGroupsAccessor
-#include "xgboost/base.h"                 // for GradientPair, GradientPairInt64
-#include "xgboost/context.h"              // for Context
-#include "xgboost/span.h"                 // for Span
+#include "../../common/cuda_context.cuh"    // for CUDAContext
+#include "../../common/device_helpers.cuh"  // for LaunchN
+#include "../../common/device_vector.cuh"   // for device_vector
+#include "../../data/ellpack_page.cuh"      // for EllpackDeviceAccessor
+#include "feature_groups.cuh"               // for FeatureGroupsAccessor
+#include "xgboost/base.h"                   // for GradientPair, GradientPairInt64
+#include "xgboost/context.h"                // for Context
+#include "xgboost/span.h"                   // for Span
 
 namespace xgboost::tree {
 /**
@@ -60,6 +62,111 @@ class GradientQuantiser {
   }
 };
 
+/**
+ * @brief Data storage for node histograms on device. Automatically expands.
+ *
+ * @tparam kStopGrowingSize  Do not grow beyond this size
+ *
+ * @author  Rory
+ * @date    28/07/2018
+ */
+template <std::size_t kStopGrowingSize>
+class DeviceHistogramStorage {
+ private:
+  using GradientSumT = GradientPairInt64;
+  /** @brief Map nidx to starting index of its histogram. */
+  std::map<int, size_t> nidx_map_;
+  // Large buffer of zeroed memory, caches histograms
+  dh::device_vector<typename GradientSumT::ValueT> data_;
+  // If we run out of storage allocate one histogram at a time
+  // in overflow. Not cached, overwritten when a new histogram
+  // is requested
+  dh::device_vector<typename GradientSumT::ValueT> overflow_;
+  std::map<int, size_t> overflow_nidx_map_;
+  int n_bins_;
+  DeviceOrd device_id_;
+  static constexpr size_t kNumItemsInGradientSum =
+      sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT);
+  static_assert(kNumItemsInGradientSum == 2, "Number of items in gradient type should be 2.");
+
+ public:
+  // Start with about 16mb
+  DeviceHistogramStorage() { data_.reserve(1 << 22); }
+  void Init(DeviceOrd device_id, int n_bins) {
+    this->n_bins_ = n_bins;
+    this->device_id_ = device_id;
+  }
+
+  void Reset(Context const* ctx) {
+    auto d_data = data_.data().get();
+    dh::LaunchN(data_.size(), ctx->CUDACtx()->Stream(),
+                [=] __device__(size_t idx) { d_data[idx] = 0.0f; });
+    nidx_map_.clear();
+    overflow_nidx_map_.clear();
+  }
+  [[nodiscard]] bool HistogramExists(int nidx) const {
+    return nidx_map_.find(nidx) != nidx_map_.cend() ||
+           overflow_nidx_map_.find(nidx) != overflow_nidx_map_.cend();
+  }
+  [[nodiscard]] int Bins() const { return n_bins_; }
+  [[nodiscard]] size_t HistogramSize() const { return n_bins_ * kNumItemsInGradientSum; }
+  dh::device_vector<typename GradientSumT::ValueT>& Data() { return data_; }
+
+  void AllocateHistograms(Context const* ctx, const std::vector<int>& new_nidxs) {
+    for (int nidx : new_nidxs) {
+      CHECK(!HistogramExists(nidx));
+    }
+    // Number of items currently used in data
+    const size_t used_size = nidx_map_.size() * HistogramSize();
+    const size_t new_used_size = used_size + HistogramSize() * new_nidxs.size();
+    if (used_size >= kStopGrowingSize) {
+      // Use overflow
+      // Delete previous entries
+      overflow_nidx_map_.clear();
+      overflow_.resize(HistogramSize() * new_nidxs.size());
+      // Zero memory
+      auto d_data = overflow_.data().get();
+      dh::LaunchN(overflow_.size(), ctx->CUDACtx()->Stream(),
+                  [=] __device__(size_t idx) { d_data[idx] = 0.0; });
+      // Append new histograms
+      for (int nidx : new_nidxs) {
+        overflow_nidx_map_[nidx] = overflow_nidx_map_.size() * HistogramSize();
+      }
+    } else {
+      CHECK_GE(data_.size(), used_size);
+      // Expand if necessary
+      if (data_.size() < new_used_size) {
+        data_.resize(std::max(data_.size() * 2, new_used_size));
+      }
+      // Append new histograms
+      for (int nidx : new_nidxs) {
+        nidx_map_[nidx] = nidx_map_.size() * HistogramSize();
+      }
+    }
+
+    CHECK_GE(data_.size(), nidx_map_.size() * HistogramSize());
+  }
+
+  /**
+   * \summary   Return pointer to histogram memory for a given node.
+   * \param nidx    Tree node index.
+   * \return    hist pointer.
+   */
+  common::Span<GradientSumT> GetNodeHistogram(int nidx) {
+    CHECK(this->HistogramExists(nidx));
+
+    if (nidx_map_.find(nidx) != nidx_map_.cend()) {
+      // Fetch from normal cache
+      auto ptr = data_.data().get() + nidx_map_.at(nidx);
+      return {reinterpret_cast<GradientSumT*>(ptr), static_cast<std::size_t>(n_bins_)};
+    } else {
+      // Fetch from overflow
+      auto ptr = overflow_.data().get() + overflow_nidx_map_.at(nidx);
+      return {reinterpret_cast<GradientSumT*>(ptr), static_cast<std::size_t>(n_bins_)};
+    }
+  }
+};
+
 class DeviceHistogramBuilderImpl;
 
 class DeviceHistogramBuilder {
diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu
index 7d566c3b40ae..83f84ec1f4a5 100644
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -49,113 +49,6 @@ namespace xgboost::tree {
 DMLC_REGISTRY_FILE_TAG(updater_gpu_hist);
 #endif  // !defined(GTEST_TEST)
 
-/**
- * \struct  DeviceHistogramStorage
- *
- * \summary Data storage for node histograms on device. Automatically expands.
- *
- * \tparam GradientSumT      histogram entry type.
- * \tparam kStopGrowingSize  Do not grow beyond this size
- *
- * \author  Rory
- * \date    28/07/2018
- */
-template <std::size_t kStopGrowingSize>
-class DeviceHistogramStorage {
- private:
-  using GradientSumT = GradientPairInt64;
-  /*! \brief Map nidx to starting index of its histogram. */
-  std::map<int, size_t> nidx_map_;
-  // Large buffer of zeroed memory, caches histograms
-  dh::device_vector<typename GradientSumT::ValueT> data_;
-  // If we run out of storage allocate one histogram at a time
-  // in overflow. Not cached, overwritten when a new histogram
-  // is requested
-  dh::device_vector<typename GradientSumT::ValueT> overflow_;
-  std::map<int, size_t> overflow_nidx_map_;
-  int n_bins_;
-  DeviceOrd device_id_;
-  static constexpr size_t kNumItemsInGradientSum =
-      sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT);
-  static_assert(kNumItemsInGradientSum == 2, "Number of items in gradient type should be 2.");
-
- public:
-  // Start with about 16mb
-  DeviceHistogramStorage() { data_.reserve(1 << 22); }
-  void Init(DeviceOrd device_id, int n_bins) {
-    this->n_bins_ = n_bins;
-    this->device_id_ = device_id;
-  }
-
-  void Reset() {
-    auto d_data = data_.data().get();
-    dh::LaunchN(data_.size(), [=] __device__(size_t idx) { d_data[idx] = 0.0f; });
-    nidx_map_.clear();
-    overflow_nidx_map_.clear();
-  }
-  [[nodiscard]] bool HistogramExists(int nidx) const {
-    return nidx_map_.find(nidx) != nidx_map_.cend() ||
-           overflow_nidx_map_.find(nidx) != overflow_nidx_map_.cend();
-  }
-  [[nodiscard]] int Bins() const { return n_bins_; }
-  [[nodiscard]] size_t HistogramSize() const { return n_bins_ * kNumItemsInGradientSum; }
-  dh::device_vector<typename GradientSumT::ValueT>& Data() { return data_; }
-
-  void AllocateHistograms(const std::vector<int>& new_nidxs) {
-    for (int nidx : new_nidxs) {
-      CHECK(!HistogramExists(nidx));
-    }
-    // Number of items currently used in data
-    const size_t used_size = nidx_map_.size() * HistogramSize();
-    const size_t new_used_size = used_size + HistogramSize() * new_nidxs.size();
-    if (used_size >= kStopGrowingSize) {
-      // Use overflow
-      // Delete previous entries
-      overflow_nidx_map_.clear();
-      overflow_.resize(HistogramSize() * new_nidxs.size());
-      // Zero memory
-      auto d_data = overflow_.data().get();
-      dh::LaunchN(overflow_.size(),
-                  [=] __device__(size_t idx) { d_data[idx] = 0.0; });
-      // Append new histograms
-      for (int nidx : new_nidxs) {
-        overflow_nidx_map_[nidx] = overflow_nidx_map_.size() * HistogramSize();
-      }
-    } else {
-      CHECK_GE(data_.size(), used_size);
-      // Expand if necessary
-      if (data_.size() < new_used_size) {
-        data_.resize(std::max(data_.size() * 2, new_used_size));
-      }
-      // Append new histograms
-      for (int nidx : new_nidxs) {
-        nidx_map_[nidx] = nidx_map_.size() * HistogramSize();
-      }
-    }
-
-    CHECK_GE(data_.size(), nidx_map_.size() * HistogramSize());
-  }
-
-  /**
-   * \summary   Return pointer to histogram memory for a given node.
-   * \param nidx    Tree node index.
-   * \return    hist pointer.
-   */
-  common::Span<GradientSumT> GetNodeHistogram(int nidx) {
-    CHECK(this->HistogramExists(nidx));
-
-    if (nidx_map_.find(nidx) != nidx_map_.cend()) {
-      // Fetch from normal cache
-      auto ptr = data_.data().get() + nidx_map_.at(nidx);
-      return {reinterpret_cast<GradientSumT*>(ptr), static_cast<std::size_t>(n_bins_)};
-    } else {
-      // Fetch from overflow
-      auto ptr = overflow_.data().get() + overflow_nidx_map_.at(nidx);
-      return {reinterpret_cast<GradientSumT*>(ptr), static_cast<std::size_t>(n_bins_)};
-    }
-  }
-};
-
 // Manage memory for a single GPU
 struct GPUHistMakerDevice {
  private:
@@ -258,7 +151,7 @@ struct GPUHistMakerDevice {
 
     // Init histogram
     hist.Init(ctx_->Device(), page->Cuts().TotalBins());
-    hist.Reset();
+    hist.Reset(ctx_);
 
     this->InitFeatureGroupsOnce();
 
@@ -657,7 +550,7 @@ struct GPUHistMakerDevice {
     all_new.insert(all_new.end(), subtraction_nidx.begin(), subtraction_nidx.end());
     // Allocate the histograms
     // Guaranteed contiguous memory
-    hist.AllocateHistograms(all_new);
+    hist.AllocateHistograms(ctx_, all_new);
 
     for (auto nidx : hist_nidx) {
       this->BuildHist(nidx);
@@ -748,7 +641,7 @@ struct GPUHistMakerDevice {
         ctx_, info_, linalg::MakeVec(reinterpret_cast<std::int64_t*>(&root_sum_quantised), 2));
     collective::SafeColl(rc);
 
-    hist.AllocateHistograms({kRootNIdx});
+    hist.AllocateHistograms(ctx_, {kRootNIdx});
     this->BuildHist(kRootNIdx);
     this->AllReduceHist(kRootNIdx, 1);
diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc
index 9b988f9605bd..eebbaf8ef795 100644
--- a/tests/cpp/helpers.cc
+++ b/tests/cpp/helpers.cc
@@ -763,4 +763,6 @@ void DeleteRMMResource(RMMAllocator*) {}
 RMMAllocatorPtr SetUpRMMResourceForCppTests(int, char**) { return {nullptr, DeleteRMMResource}; }
 #endif  // !defined(XGBOOST_USE_RMM) || XGBOOST_USE_RMM != 1
+
+std::int32_t DistGpuIdx() { return common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank(); }
 }  // namespace xgboost
diff --git a/tests/cpp/helpers.h b/tests/cpp/helpers.h
index b2e9e08cd80c..2821a11380c8 100644
--- a/tests/cpp/helpers.h
+++ b/tests/cpp/helpers.h
@@ -526,6 +526,9 @@ inline std::int32_t AllThreadsForTest() { return Context{}.Threads(); }
 
 inline DeviceOrd FstCU() { return DeviceOrd::CUDA(0); }
 
+// GPU device ordinal for distributed tests
+std::int32_t DistGpuIdx();
+
 inline auto GMockThrow(StringView msg) {
   return ::testing::ThrowsMessage<dmlc::Error>(::testing::HasSubstr(msg));
 }
diff --git a/tests/cpp/tree/gpu_hist/test_histogram.cu b/tests/cpp/tree/gpu_hist/test_histogram.cu
index 860e4bfd4ea0..c9320f616983 100644
--- a/tests/cpp/tree/gpu_hist/test_histogram.cu
+++ b/tests/cpp/tree/gpu_hist/test_histogram.cu
@@ -14,6 +14,46 @@
 #include "../../helpers.h"
 
 namespace xgboost::tree {
+TEST(Histogram, DeviceHistogramStorage) {
+  // Ensures that node allocates correctly after reaching `kStopGrowingSize`.
+  auto ctx = MakeCUDACtx(0);
+  constexpr size_t kNBins = 128;
+  constexpr int kNNodes = 4;
+  constexpr size_t kStopGrowing = kNNodes * kNBins * 2u;
+  DeviceHistogramStorage<kStopGrowing> histogram;
+  histogram.Init(FstCU(), kNBins);
+  for (int i = 0; i < kNNodes; ++i) {
+    histogram.AllocateHistograms(&ctx, {i});
+  }
+  histogram.Reset(&ctx);
+  ASSERT_EQ(histogram.Data().size(), kStopGrowing);
+
+  // Use allocated memory but do not erase nidx_map.
+  for (int i = 0; i < kNNodes; ++i) {
+    histogram.AllocateHistograms(&ctx, {i});
+  }
+  for (int i = 0; i < kNNodes; ++i) {
+    ASSERT_TRUE(histogram.HistogramExists(i));
+  }
+
+  // Add two new nodes
+  histogram.AllocateHistograms(&ctx, {kNNodes});
+  histogram.AllocateHistograms(&ctx, {kNNodes + 1});
+
+  // Old cached nodes should still exist
+  for (int i = 0; i < kNNodes; ++i) {
+    ASSERT_TRUE(histogram.HistogramExists(i));
+  }
+
+  // Should be deleted
+  ASSERT_FALSE(histogram.HistogramExists(kNNodes));
+  // Most recent node should exist
+  ASSERT_TRUE(histogram.HistogramExists(kNNodes + 1));
+
+  // Add same node again - should fail
+  EXPECT_ANY_THROW(histogram.AllocateHistograms(&ctx, {kNNodes + 1}););
+}
+
 void TestDeterministicHistogram(bool is_dense, int shm_size, bool force_global) {
   Context ctx = MakeCUDACtx(0);
   size_t constexpr kBins = 256, kCols = 120, kRows = 16384, kRounds = 16;
diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu
index 291b46edea36..728fb62c46d4 100644
--- a/tests/cpp/tree/test_gpu_hist.cu
+++ b/tests/cpp/tree/test_gpu_hist.cu
@@ -6,7 +6,6 @@
 #include
 #include
 
-#include
 #include
 #include
 
@@ -23,46 +22,6 @@
 #include "xgboost/json.h"
 
 namespace xgboost::tree {
-TEST(GpuHist, DeviceHistogramStorage) {
-  // Ensures that node allocates correctly after reaching `kStopGrowingSize`.
-  dh::safe_cuda(cudaSetDevice(0));
-  constexpr size_t kNBins = 128;
-  constexpr int kNNodes = 4;
-  constexpr size_t kStopGrowing = kNNodes * kNBins * 2u;
-  DeviceHistogramStorage<kStopGrowing> histogram;
-  histogram.Init(FstCU(), kNBins);
-  for (int i = 0; i < kNNodes; ++i) {
-    histogram.AllocateHistograms({i});
-  }
-  histogram.Reset();
-  ASSERT_EQ(histogram.Data().size(), kStopGrowing);
-
-  // Use allocated memory but do not erase nidx_map.
-  for (int i = 0; i < kNNodes; ++i) {
-    histogram.AllocateHistograms({i});
-  }
-  for (int i = 0; i < kNNodes; ++i) {
-    ASSERT_TRUE(histogram.HistogramExists(i));
-  }
-
-  // Add two new nodes
-  histogram.AllocateHistograms({kNNodes});
-  histogram.AllocateHistograms({kNNodes + 1});
-
-  // Old cached nodes should still exist
-  for (int i = 0; i < kNNodes; ++i) {
-    ASSERT_TRUE(histogram.HistogramExists(i));
-  }
-
-  // Should be deleted
-  ASSERT_FALSE(histogram.HistogramExists(kNNodes));
-  // Most recent node should exist
-  ASSERT_TRUE(histogram.HistogramExists(kNNodes + 1));
-
-  // Add same node again - should fail
-  EXPECT_ANY_THROW(histogram.AllocateHistograms({kNNodes + 1}););
-}
-
 std::vector<GradientPairPrecise> GetHostHistGpair() {
   // 24 bins, 3 bins for each feature (column).
   std::vector<GradientPairPrecise> hist_gpair = {
@@ -108,7 +67,7 @@ void TestBuildHist(bool use_shared_memory_histograms) {
   maker.row_partitioner = std::make_unique<RowPartitioner>(&ctx, kNRows, 0);
 
   maker.hist.Init(ctx.Device(), page->Cuts().TotalBins());
-  maker.hist.AllocateHistograms({0});
+  maker.hist.AllocateHistograms(&ctx, {0});
   maker.gpair = gpair.DeviceSpan();
   maker.quantiser = std::make_unique<GradientQuantiser>(&ctx, maker.gpair, MetaInfo());
@@ -425,8 +384,8 @@ TEST(GpuHist, MaxDepth) {
 namespace {
 RegTree GetHistTree(Context const* ctx, DMatrix* dmat) {
   ObjInfo task{ObjInfo::kRegression};
-  GPUHistMaker hist_maker{ctx, &task};
-  hist_maker.Configure(Args{});
+  std::unique_ptr<TreeUpdater> hist_maker{TreeUpdater::Create("grow_gpu_hist", ctx, &task)};
+  hist_maker->Configure(Args{});
 
   TrainParam param;
   param.UpdateAllowUnknown(Args{});
@@ -436,8 +395,8 @@ RegTree GetHistTree(Context const* ctx, DMatrix* dmat) {
 
   std::vector<HostDeviceVector<bst_node_t>> position(1);
   RegTree tree;
-  hist_maker.Update(&param, &gpair, dmat, common::Span<HostDeviceVector<bst_node_t>>{position},
-                    {&tree});
+  hist_maker->Update(&param, &gpair, dmat, common::Span<HostDeviceVector<bst_node_t>>{position},
+                     {&tree});
   return tree;
 }
 
@@ -476,8 +435,8 @@ TEST_F(MGPUHistTest, HistColumnSplit) {
 namespace {
 RegTree GetApproxTree(Context const* ctx, DMatrix* dmat) {
   ObjInfo task{ObjInfo::kRegression};
-  GPUGlobalApproxMaker approx_maker{ctx, &task};
-  approx_maker.Configure(Args{});
+  std::unique_ptr<TreeUpdater> approx_maker{TreeUpdater::Create("grow_gpu_approx", ctx, &task)};
+  approx_maker->Configure(Args{});
 
   TrainParam param;
   param.UpdateAllowUnknown(Args{});
@@ -487,13 +446,13 @@ RegTree GetApproxTree(Context const* ctx, DMatrix* dmat) {
 
   std::vector<HostDeviceVector<bst_node_t>> position(1);
   RegTree tree;
-  approx_maker.Update(&param, &gpair, dmat, common::Span<HostDeviceVector<bst_node_t>>{position},
-                      {&tree});
+  approx_maker->Update(&param, &gpair, dmat, common::Span<HostDeviceVector<bst_node_t>>{position},
+                       {&tree});
   return tree;
 }
 
 void VerifyApproxColumnSplit(bst_idx_t rows, bst_feature_t cols, RegTree const& expected_tree) {
-  Context ctx(MakeCUDACtx(GPUIDX));
+  auto ctx = MakeCUDACtx(DistGpuIdx());
 
   auto Xy = RandomDataGenerator{rows, cols, 0}.GenerateDMatrix(true);
   auto const world_size = collective::GetWorldSize();

From ab022eed0dfd49fd1a08547fdfe5af245f7b6ba0 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Tue, 23 Jul 2024 02:26:18 +0800
Subject: [PATCH 50/51] Fix.

---
 src/common/host_device_vector.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/common/host_device_vector.cu b/src/common/host_device_vector.cu
index da4a55285765..c17a2b4d3eba 100644
--- a/src/common/host_device_vector.cu
+++ b/src/common/host_device_vector.cu
@@ -29,7 +29,7 @@ class HostDeviceVectorImpl {
     if (device.IsCUDA()) {
       gpu_access_ = GPUAccess::kWrite;
       SetDevice();
-      data_d_->Resize(size, v);
+      data_d_->resize(size, v);
     } else {
       data_h_.resize(size, v);
     }

From 1f658b2296edce62b6dcd9aed26af5f797b08d50 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Tue, 23 Jul 2024 02:32:26 +0800
Subject: [PATCH 51/51] Fix.
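
Remove the `make_batches` definition that was left behind in
`python-package/xgboost/testing/__init__.py`. The helper now lives in
`python-package/xgboost/testing/data.py` (moved there in the "Optionally
skip cupy on windows" patch) and is re-exported from the package
`__init__`, so existing call sites are unaffected. A minimal sanity
check, assuming an environment with this branch of xgboost installed:

    import numpy as np
    import xgboost.testing as tm

    # Three dense NumPy batches of shape (32, 4); pass use_cupy=True for
    # CuPy arrays instead.
    X, y, w = tm.make_batches(n_samples_per_batch=32, n_features=4, n_batches=3)
    assert len(X) == len(y) == len(w) == 3
    assert all(isinstance(batch, np.ndarray) for batch in X)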
---
 python-package/xgboost/testing/__init__.py | 29 ----------------------
 1 file changed, 29 deletions(-)

diff --git a/python-package/xgboost/testing/__init__.py b/python-package/xgboost/testing/__init__.py
index 0ed1f3e68431..b934e99e7981 100644
--- a/python-package/xgboost/testing/__init__.py
+++ b/python-package/xgboost/testing/__init__.py
@@ -258,35 +258,6 @@ def as_arrays(
     return X, y, w
 
 
-def make_batches(  # pylint: disable=too-many-arguments,too-many-locals
-    n_samples_per_batch: int,
-    n_features: int,
-    n_batches: int,
-    use_cupy: bool = False,
-    *,
-    vary_size: bool = False,
-    random_state: int = 1994,
-) -> Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray]]:
-    X = []
-    y = []
-    w = []
-    if use_cupy:
-        import cupy
-
-        rng = cupy.random.RandomState(random_state)
-    else:
-        rng = np.random.RandomState(random_state)
-    for i in range(n_batches):
-        n_samples = n_samples_per_batch + i * 10 if vary_size else n_samples_per_batch
-        _X = rng.randn(n_samples, n_features)
-        _y = rng.randn(n_samples)
-        _w = rng.uniform(low=0, high=1, size=n_samples)
-        X.append(_X)
-        y.append(_y)
-        w.append(_w)
-    return X, y, w
-
-
 def make_regression(
     n_samples: int, n_features: int, use_cupy: bool
 ) -> Tuple[ArrayLike, ArrayLike, ArrayLike]:
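
With the series applied, `make_batches` has a single home in
`xgboost.testing.data` and `no_cupy` degrades gracefully on Windows. A
sketch of how the two are meant to be combined in a test module,
following the `skipif(**tm.no_cupy())` convention used throughout
`xgboost.testing`; the test name and sizes here are illustrative
placeholders, not taken from the series:

    import pytest
    import xgboost.testing as tm

    # no_cupy() returns {"condition": bool, "reason": str}. On Windows
    # it now also probes a trivial kernel launch, so a broken CuPy
    # install skips the test instead of erroring out.
    @pytest.mark.skipif(**tm.no_cupy())
    def test_cupy_batches() -> None:
        X, y, w = tm.make_batches(8, 2, n_batches=2, use_cupy=True)
        assert len(X) == len(y) == len(w) == 2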