From c182c584ca1736f34941f66ef4364e9893aff6d4 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 13 Feb 2024 15:29:09 -0800 Subject: [PATCH 1/8] [CI] Update RAPIDS to latest stable (#10042) Co-authored-by: hcho3 --- tests/buildkite/conftest.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/buildkite/conftest.sh b/tests/buildkite/conftest.sh index 3df79b58d578..c6e8ef65a0e4 100755 --- a/tests/buildkite/conftest.sh +++ b/tests/buildkite/conftest.sh @@ -24,7 +24,7 @@ set -x CUDA_VERSION=11.8.0 NCCL_VERSION=2.16.5-1 -RAPIDS_VERSION=23.12 +RAPIDS_VERSION=24.02 SPARK_VERSION=3.4.0 JDK_VERSION=8 R_VERSION=4.3.2 From 7cc256e246e68a6c641ecb57a138e3c8a721c55e Mon Sep 17 00:00:00 2001 From: UncleLLD Date: Thu, 15 Feb 2024 02:01:38 +0800 Subject: [PATCH 2/8] update python intro doc (#10033) --- doc/python/python_intro.rst | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/doc/python/python_intro.rst b/doc/python/python_intro.rst index 0d26a5253a50..cfdd20da074d 100644 --- a/doc/python/python_intro.rst +++ b/doc/python/python_intro.rst @@ -63,7 +63,7 @@ The input data is stored in a :py:class:`DMatrix ` object. For .. code-block:: python - dtrain = xgb.DMatrix('train.svm.txt') + dtrain = xgb.DMatrix('train.svm.txt?format=libsvm') dtrain.save_binary('train.buffer') * Missing values can be replaced by a default value in the :py:class:`DMatrix ` constructor: @@ -86,7 +86,7 @@ to number of groups. .. code-block:: python - dtrain = xgb.DMatrix('train.svm.txt') + dtrain = xgb.DMatrix('train.svm.txt?format=libsvm') dtest = xgb.DMatrix('test.svm.buffer') The parser in XGBoost has limited functionality. When using Python interface, it's @@ -176,7 +176,6 @@ Support Matrix +-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+ | pyarrow.Table | NPA | NPA | NPA | NPA | NPA | NPA | +-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+ -+-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+ | _\_array\_\_ | NPA | F | NPA | NPA | H | | +-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+ | Others | SciCSR | F | | F | F | | @@ -240,7 +239,7 @@ A saved model can be loaded as follows: .. code-block:: python bst = xgb.Booster({'nthread': 4}) # init model - bst.load_model('model.bin') # load data + bst.load_model('model.bin') # load model data Methods including `update` and `boost` from `xgboost.Booster` are designed for internal usage only. 
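Prefer the high-level entry point instead; a minimal sketch (parameter values are illustrative only):

.. code-block:: python

    params = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
    # xgb.train drives update()/boost() internally, one round per iteration
    bst = xgb.train(params, dtrain, num_boost_round=10)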
The wrapper function `xgboost.train` does some From 057f03cacce1032c95eb335ee525bec564fad347 Mon Sep 17 00:00:00 2001 From: Dmitry Razdoburdin Date: Sun, 18 Feb 2024 21:27:15 +0100 Subject: [PATCH 3/8] [SYCL] Initial implementation of `GHistIndexMatrix` (#10045) Co-authored-by: Dmitry Razdoburdin <> --- plugin/CMakeLists.txt | 7 +- plugin/sycl/data.h | 7 +- plugin/sycl/data/gradient_index.cc | 177 +++++++++++++++ plugin/sycl/data/gradient_index.h | 216 +++++++++++++++++++ tests/cpp/plugin/sycl_helpers.h | 30 +++ tests/cpp/plugin/test_sycl_gradient_index.cc | 79 +++++++ 6 files changed, 510 insertions(+), 6 deletions(-) create mode 100644 plugin/sycl/data/gradient_index.cc create mode 100644 plugin/sycl/data/gradient_index.h create mode 100644 tests/cpp/plugin/sycl_helpers.h create mode 100644 tests/cpp/plugin/test_sycl_gradient_index.cc diff --git a/plugin/CMakeLists.txt b/plugin/CMakeLists.txt index e575f1a4197a..5d20e120e902 100644 --- a/plugin/CMakeLists.txt +++ b/plugin/CMakeLists.txt @@ -1,10 +1,7 @@ if(PLUGIN_SYCL) set(CMAKE_CXX_COMPILER "icpx") - add_library(plugin_sycl OBJECT - ${xgboost_SOURCE_DIR}/plugin/sycl/objective/regression_obj.cc - ${xgboost_SOURCE_DIR}/plugin/sycl/objective/multiclass_obj.cc - ${xgboost_SOURCE_DIR}/plugin/sycl/device_manager.cc - ${xgboost_SOURCE_DIR}/plugin/sycl/predictor/predictor.cc) + file(GLOB_RECURSE SYCL_SOURCES "sycl/*.cc") + add_library(plugin_sycl OBJECT ${SYCL_SOURCES}) target_include_directories(plugin_sycl PRIVATE ${xgboost_SOURCE_DIR}/include diff --git a/plugin/sycl/data.h b/plugin/sycl/data.h index 489fde989823..37d5842bf9a4 100644 --- a/plugin/sycl/data.h +++ b/plugin/sycl/data.h @@ -26,8 +26,13 @@ namespace xgboost { namespace sycl { -enum class MemoryType { shared, on_device}; +template +using AtomicRef = ::sycl::atomic_ref; +enum class MemoryType { shared, on_device}; template class USMDeleter { diff --git a/plugin/sycl/data/gradient_index.cc b/plugin/sycl/data/gradient_index.cc new file mode 100644 index 000000000000..49b66a71052f --- /dev/null +++ b/plugin/sycl/data/gradient_index.cc @@ -0,0 +1,177 @@ +/*! + * Copyright 2017-2024 by Contributors + * \file gradient_index.cc + */ +#include +#include +#include + +#include "gradient_index.h" + +#include + +namespace xgboost { +namespace sycl { +namespace common { + +uint32_t SearchBin(const bst_float* cut_values, const uint32_t* cut_ptrs, Entry const& e) { + auto beg = cut_ptrs[e.index]; + auto end = cut_ptrs[e.index + 1]; + auto it = std::upper_bound(cut_values + beg, cut_values + end, e.fvalue); + uint32_t idx = it - cut_values; + if (idx == end) { + idx -= 1; + } + return idx; +} + +template +void mergeSort(BinIdxType* begin, BinIdxType* end, BinIdxType* buf) { + const size_t total_len = end - begin; + for (size_t block_len = 1; block_len < total_len; block_len <<= 1) { + for (size_t cur_block = 0; cur_block + block_len < total_len; cur_block += 2 * block_len) { + size_t start = cur_block; + size_t mid = start + block_len; + size_t finish = mid + block_len < total_len ? 
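+        // the final right-hand block may be shorter than block_len, so clamp to total_len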
mid + block_len : total_len; + size_t left_pos = start; + size_t right_pos = mid; + size_t pos = start; + while (left_pos < mid || right_pos < finish) { + if (left_pos < mid && (right_pos == finish || begin[left_pos] < begin[right_pos])) { + buf[pos++] = begin[left_pos++]; + } else { + buf[pos++] = begin[right_pos++]; + } + } + for (size_t i = start; i < finish; i++) begin[i] = buf[i]; + } + } +} + +template +void GHistIndexMatrix::SetIndexData(::sycl::queue qu, + BinIdxType* index_data, + const DeviceMatrix &dmat, + size_t nbins, + size_t row_stride, + uint32_t* offsets) { + if (nbins == 0) return; + const xgboost::Entry *data_ptr = dmat.data.DataConst(); + const bst_row_t *offset_vec = dmat.row_ptr.DataConst(); + const size_t num_rows = dmat.row_ptr.Size() - 1; + const bst_float* cut_values = cut_device.Values().DataConst(); + const uint32_t* cut_ptrs = cut_device.Ptrs().DataConst(); + size_t* hit_count_ptr = hit_count_buff.Data(); + + // Sparse case only + if (!offsets) { + // sort_buff has type uint8_t + sort_buff.Resize(&qu, num_rows * row_stride * sizeof(BinIdxType)); + } + BinIdxType* sort_data = reinterpret_cast(sort_buff.Data()); + + auto event = qu.submit([&](::sycl::handler& cgh) { + cgh.parallel_for<>(::sycl::range<1>(num_rows), [=](::sycl::item<1> pid) { + const size_t i = pid.get_id(0); + const size_t ibegin = offset_vec[i]; + const size_t iend = offset_vec[i + 1]; + const size_t size = iend - ibegin; + const size_t start = i * row_stride; + for (bst_uint j = 0; j < size; ++j) { + uint32_t idx = SearchBin(cut_values, cut_ptrs, data_ptr[ibegin + j]); + index_data[start + j] = offsets ? idx - offsets[j] : idx; + AtomicRef hit_count_ref(hit_count_ptr[idx]); + hit_count_ref.fetch_add(1); + } + if (!offsets) { + // Sparse case only + mergeSort(index_data + start, index_data + start + size, sort_data + start); + for (bst_uint j = size; j < row_stride; ++j) { + index_data[start + j] = nbins; + } + } + }); + }); + qu.memcpy(hit_count.data(), hit_count_ptr, nbins * sizeof(size_t), event); + qu.wait(); +} + +void GHistIndexMatrix::ResizeIndex(size_t n_index, bool isDense) { + if ((max_num_bins - 1 <= static_cast(std::numeric_limits::max())) && isDense) { + index.SetBinTypeSize(BinTypeSize::kUint8BinsTypeSize); + index.Resize((sizeof(uint8_t)) * n_index); + } else if ((max_num_bins - 1 > static_cast(std::numeric_limits::max()) && + max_num_bins - 1 <= static_cast(std::numeric_limits::max())) && isDense) { + index.SetBinTypeSize(BinTypeSize::kUint16BinsTypeSize); + index.Resize((sizeof(uint16_t)) * n_index); + } else { + index.SetBinTypeSize(BinTypeSize::kUint32BinsTypeSize); + index.Resize((sizeof(uint32_t)) * n_index); + } +} + +void GHistIndexMatrix::Init(::sycl::queue qu, + Context const * ctx, + const DeviceMatrix& p_fmat_device, + int max_bins) { + nfeatures = p_fmat_device.p_mat->Info().num_col_; + + cut = xgboost::common::SketchOnDMatrix(ctx, p_fmat_device.p_mat, max_bins); + cut_device.Init(qu, cut); + + max_num_bins = max_bins; + const uint32_t nbins = cut.Ptrs().back(); + this->nbins = nbins; + hit_count.resize(nbins, 0); + hit_count_buff.Resize(&qu, nbins, 0); + + this->p_fmat = p_fmat_device.p_mat; + const bool isDense = p_fmat_device.p_mat->IsDense(); + this->isDense_ = isDense; + + index.setQueue(qu); + + row_stride = 0; + for (const auto& batch : p_fmat_device.p_mat->GetBatches()) { + const auto& row_offset = batch.offset.ConstHostVector(); + for (auto i = 1ull; i < row_offset.size(); i++) { + row_stride = std::max(row_stride, static_cast(row_offset[i] - row_offset[i - 
1])); + } + } + + const size_t n_offsets = cut_device.Ptrs().Size() - 1; + const size_t n_rows = p_fmat_device.row_ptr.Size() - 1; + const size_t n_index = n_rows * row_stride; + ResizeIndex(n_index, isDense); + + CHECK_GT(cut_device.Values().Size(), 0U); + + uint32_t* offsets = nullptr; + if (isDense) { + index.ResizeOffset(n_offsets); + offsets = index.Offset(); + qu.memcpy(offsets, cut_device.Ptrs().DataConst(), + sizeof(uint32_t) * n_offsets).wait_and_throw(); + } + + if (isDense) { + BinTypeSize curent_bin_size = index.GetBinTypeSize(); + if (curent_bin_size == BinTypeSize::kUint8BinsTypeSize) { + SetIndexData(qu, index.data(), p_fmat_device, nbins, row_stride, offsets); + + } else if (curent_bin_size == BinTypeSize::kUint16BinsTypeSize) { + SetIndexData(qu, index.data(), p_fmat_device, nbins, row_stride, offsets); + } else { + CHECK_EQ(curent_bin_size, BinTypeSize::kUint32BinsTypeSize); + SetIndexData(qu, index.data(), p_fmat_device, nbins, row_stride, offsets); + } + /* For sparse DMatrix we have to store index of feature for each bin + in index field to chose right offset. So offset is nullptr and index is not reduced */ + } else { + SetIndexData(qu, index.data(), p_fmat_device, nbins, row_stride, offsets); + } +} + +} // namespace common +} // namespace sycl +} // namespace xgboost diff --git a/plugin/sycl/data/gradient_index.h b/plugin/sycl/data/gradient_index.h new file mode 100644 index 000000000000..13577025caa0 --- /dev/null +++ b/plugin/sycl/data/gradient_index.h @@ -0,0 +1,216 @@ +/*! + * Copyright 2017-2024 by Contributors + * \file gradient_index.h + */ +#ifndef PLUGIN_SYCL_DATA_GRADIENT_INDEX_H_ +#define PLUGIN_SYCL_DATA_GRADIENT_INDEX_H_ + +#include + +#include "../data.h" +#include "../../src/common/hist_util.h" + +#include + +namespace xgboost { +namespace sycl { +namespace common { + +/*! + * \brief SYCL implementation of HistogramCuts stored in USM buffers to provide access from device kernels + */ +class HistogramCuts { + protected: + using BinIdx = uint32_t; + + public: + HistogramCuts() {} + + explicit HistogramCuts(::sycl::queue qu) {} + + ~HistogramCuts() { + } + + void Init(::sycl::queue qu, xgboost::common::HistogramCuts const& cuts) { + qu_ = qu; + cut_values_.Init(&qu_, cuts.cut_values_.HostVector()); + cut_ptrs_.Init(&qu_, cuts.cut_ptrs_.HostVector()); + min_vals_.Init(&qu_, cuts.min_vals_.HostVector()); + } + + // Getters for USM buffers to pass pointers into device kernels + const USMVector& Ptrs() const { return cut_ptrs_; } + const USMVector& Values() const { return cut_values_; } + const USMVector& MinValues() const { return min_vals_; } + + private: + USMVector cut_values_; + USMVector cut_ptrs_; + USMVector min_vals_; + ::sycl::queue qu_; +}; + +using BinTypeSize = ::xgboost::common::BinTypeSize; + +/*! 
+ * \brief Index data and offsets stored in USM buffers to provide access from device kernels + */ +struct Index { + Index() { + SetBinTypeSize(binTypeSize_); + } + Index(const Index& i) = delete; + Index& operator=(Index i) = delete; + Index(Index&& i) = delete; + Index& operator=(Index&& i) = delete; + uint32_t operator[](size_t i) const { + if (!offset_.Empty()) { + return func_(data_.DataConst(), i) + offset_[i%p_]; + } else { + return func_(data_.DataConst(), i); + } + } + void SetBinTypeSize(BinTypeSize binTypeSize) { + binTypeSize_ = binTypeSize; + switch (binTypeSize) { + case BinTypeSize::kUint8BinsTypeSize: + func_ = &GetValueFromUint8; + break; + case BinTypeSize::kUint16BinsTypeSize: + func_ = &GetValueFromUint16; + break; + case BinTypeSize::kUint32BinsTypeSize: + func_ = &GetValueFromUint32; + break; + default: + CHECK(binTypeSize == BinTypeSize::kUint8BinsTypeSize || + binTypeSize == BinTypeSize::kUint16BinsTypeSize || + binTypeSize == BinTypeSize::kUint32BinsTypeSize); + } + } + BinTypeSize GetBinTypeSize() const { + return binTypeSize_; + } + + template + T* data() { + return reinterpret_cast(data_.Data()); + } + + template + const T* data() const { + return reinterpret_cast(data_.DataConst()); + } + + uint32_t* Offset() { + return offset_.Data(); + } + + const uint32_t* Offset() const { + return offset_.DataConst(); + } + + size_t Size() const { + return data_.Size() / (binTypeSize_); + } + + void Resize(const size_t nBytesData) { + data_.Resize(&qu_, nBytesData); + } + + void ResizeOffset(const size_t nDisps) { + offset_.Resize(&qu_, nDisps); + p_ = nDisps; + } + + uint8_t* begin() const { + return data_.Begin(); + } + + uint8_t* end() const { + return data_.End(); + } + + void setQueue(::sycl::queue qu) { + qu_ = qu; + } + + private: + static uint32_t GetValueFromUint8(const uint8_t* t, size_t i) { + return reinterpret_cast(t)[i]; + } + static uint32_t GetValueFromUint16(const uint8_t* t, size_t i) { + return reinterpret_cast(t)[i]; + } + static uint32_t GetValueFromUint32(const uint8_t* t, size_t i) { + return reinterpret_cast(t)[i]; + } + + using Func = uint32_t (*)(const uint8_t*, size_t); + + USMVector data_; + // size of this field is equal to number of features + USMVector offset_; + BinTypeSize binTypeSize_ {BinTypeSize::kUint8BinsTypeSize}; + size_t p_ {1}; + Func func_; + + ::sycl::queue qu_; +}; + +/*! + * \brief Preprocessed global index matrix, in CSR format, stored in USM buffers + * + * Transform floating values to integer index in histogram + */ +struct GHistIndexMatrix { + /*! \brief row pointer to rows by element position */ + /*! \brief The index data */ + Index index; + /*! \brief hit count of each index */ + std::vector hit_count; + /*! \brief buffers for calculations */ + USMVector hit_count_buff; + USMVector sort_buff; + /*! 
\brief The corresponding cuts */ + xgboost::common::HistogramCuts cut; + HistogramCuts cut_device; + DMatrix* p_fmat; + size_t max_num_bins; + size_t nbins; + size_t nfeatures; + size_t row_stride; + + // Create a global histogram matrix based on a given DMatrix device wrapper + void Init(::sycl::queue qu, Context const * ctx, + const sycl::DeviceMatrix& p_fmat_device, int max_num_bins); + + template + void SetIndexData(::sycl::queue qu, BinIdxType* index_data, + const sycl::DeviceMatrix &dmat_device, + size_t nbins, size_t row_stride, uint32_t* offsets); + + void ResizeIndex(size_t n_index, bool isDense); + + inline void GetFeatureCounts(size_t* counts) const { + auto nfeature = cut_device.Ptrs().Size() - 1; + for (unsigned fid = 0; fid < nfeature; ++fid) { + auto ibegin = cut_device.Ptrs()[fid]; + auto iend = cut_device.Ptrs()[fid + 1]; + for (auto i = ibegin; i < iend; ++i) { + *(counts + fid) += hit_count[i]; + } + } + } + inline bool IsDense() const { + return isDense_; + } + + private: + bool isDense_; +}; + +} // namespace common +} // namespace sycl +} // namespace xgboost +#endif // PLUGIN_SYCL_DATA_GRADIENT_INDEX_H_ diff --git a/tests/cpp/plugin/sycl_helpers.h b/tests/cpp/plugin/sycl_helpers.h new file mode 100644 index 000000000000..c5cdd3ea5b08 --- /dev/null +++ b/tests/cpp/plugin/sycl_helpers.h @@ -0,0 +1,30 @@ +/*! + * Copyright 2022-2024 XGBoost contributors + */ +#pragma once + +#include "../helpers.h" + +namespace xgboost::sycl { +template +void VerifySyclVector(const USMVector& sycl_vector, + const Container& host_vector) { + ASSERT_EQ(sycl_vector.Size(), host_vector.size()); + + size_t size = sycl_vector.Size(); + for (size_t i = 0; i < size; ++i) { + ASSERT_EQ(sycl_vector[i], host_vector[i]); + } +} + +template +void VerifySyclVector(const std::vector& sycl_vector, const Container& host_vector) { + ASSERT_EQ(sycl_vector.size(), host_vector.size()); + + size_t size = sycl_vector.size(); + for (size_t i = 0; i < size; ++i) { + ASSERT_EQ(sycl_vector[i], host_vector[i]); + } +} + +} // namespace xgboost::sycl diff --git a/tests/cpp/plugin/test_sycl_gradient_index.cc b/tests/cpp/plugin/test_sycl_gradient_index.cc new file mode 100644 index 000000000000..35fc7fbbe345 --- /dev/null +++ b/tests/cpp/plugin/test_sycl_gradient_index.cc @@ -0,0 +1,79 @@ +/** + * Copyright 2021-2024 by XGBoost contributors + */ + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wtautological-constant-compare" +#pragma GCC diagnostic ignored "-W#pragma-messages" +#include "../../../src/data/gradient_index.h" // for GHistIndexMatrix +#pragma GCC diagnostic pop + +#include "../../../plugin/sycl/data/gradient_index.h" +#include "../../../plugin/sycl/device_manager.h" +#include "sycl_helpers.h" +#include "../helpers.h" + +namespace xgboost::sycl::data { + +TEST(SyclGradientIndex, HistogramCuts) { + size_t max_bins = 8; + + Context ctx; + ctx.UpdateAllowUnknown(Args{{"device", "sycl"}}); + + DeviceManager device_manager; + auto qu = device_manager.GetQueue(ctx.Device()); + + auto p_fmat = RandomDataGenerator{512, 16, 0.5}.GenerateDMatrix(true); + + xgboost::common::HistogramCuts cut = + xgboost::common::SketchOnDMatrix(&ctx, p_fmat.get(), max_bins); + + common::HistogramCuts cut_sycl; + cut_sycl.Init(qu, cut); + + VerifySyclVector(cut_sycl.Ptrs(), cut.cut_ptrs_.HostVector()); + VerifySyclVector(cut_sycl.Values(), cut.cut_values_.HostVector()); + VerifySyclVector(cut_sycl.MinValues(), cut.min_vals_.HostVector()); +} + +TEST(SyclGradientIndex, Init) { + size_t n_rows = 128; + size_t n_columns 
= 7; + + Context ctx; + ctx.UpdateAllowUnknown(Args{{"device", "sycl"}}); + + DeviceManager device_manager; + auto qu = device_manager.GetQueue(ctx.Device()); + + auto p_fmat = RandomDataGenerator{n_rows, n_columns, 0.3}.GenerateDMatrix(); + + sycl::DeviceMatrix dmat(qu, p_fmat.get()); + + int max_bins = 256; + common::GHistIndexMatrix gmat_sycl; + gmat_sycl.Init(qu, &ctx, dmat, max_bins); + + xgboost::GHistIndexMatrix gmat{&ctx, p_fmat.get(), max_bins, 0.3, false}; + + { + ASSERT_EQ(gmat_sycl.max_num_bins, max_bins); + ASSERT_EQ(gmat_sycl.nfeatures, n_columns); + } + + { + VerifySyclVector(gmat_sycl.hit_count, gmat.hit_count); + } + + { + std::vector feature_count_sycl(n_columns, 0); + gmat_sycl.GetFeatureCounts(feature_count_sycl.data()); + + std::vector feature_count(n_columns, 0); + gmat.GetFeatureCounts(feature_count.data()); + VerifySyclVector(feature_count_sycl, feature_count); + } +} + +} // namespace xgboost::sycl::data From edf501d22767df8b1ac251877d0a3b2e1c7dc300 Mon Sep 17 00:00:00 2001 From: Louis Desreumaux Date: Mon, 19 Feb 2024 14:03:29 +0100 Subject: [PATCH 4/8] Implement contribution prediction with QuantileDMatrix (#10043) --------- Co-authored-by: Jiaming Yuan --- src/predictor/cpu_predictor.cc | 128 +++++++++++++--------- src/predictor/gpu_predictor.cu | 6 + tests/cpp/predictor/test_cpu_predictor.cc | 2 +- tests/cpp/predictor/test_predictor.cc | 25 ++++- tests/cpp/predictor/test_predictor.h | 3 +- tests/python/test_shap.py | 28 ++++- 6 files changed, 137 insertions(+), 55 deletions(-) diff --git a/src/predictor/cpu_predictor.cc b/src/predictor/cpu_predictor.cc index d97b527f0ce8..06b8079ee134 100644 --- a/src/predictor/cpu_predictor.cc +++ b/src/predictor/cpu_predictor.cc @@ -698,6 +698,67 @@ class CPUPredictor : public Predictor { } } + template + void PredictContributionKernel(DataView batch, const MetaInfo& info, + const gbm::GBTreeModel& model, + const std::vector* tree_weights, + std::vector>* mean_values, + std::vector* feat_vecs, + std::vector* contribs, uint32_t ntree_limit, + bool approximate, int condition, + unsigned condition_feature) const { + const int num_feature = model.learner_model_param->num_feature; + const int ngroup = model.learner_model_param->num_output_group; + CHECK_NE(ngroup, 0); + size_t const ncolumns = num_feature + 1; + CHECK_NE(ncolumns, 0); + auto base_margin = info.base_margin_.View(ctx_->Device()); + auto base_score = model.learner_model_param->BaseScore(ctx_->Device())(0); + + // parallel over local batch + common::ParallelFor(batch.Size(), this->ctx_->Threads(), [&](auto i) { + auto row_idx = batch.base_rowid + i; + RegTree::FVec &feats = (*feat_vecs)[omp_get_thread_num()]; + if (feats.Size() == 0) { + feats.Init(num_feature); + } + std::vector this_tree_contribs(ncolumns); + // loop over all classes + for (int gid = 0; gid < ngroup; ++gid) { + bst_float* p_contribs = &(*contribs)[(row_idx * ngroup + gid) * ncolumns]; + feats.Fill(batch[i]); + // calculate contributions + for (unsigned j = 0; j < ntree_limit; ++j) { + auto *tree_mean_values = &mean_values->at(j); + std::fill(this_tree_contribs.begin(), this_tree_contribs.end(), 0); + if (model.tree_info[j] != gid) { + continue; + } + if (!approximate) { + CalculateContributions(*model.trees[j], feats, tree_mean_values, + &this_tree_contribs[0], condition, condition_feature); + } else { + model.trees[j]->CalculateContributionsApprox( + feats, tree_mean_values, &this_tree_contribs[0]); + } + for (size_t ci = 0; ci < ncolumns; ++ci) { + p_contribs[ci] += + this_tree_contribs[ci] * + 
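+                  // optional per-tree weight: the dart booster scales each tree's output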
(tree_weights == nullptr ? 1 : (*tree_weights)[j]); + } + } + feats.Drop(); + // add base margin to BIAS + if (base_margin.Size() != 0) { + CHECK_EQ(base_margin.Shape(1), ngroup); + p_contribs[ncolumns - 1] += base_margin(row_idx, gid); + } else { + p_contribs[ncolumns - 1] += base_score; + } + } + }); + } + public: explicit CPUPredictor(Context const *ctx) : Predictor::Predictor{ctx} {} @@ -861,7 +922,6 @@ class CPUPredictor : public Predictor { CHECK(!p_fmat->Info().IsColumnSplit()) << "Predict contribution support for column-wise data split is not yet implemented."; auto const n_threads = this->ctx_->Threads(); - const int num_feature = model.learner_model_param->num_feature; std::vector feat_vecs; InitThreadTemp(n_threads, &feat_vecs); const MetaInfo& info = p_fmat->Info(); @@ -869,10 +929,7 @@ class CPUPredictor : public Predictor { if (ntree_limit == 0 || ntree_limit > model.trees.size()) { ntree_limit = static_cast(model.trees.size()); } - const int ngroup = model.learner_model_param->num_output_group; - CHECK_NE(ngroup, 0); - size_t const ncolumns = num_feature + 1; - CHECK_NE(ncolumns, 0); + size_t const ncolumns = model.learner_model_param->num_feature + 1; // allocate space for (number of features + bias) times the number of rows std::vector& contribs = out_contribs->HostVector(); contribs.resize(info.num_row_ * ncolumns * model.learner_model_param->num_output_group); @@ -884,53 +941,22 @@ class CPUPredictor : public Predictor { common::ParallelFor(ntree_limit, n_threads, [&](bst_omp_uint i) { FillNodeMeanValues(model.trees[i].get(), &(mean_values[i])); }); - auto base_margin = info.base_margin_.View(ctx_->Device()); - auto base_score = model.learner_model_param->BaseScore(ctx_->Device())(0); // start collecting the contributions - for (const auto &batch : p_fmat->GetBatches()) { - auto page = batch.GetView(); - // parallel over local batch - common::ParallelFor(batch.Size(), n_threads, [&](auto i) { - auto row_idx = batch.base_rowid + i; - RegTree::FVec &feats = feat_vecs[omp_get_thread_num()]; - if (feats.Size() == 0) { - feats.Init(num_feature); - } - std::vector this_tree_contribs(ncolumns); - // loop over all classes - for (int gid = 0; gid < ngroup; ++gid) { - bst_float* p_contribs = &contribs[(row_idx * ngroup + gid) * ncolumns]; - feats.Fill(page[i]); - // calculate contributions - for (unsigned j = 0; j < ntree_limit; ++j) { - auto *tree_mean_values = &mean_values.at(j); - std::fill(this_tree_contribs.begin(), this_tree_contribs.end(), 0); - if (model.tree_info[j] != gid) { - continue; - } - if (!approximate) { - CalculateContributions(*model.trees[j], feats, tree_mean_values, - &this_tree_contribs[0], condition, condition_feature); - } else { - model.trees[j]->CalculateContributionsApprox( - feats, tree_mean_values, &this_tree_contribs[0]); - } - for (size_t ci = 0; ci < ncolumns; ++ci) { - p_contribs[ci] += - this_tree_contribs[ci] * - (tree_weights == nullptr ? 
1 : (*tree_weights)[j]); - } - } - feats.Drop(); - // add base margin to BIAS - if (base_margin.Size() != 0) { - CHECK_EQ(base_margin.Shape(1), ngroup); - p_contribs[ncolumns - 1] += base_margin(row_idx, gid); - } else { - p_contribs[ncolumns - 1] += base_score; - } - } - }); + if (!p_fmat->PageExists()) { + std::vector workspace(info.num_col_ * kUnroll * n_threads); + auto ft = p_fmat->Info().feature_types.ConstHostVector(); + for (const auto &batch : p_fmat->GetBatches(ctx_, {})) { + PredictContributionKernel( + GHistIndexMatrixView{batch, info.num_col_, ft, workspace, n_threads}, + info, model, tree_weights, &mean_values, &feat_vecs, &contribs, ntree_limit, + approximate, condition, condition_feature); + } + } else { + for (const auto &batch : p_fmat->GetBatches()) { + PredictContributionKernel( + SparsePageView{&batch}, info, model, tree_weights, &mean_values, &feat_vecs, + &contribs, ntree_limit, approximate, condition, condition_feature); + } } } diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu index 7fad0739758a..7dcb5b5fc0f8 100644 --- a/src/predictor/gpu_predictor.cu +++ b/src/predictor/gpu_predictor.cu @@ -1042,6 +1042,9 @@ class GPUPredictor : public xgboost::Predictor { if (tree_weights != nullptr) { LOG(FATAL) << "Dart booster feature " << not_implemented; } + if (!p_fmat->PageExists()) { + LOG(FATAL) << "SHAP value for QuantileDMatrix is not yet implemented for GPU."; + } CHECK(!p_fmat->Info().IsColumnSplit()) << "Predict contribution support for column-wise data split is not yet implemented."; dh::safe_cuda(cudaSetDevice(ctx_->Ordinal())); @@ -1102,6 +1105,9 @@ class GPUPredictor : public xgboost::Predictor { if (tree_weights != nullptr) { LOG(FATAL) << "Dart booster feature " << not_implemented; } + if (!p_fmat->PageExists()) { + LOG(FATAL) << "SHAP value for QuantileDMatrix is not yet implemented for GPU."; + } dh::safe_cuda(cudaSetDevice(ctx_->Ordinal())); out_contribs->SetDevice(ctx_->Device()); if (tree_end == 0 || tree_end > model.trees.size()) { diff --git a/tests/cpp/predictor/test_cpu_predictor.cc b/tests/cpp/predictor/test_cpu_predictor.cc index 8f3955c05648..669827ee4e92 100644 --- a/tests/cpp/predictor/test_cpu_predictor.cc +++ b/tests/cpp/predictor/test_cpu_predictor.cc @@ -148,7 +148,7 @@ TEST(CPUPredictor, GHistIndexTraining) { auto adapter = data::ArrayAdapter(columnar.c_str()); std::shared_ptr p_full{ DMatrix::Create(&adapter, std::numeric_limits::quiet_NaN(), 1)}; - TestTrainingPrediction(&ctx, kRows, kBins, p_full, p_hist); + TestTrainingPrediction(&ctx, kRows, kBins, p_full, p_hist, true); } TEST(CPUPredictor, CategoricalPrediction) { diff --git a/tests/cpp/predictor/test_predictor.cc b/tests/cpp/predictor/test_predictor.cc index 6ee34ae69a70..0d715760853b 100644 --- a/tests/cpp/predictor/test_predictor.cc +++ b/tests/cpp/predictor/test_predictor.cc @@ -118,7 +118,8 @@ TEST(Predictor, PredictionCache) { } void TestTrainingPrediction(Context const *ctx, size_t rows, size_t bins, - std::shared_ptr p_full, std::shared_ptr p_hist) { + std::shared_ptr p_full, std::shared_ptr p_hist, + bool check_contribs) { size_t constexpr kCols = 16; size_t constexpr kClasses = 3; size_t constexpr kIters = 3; @@ -161,6 +162,28 @@ void TestTrainingPrediction(Context const *ctx, size_t rows, size_t bins, for (size_t i = 0; i < rows; ++i) { EXPECT_NEAR(from_hist.ConstHostVector()[i], from_full.ConstHostVector()[i], kRtEps); } + + if (check_contribs) { + // Contributions + HostDeviceVector from_full_contribs; + learner->Predict(p_full, false, 
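+        // baseline: contributions from the exact DMatrix, compared below against p_hist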
&from_full_contribs, 0, 0, false, false, true); + HostDeviceVector from_hist_contribs; + learner->Predict(p_hist, false, &from_hist_contribs, 0, 0, false, false, true); + for (size_t i = 0; i < from_full_contribs.ConstHostVector().size(); ++i) { + EXPECT_NEAR(from_hist_contribs.ConstHostVector()[i], + from_full_contribs.ConstHostVector()[i], kRtEps); + } + + // Contributions (approximate method) + HostDeviceVector from_full_approx_contribs; + learner->Predict(p_full, false, &from_full_approx_contribs, 0, 0, false, false, false, true); + HostDeviceVector from_hist_approx_contribs; + learner->Predict(p_hist, false, &from_hist_approx_contribs, 0, 0, false, false, false, true); + for (size_t i = 0; i < from_full_approx_contribs.ConstHostVector().size(); ++i) { + EXPECT_NEAR(from_hist_approx_contribs.ConstHostVector()[i], + from_full_approx_contribs.ConstHostVector()[i], kRtEps); + } + } } void TestInplacePrediction(Context const *ctx, std::shared_ptr x, bst_row_t rows, diff --git a/tests/cpp/predictor/test_predictor.h b/tests/cpp/predictor/test_predictor.h index c2b28883a6fc..a65b60579e61 100644 --- a/tests/cpp/predictor/test_predictor.h +++ b/tests/cpp/predictor/test_predictor.h @@ -89,7 +89,8 @@ void TestBasic(DMatrix* dmat, Context const * ctx); // p_full and p_hist should come from the same data set. void TestTrainingPrediction(Context const* ctx, size_t rows, size_t bins, - std::shared_ptr p_full, std::shared_ptr p_hist); + std::shared_ptr p_full, std::shared_ptr p_hist, + bool check_contribs = false); void TestInplacePrediction(Context const* ctx, std::shared_ptr x, bst_row_t rows, bst_feature_t cols); diff --git a/tests/python/test_shap.py b/tests/python/test_shap.py index 88149c05462d..097298f07d68 100644 --- a/tests/python/test_shap.py +++ b/tests/python/test_shap.py @@ -2,7 +2,6 @@ import re import numpy as np -import scipy import scipy.special import xgboost as xgb @@ -256,3 +255,30 @@ def interaction_value(trees, x, i, j): brute_force[-1, -1] += base_score fast_method = bst.predict(xgb.DMatrix(X[0:1, :]), pred_interactions=True) assert np.linalg.norm(brute_force - fast_method[0, :, :]) < 1e-4 + + def test_shap_values(self) -> None: + from sklearn.datasets import make_classification, make_regression + + def assert_same(X: np.ndarray, y: np.ndarray) -> None: + Xy = xgb.DMatrix(X, y) + booster = xgb.train({}, Xy, num_boost_round=4) + shap_dm = booster.predict(Xy, pred_contribs=True) + Xy = xgb.QuantileDMatrix(X, y) + shap_qdm = booster.predict(Xy, pred_contribs=True) + np.testing.assert_allclose(shap_dm, shap_qdm) + + margin = booster.predict(Xy, output_margin=True) + np.testing.assert_allclose( + np.sum(shap_qdm, axis=len(shap_qdm.shape) - 1), margin, 1e-3, 1e-3 + ) + + shap_dm = booster.predict(Xy, pred_interactions=True) + Xy = xgb.QuantileDMatrix(X, y) + shap_qdm = booster.predict(Xy, pred_interactions=True) + np.testing.assert_allclose(shap_dm, shap_qdm) + + X, y = make_regression() + assert_same(X, y) + + X, y = make_classification() + assert_same(X, y) From 6e3c899ba72cfb2e72d50f585f31768f5541339e Mon Sep 17 00:00:00 2001 From: david-cortes Date: Tue, 20 Feb 2024 04:13:00 +0100 Subject: [PATCH 5/8] [R] Don't cap global number of threads for serialization (#10028) --- R-package/DESCRIPTION | 3 ++- R-package/R/xgb.DMatrix.save.R | 1 + R-package/R/xgb.config.R | 7 +++++++ R-package/R/xgb.dump.R | 1 + R-package/R/xgb.load.R | 1 + R-package/R/xgb.save.R | 1 + R-package/R/xgb.save.raw.R | 1 + R-package/demo/basic_walkthrough.R | 2 ++ R-package/man/xgb.DMatrix.save.Rd | 1 + 
R-package/man/xgb.dump.Rd | 1 + R-package/man/xgb.load.Rd | 1 + R-package/man/xgb.save.Rd | 1 + R-package/man/xgb.save.raw.Rd | 1 + R-package/man/xgbConfig.Rd | 9 ++++++++ R-package/tests/helper_scripts/install_deps.R | 1 + R-package/tests/testthat.R | 1 + R-package/vignettes/xgboostPresentation.Rmd | 3 +++ src/gbm/gbtree_model.cc | 21 ++----------------- 18 files changed, 37 insertions(+), 20 deletions(-) diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION index 66e2b5692190..b4072aff0b41 100644 --- a/R-package/DESCRIPTION +++ b/R-package/DESCRIPTION @@ -56,7 +56,8 @@ Suggests: testthat, igraph (>= 1.0.1), float, - titanic + titanic, + RhpcBLASctl Depends: R (>= 4.3.0) Imports: diff --git a/R-package/R/xgb.DMatrix.save.R b/R-package/R/xgb.DMatrix.save.R index ef4599d0ef95..243f43047b07 100644 --- a/R-package/R/xgb.DMatrix.save.R +++ b/R-package/R/xgb.DMatrix.save.R @@ -6,6 +6,7 @@ #' @param fname the name of the file to write. #' #' @examples +#' \dontshow{RhpcBLASctl::omp_set_num_threads(1)} #' data(agaricus.train, package='xgboost') #' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2)) #' fname <- file.path(tempdir(), "xgb.DMatrix.data") diff --git a/R-package/R/xgb.config.R b/R-package/R/xgb.config.R index 3f3a9b1a7b3b..20b8aef90797 100644 --- a/R-package/R/xgb.config.R +++ b/R-package/R/xgb.config.R @@ -4,7 +4,14 @@ #' values of one or more global-scope parameters. Use \code{xgb.get.config} to fetch the current #' values of all global-scope parameters (listed in #' \url{https://xgboost.readthedocs.io/en/stable/parameter.html}). +#' @details +#' Note that serialization-related functions might use a globally-configured number of threads, +#' which is managed by the system's OpenMP (OMP) configuration instead. Typically, XGBoost methods +#' accept an `nthreads` parameter, but some methods like `readRDS` might get executed before such +#' parameter can be supplied. #' +#' The number of OMP threads can in turn be configured for example through an environment variable +#' `OMP_NUM_THREADS` (needs to be set before R is started), or through `RhpcBLASctl::omp_set_num_threads`. #' @rdname xgbConfig #' @title Set and get global configuration #' @name xgb.set.config, xgb.get.config diff --git a/R-package/R/xgb.dump.R b/R-package/R/xgb.dump.R index 3a3d2c7dcbcb..2fa5bcb2f628 100644 --- a/R-package/R/xgb.dump.R +++ b/R-package/R/xgb.dump.R @@ -24,6 +24,7 @@ #' as a \code{character} vector. Otherwise it will return \code{TRUE}. 
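#'
#' (The \code{\dontshow} call below pins OMP to a single thread: CRAN checks
#' the number of threads used by examples, and serialization helpers may take
#' their thread count from the global OMP setting rather than an
#' \code{nthread} argument.)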
#' #' @examples +#' \dontshow{RhpcBLASctl::omp_set_num_threads(1)} #' data(agaricus.train, package='xgboost') #' data(agaricus.test, package='xgboost') #' train <- agaricus.train diff --git a/R-package/R/xgb.load.R b/R-package/R/xgb.load.R index 7d1eab7e9c34..4985f74b56c6 100644 --- a/R-package/R/xgb.load.R +++ b/R-package/R/xgb.load.R @@ -20,6 +20,7 @@ #' \code{\link{xgb.save}} #' #' @examples +#' \dontshow{RhpcBLASctl::omp_set_num_threads(1)} #' data(agaricus.train, package='xgboost') #' data(agaricus.test, package='xgboost') #' diff --git a/R-package/R/xgb.save.R b/R-package/R/xgb.save.R index e1a61d1965b9..91c545ff76fd 100644 --- a/R-package/R/xgb.save.R +++ b/R-package/R/xgb.save.R @@ -35,6 +35,7 @@ #' \code{\link{xgb.load}} #' #' @examples +#' \dontshow{RhpcBLASctl::omp_set_num_threads(1)} #' data(agaricus.train, package='xgboost') #' data(agaricus.test, package='xgboost') #' diff --git a/R-package/R/xgb.save.raw.R b/R-package/R/xgb.save.raw.R index c124a752b02d..c04f06d9c941 100644 --- a/R-package/R/xgb.save.raw.R +++ b/R-package/R/xgb.save.raw.R @@ -12,6 +12,7 @@ #' } #' #' @examples +#' \dontshow{RhpcBLASctl::omp_set_num_threads(1)} #' data(agaricus.train, package='xgboost') #' data(agaricus.test, package='xgboost') #' diff --git a/R-package/demo/basic_walkthrough.R b/R-package/demo/basic_walkthrough.R index 31f79fb57be4..3dbbe0586f44 100644 --- a/R-package/demo/basic_walkthrough.R +++ b/R-package/demo/basic_walkthrough.R @@ -55,6 +55,8 @@ print(paste("test-error=", err)) # save model to binary local file xgb.save(bst, "xgboost.model") # load binary model to R +# Function doesn't take 'nthreads', but can be set like this: +RhpcBLASctl::omp_set_num_threads(1) bst2 <- xgb.load("xgboost.model") pred2 <- predict(bst2, test$data) # pred2 should be identical to pred diff --git a/R-package/man/xgb.DMatrix.save.Rd b/R-package/man/xgb.DMatrix.save.Rd index d5c0563b37db..51643274d857 100644 --- a/R-package/man/xgb.DMatrix.save.Rd +++ b/R-package/man/xgb.DMatrix.save.Rd @@ -15,6 +15,7 @@ xgb.DMatrix.save(dmatrix, fname) Save xgb.DMatrix object to binary file } \examples{ +\dontshow{RhpcBLASctl::omp_set_num_threads(1)} data(agaricus.train, package='xgboost') dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2)) fname <- file.path(tempdir(), "xgb.DMatrix.data") diff --git a/R-package/man/xgb.dump.Rd b/R-package/man/xgb.dump.Rd index 2cdb6b16acd8..6f97f69244b9 100644 --- a/R-package/man/xgb.dump.Rd +++ b/R-package/man/xgb.dump.Rd @@ -44,6 +44,7 @@ as a \code{character} vector. Otherwise it will return \code{TRUE}. Dump an xgboost model in text format. } \examples{ +\dontshow{RhpcBLASctl::omp_set_num_threads(1)} data(agaricus.train, package='xgboost') data(agaricus.test, package='xgboost') train <- agaricus.train diff --git a/R-package/man/xgb.load.Rd b/R-package/man/xgb.load.Rd index 1a687317176f..1fbe0055ed9d 100644 --- a/R-package/man/xgb.load.Rd +++ b/R-package/man/xgb.load.Rd @@ -25,6 +25,7 @@ Note: a model saved as an R-object, has to be loaded using corresponding R-metho not \code{xgb.load}. } \examples{ +\dontshow{RhpcBLASctl::omp_set_num_threads(1)} data(agaricus.train, package='xgboost') data(agaricus.test, package='xgboost') diff --git a/R-package/man/xgb.save.Rd b/R-package/man/xgb.save.Rd index 0db80a120c84..bcfbd0bb4520 100644 --- a/R-package/man/xgb.save.Rd +++ b/R-package/man/xgb.save.Rd @@ -41,6 +41,7 @@ how to persist models in a future-proof way, i.e. to make the model accessible i releases of XGBoost. 
} \examples{ +\dontshow{RhpcBLASctl::omp_set_num_threads(1)} data(agaricus.train, package='xgboost') data(agaricus.test, package='xgboost') diff --git a/R-package/man/xgb.save.raw.Rd b/R-package/man/xgb.save.raw.Rd index 15400bb1450e..6cdafd3d950c 100644 --- a/R-package/man/xgb.save.raw.Rd +++ b/R-package/man/xgb.save.raw.Rd @@ -21,6 +21,7 @@ xgb.save.raw(model, raw_format = "ubj") Save xgboost model from xgboost or xgb.train } \examples{ +\dontshow{RhpcBLASctl::omp_set_num_threads(1)} data(agaricus.train, package='xgboost') data(agaricus.test, package='xgboost') diff --git a/R-package/man/xgbConfig.Rd b/R-package/man/xgbConfig.Rd index 94b220c7785b..164c62ef45d0 100644 --- a/R-package/man/xgbConfig.Rd +++ b/R-package/man/xgbConfig.Rd @@ -25,6 +25,15 @@ values of one or more global-scope parameters. Use \code{xgb.get.config} to fetc values of all global-scope parameters (listed in \url{https://xgboost.readthedocs.io/en/stable/parameter.html}). } +\details{ +Note that serialization-related functions might use a globally-configured number of threads, +which is managed by the system's OpenMP (OMP) configuration instead. Typically, XGBoost methods +accept an \code{nthreads} parameter, but some methods like \code{readRDS} might get executed before such +parameter can be supplied. + +The number of OMP threads can in turn be configured for example through an environment variable +\code{OMP_NUM_THREADS} (needs to be set before R is started), or through \code{RhpcBLASctl::omp_set_num_threads}. +} \examples{ # Set verbosity level to silent (0) xgb.set.config(verbosity = 0) diff --git a/R-package/tests/helper_scripts/install_deps.R b/R-package/tests/helper_scripts/install_deps.R index 3ae44f6b13f4..7a621798ab62 100644 --- a/R-package/tests/helper_scripts/install_deps.R +++ b/R-package/tests/helper_scripts/install_deps.R @@ -20,6 +20,7 @@ pkgs <- c( "igraph", "float", "titanic", + "RhpcBLASctl", ## imports "Matrix", "methods", diff --git a/R-package/tests/testthat.R b/R-package/tests/testthat.R index 3bb229e705c9..7cf711292c48 100644 --- a/R-package/tests/testthat.R +++ b/R-package/tests/testthat.R @@ -2,3 +2,4 @@ library(testthat) library(xgboost) test_check("xgboost", reporter = ProgressReporter) +RhpcBLASctl::omp_set_num_threads(1) diff --git a/R-package/vignettes/xgboostPresentation.Rmd b/R-package/vignettes/xgboostPresentation.Rmd index efafc624d40f..0a6432d5f9cf 100644 --- a/R-package/vignettes/xgboostPresentation.Rmd +++ b/R-package/vignettes/xgboostPresentation.Rmd @@ -496,6 +496,9 @@ An interesting test to see how identical our saved model is to the original one ```{r loadModel, message=F, warning=F} # load binary model to R +# Note that the number of threads for 'xgb.load' is taken from global config, +# can be modified like this: +RhpcBLASctl::omp_set_num_threads(1) bst2 <- xgb.load(fname) xgb.parameters(bst2) <- list(nthread = 2) pred2 <- predict(bst2, test$data) diff --git a/src/gbm/gbtree_model.cc b/src/gbm/gbtree_model.cc index 14131865fe75..2edb456c95de 100644 --- a/src/gbm/gbtree_model.cc +++ b/src/gbm/gbtree_model.cc @@ -106,30 +106,13 @@ void GBTreeModel::Load(dmlc::Stream* fi) { Validate(*this); } -namespace { -std::int32_t IOThreads(Context const* ctx) { - CHECK(ctx); - std::int32_t n_threads = ctx->Threads(); - // CRAN checks for number of threads used by examples, but we might not have the right - // number of threads when serializing/unserializing models as nthread is a booster - // parameter, which is only effective after booster initialization. 
- // - // The threshold ratio of CPU time to user time for R is 2.5, we set the number of - // threads to 2. -#if defined(XGBOOST_STRICT_R_MODE) && XGBOOST_STRICT_R_MODE == 1 - n_threads = std::min(2, n_threads); -#endif - return n_threads; -} -} // namespace - void GBTreeModel::SaveModel(Json* p_out) const { auto& out = *p_out; CHECK_EQ(param.num_trees, static_cast(trees.size())); out["gbtree_model_param"] = ToJson(param); std::vector trees_json(trees.size()); - common::ParallelFor(trees.size(), IOThreads(ctx_), [&](auto t) { + common::ParallelFor(trees.size(), ctx_->Threads(), [&](auto t) { auto const& tree = trees[t]; Json jtree{Object{}}; tree->SaveModel(&jtree); @@ -167,7 +150,7 @@ void GBTreeModel::LoadModel(Json const& in) { CHECK_EQ(tree_info_json.size(), param.num_trees); tree_info.resize(param.num_trees); - common::ParallelFor(param.num_trees, IOThreads(ctx_), [&](auto t) { + common::ParallelFor(param.num_trees, ctx_->Threads(), [&](auto t) { auto tree_id = get(trees_json[t]["id"]); trees.at(tree_id).reset(new RegTree{}); trees[tree_id]->LoadModel(trees_json[t]); From d37b83e8d950edf22313abd16e629fb726e0db7d Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 20 Feb 2024 22:13:51 +0800 Subject: [PATCH 6/8] Fix UBJSON with boolean value. (#10054) --- src/common/json.cc | 2 +- tests/cpp/common/test_json.cc | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/src/common/json.cc b/src/common/json.cc index de9a89f78df8..21be2a5bc674 100644 --- a/src/common/json.cc +++ b/src/common/json.cc @@ -791,7 +791,7 @@ Json UBJReader::Parse() { return Json{JsonBoolean{true}}; } case 'F': { - return Json{JsonBoolean{true}}; + return Json{JsonBoolean{false}}; } case 'd': { auto v = this->ReadPrimitive(); diff --git a/tests/cpp/common/test_json.cc b/tests/cpp/common/test_json.cc index d361552ce1c1..155cf04adf9a 100644 --- a/tests/cpp/common/test_json.cc +++ b/tests/cpp/common/test_json.cc @@ -677,6 +677,21 @@ TEST(UBJson, Basic) { ASSERT_FLOAT_EQ(3.14, get(get(ret["test"])[1])); ASSERT_FLOAT_EQ(2.71, get(get(ret["test"])[0])); } + { + // boolean + Json boolean{Object{}}; + boolean["foo"] = Boolean{false}; + std::vector out; + Json::Dump(boolean, &out, std::ios::binary); + auto loaded = Json::Load(StringView{out.data(), out.size()}, std::ios::binary); + + ASSERT_EQ(boolean, loaded); + + boolean["foo"] = Boolean{true}; + Json::Dump(boolean, &out, std::ios::binary); + loaded = Json::Load(StringView{out.data(), out.size()}, std::ios::binary); + ASSERT_EQ(boolean, loaded); + } } TEST(Json, TypeCheck) { From 69a17d51141bd2457142c8309f57494ab2d97562 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 20 Feb 2024 22:34:22 +0800 Subject: [PATCH 7/8] Fix with None input. (#10052) --- python-package/xgboost/core.py | 19 ++++++++++--------- python-package/xgboost/data.py | 4 ++-- tests/python-gpu/test_from_cupy.py | 5 ++++- tests/python/test_with_sklearn.py | 13 +++++++++++++ 4 files changed, 29 insertions(+), 12 deletions(-) diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 27331d3de5ca..f1907822470c 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -861,9 +861,9 @@ def __init__( self.nthread = nthread if nthread is not None else -1 self.silent = silent - # force into void_p, mac need to pass things in as void_p - if data is None: - self.handle: Optional[ctypes.c_void_p] = None + if isinstance(data, ctypes.c_void_p): + # Used for constructing DMatrix slice. 
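+            # The new DMatrix adopts the handle as-is; __del__ later releases
+            # it through XGDMatrixFree.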
+ self.handle = data return from .data import _is_iter, dispatch_data_backend @@ -925,9 +925,10 @@ def _init_from_iter(self, iterator: DataIter, enable_categorical: bool) -> None: self.handle = handle def __del__(self) -> None: - if hasattr(self, "handle") and self.handle: + if hasattr(self, "handle"): + assert self.handle is not None _check_call(_LIB.XGDMatrixFree(self.handle)) - self.handle = None + del self.handle @_deprecate_positional_args def set_info( @@ -1281,19 +1282,19 @@ def slice( """ from .data import _maybe_np_slice - res = DMatrix(None) - res.handle = ctypes.c_void_p() + handle = ctypes.c_void_p() + rindex = _maybe_np_slice(rindex, dtype=np.int32) _check_call( _LIB.XGDMatrixSliceDMatrixEx( self.handle, c_array(ctypes.c_int, rindex), c_bst_ulong(len(rindex)), - ctypes.byref(res.handle), + ctypes.byref(handle), ctypes.c_int(1 if allow_groups else 0), ) ) - return res + return DMatrix(handle) @property def feature_names(self) -> Optional[FeatureNames]: diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py index 49a0f43b7c83..07a08dc5f0b2 100644 --- a/python-package/xgboost/data.py +++ b/python-package/xgboost/data.py @@ -1053,10 +1053,10 @@ def _is_dlpack(data: DataType) -> bool: def _transform_dlpack(data: DataType) -> bool: - from cupy import fromDlpack # pylint: disable=E0401 + from cupy import from_dlpack # pylint: disable=E0401 assert "used_dltensor" not in str(data) - data = fromDlpack(data) + data = from_dlpack(data) return data diff --git a/tests/python-gpu/test_from_cupy.py b/tests/python-gpu/test_from_cupy.py index 79814a1bb430..85d54c78dbff 100644 --- a/tests/python-gpu/test_from_cupy.py +++ b/tests/python-gpu/test_from_cupy.py @@ -202,7 +202,10 @@ def test_dlpack_device_dmat(self): n = 100 X = cp.random.random((n, 2)) m = xgb.QuantileDMatrix(X.toDlpack()) - with pytest.raises(xgb.core.XGBoostError): + + with pytest.raises( + xgb.core.XGBoostError, match="Slicing DMatrix is not supported" + ): m.slice(rindex=[0, 1, 2]) @pytest.mark.skipif(**tm.no_cupy()) diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py index 344628e4f605..ede70bb8bb90 100644 --- a/tests/python/test_with_sklearn.py +++ b/tests/python/test_with_sklearn.py @@ -1456,3 +1456,16 @@ def test_intercept() -> None: result = reg.intercept_ assert result.dtype == np.float32 assert result[0] < 0.5 + + +def test_fit_none() -> None: + with pytest.raises(TypeError, match="NoneType"): + xgb.XGBClassifier().fit(None, [0, 1]) + + X = rng.normal(size=4).reshape(2, 2) + + with pytest.raises(ValueError, match="Invalid classes"): + xgb.XGBClassifier().fit(X, None) + + with pytest.raises(ValueError, match="labels"): + xgb.XGBRegressor().fit(X, None) From 8ea705e4d55bb08d3f4d9dcfdab39167169447e8 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Wed, 21 Feb 2024 00:43:14 +0800 Subject: [PATCH 8/8] Support sample weight in sklearn custom objective. 
(#10050) --- python-package/xgboost/core.py | 9 +- python-package/xgboost/sklearn.py | 210 ++++++++++++------ python-package/xgboost/testing/__init__.py | 7 +- tests/ci_build/lint_python.py | 1 + tests/python/test_with_sklearn.py | 6 + .../test_with_dask/test_with_dask.py | 15 +- 6 files changed, 179 insertions(+), 69 deletions(-) diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index f1907822470c..36e4bdcf0d2d 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -804,10 +804,11 @@ def __init__( Otherwise, one can pass a list-like input with the same length as number of columns in `data`, with the following possible values: - - "c", which represents categorical columns. - - "q", which represents numeric columns. - - "int", which represents integer columns. - - "i", which represents boolean columns. + + - "c", which represents categorical columns. + - "q", which represents numeric columns. + - "int", which represents integer columns. + - "i", which represents boolean columns. Note that, while categorical types are treated differently from the rest for model fitting purposes, the other types do not influence diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py index 5d651948ce4f..c4713a9e49c7 100644 --- a/python-package/xgboost/sklearn.py +++ b/python-package/xgboost/sklearn.py @@ -5,12 +5,14 @@ import os import warnings from concurrent.futures import ThreadPoolExecutor +from inspect import signature from typing import ( Any, Callable, Dict, List, Optional, + Protocol, Sequence, Tuple, Type, @@ -67,14 +69,20 @@ def _can_use_qdm(tree_method: Optional[str]) -> bool: return tree_method in ("hist", "gpu_hist", None, "auto") -SklObjective = Optional[ - Union[str, Callable[[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray]]] -] +class _SklObjWProto(Protocol): # pylint: disable=too-few-public-methods + def __call__( + self, + y_true: ArrayLike, + y_pred: ArrayLike, + sample_weight: Optional[ArrayLike], + ) -> Tuple[ArrayLike, ArrayLike]: ... + + +_SklObjProto = Callable[[ArrayLike, ArrayLike], Tuple[np.ndarray, np.ndarray]] +SklObjective = Optional[Union[str, _SklObjWProto, _SklObjProto]] -def _objective_decorator( - func: Callable[[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray]] -) -> Objective: +def _objective_decorator(func: Union[_SklObjWProto, _SklObjProto]) -> Objective: """Decorate an objective function Converts an objective function using the typical sklearn metrics @@ -89,6 +97,8 @@ def _objective_decorator( The target values y_pred: array_like of shape [n_samples] The predicted values + sample_weight : + Optional sample weight, None or a ndarray. Returns ------- @@ -103,10 +113,25 @@ def _objective_decorator( ``dmatrix.get_label()`` """ + parameters = signature(func).parameters + supports_sw = "sample_weight" in parameters + def inner(preds: np.ndarray, dmatrix: DMatrix) -> Tuple[np.ndarray, np.ndarray]: - """internal function""" + """Internal function.""" + sample_weight = dmatrix.get_weight() labels = dmatrix.get_label() - return func(labels, preds) + + if sample_weight.size > 0 and not supports_sw: + raise ValueError( + "Custom objective doesn't have the `sample_weight` parameter while" + " sample_weight is used." 
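+                # (failing loudly is deliberate: silently dropping the weights
+                # would train an unweighted model with no warning)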
+ ) + if sample_weight.size > 0: + fnw = cast(_SklObjWProto, func) + return fnw(labels, preds, sample_weight=sample_weight) + + fn = cast(_SklObjProto, func) + return fn(labels, preds) return inner @@ -172,75 +197,121 @@ def task(i: int) -> float: return inner -__estimator_doc = """ - n_estimators : Optional[int] +__estimator_doc = f""" + n_estimators : {Optional[int]} Number of gradient boosted trees. Equivalent to number of boosting rounds. """ __model_doc = f""" - max_depth : Optional[int] + max_depth : {Optional[int]} + Maximum tree depth for base learners. - max_leaves : + + max_leaves : {Optional[int]} + Maximum number of leaves; 0 indicates no limit. - max_bin : + + max_bin : {Optional[int]} + If using histogram-based algorithm, maximum number of bins per feature - grow_policy : - Tree growing policy. 0: favor splitting at nodes closest to the node, i.e. grow - depth-wise. 1: favor splitting at nodes with highest loss change. - learning_rate : Optional[float] + + grow_policy : {Optional[str]} + + Tree growing policy. + + - depthwise: Favors splitting at nodes closest to the node, + - lossguide: Favors splitting at nodes with highest loss change. + + learning_rate : {Optional[float]} + Boosting learning rate (xgb's "eta") - verbosity : Optional[int] + + verbosity : {Optional[int]} + The degree of verbosity. Valid values are 0 (silent) - 3 (debug). objective : {SklObjective} Specify the learning task and the corresponding learning objective or a custom - objective function to be used. For custom objective, see - :doc:`/tutorials/custom_metric_obj` and :ref:`custom-obj-metric` for more - information. + objective function to be used. + + For custom objective, see :doc:`/tutorials/custom_metric_obj` and + :ref:`custom-obj-metric` for more information, along with the end note for + function signatures. + + booster: {Optional[str]} + + Specify which booster to use: ``gbtree``, ``gblinear`` or ``dart``. + + tree_method : {Optional[str]} - booster: Optional[str] - Specify which booster to use: `gbtree`, `gblinear` or `dart`. - tree_method: Optional[str] Specify which tree method to use. Default to auto. If this parameter is set to default, XGBoost will choose the most conservative option available. It's recommended to study this option from the parameters document :doc:`tree method ` - n_jobs : Optional[int] + + n_jobs : {Optional[int]} + Number of parallel threads used to run xgboost. When used with other Scikit-Learn algorithms like grid search, you may choose which algorithm to parallelize and balance the threads. Creating thread contention will significantly slow down both algorithms. - gamma : Optional[float] - (min_split_loss) Minimum loss reduction required to make a further partition on a - leaf node of the tree. - min_child_weight : Optional[float] + + gamma : {Optional[float]} + + (min_split_loss) Minimum loss reduction required to make a further partition on + a leaf node of the tree. + + min_child_weight : {Optional[float]} + Minimum sum of instance weight(hessian) needed in a child. - max_delta_step : Optional[float] + + max_delta_step : {Optional[float]} + Maximum delta step we allow each tree's weight estimation to be. - subsample : Optional[float] + + subsample : {Optional[float]} + Subsample ratio of the training instance. - sampling_method : + + sampling_method : {Optional[str]} + Sampling method. Used only by the GPU version of ``hist`` tree method. - - ``uniform``: select random training instances uniformly. 
- - ``gradient_based`` select random training instances with higher probability + + - ``uniform``: Select random training instances uniformly. + - ``gradient_based``: Select random training instances with higher probability when the gradient and hessian are larger. (cf. CatBoost) - colsample_bytree : Optional[float] + + colsample_bytree : {Optional[float]} + Subsample ratio of columns when constructing each tree. - colsample_bylevel : Optional[float] + + colsample_bylevel : {Optional[float]} + Subsample ratio of columns for each level. - colsample_bynode : Optional[float] + + colsample_bynode : {Optional[float]} + Subsample ratio of columns for each split. - reg_alpha : Optional[float] + + reg_alpha : {Optional[float]} + L1 regularization term on weights (xgb's alpha). - reg_lambda : Optional[float] + + reg_lambda : {Optional[float]} + L2 regularization term on weights (xgb's lambda). - scale_pos_weight : Optional[float] + + scale_pos_weight : {Optional[float]} Balancing of positive and negative weights. - base_score : Optional[float] + + base_score : {Optional[float]} + The initial prediction score of all instances, global bias. - random_state : Optional[Union[numpy.random.RandomState, numpy.random.Generator, int]] + + random_state : {Optional[Union[np.random.RandomState, np.random.Generator, int]]} + Random number seed. .. note:: @@ -248,34 +319,44 @@ def task(i: int) -> float: Using gblinear booster with shotgun updater is nondeterministic as it uses Hogwild algorithm. - missing : float, default np.nan - Value in the data which needs to be present as a missing value. - num_parallel_tree: Optional[int] + missing : float + + Value in the data which needs to be present as a missing value. Default to + :py:data:`numpy.nan`. + + num_parallel_tree: {Optional[int]} + Used for boosting random forest. - monotone_constraints : Optional[Union[Dict[str, int], str]] + + monotone_constraints : {Optional[Union[Dict[str, int], str]]} + Constraint of variable monotonicity. See :doc:`tutorial ` for more information. - interaction_constraints : Optional[Union[str, List[Tuple[str]]]] + + interaction_constraints : {Optional[Union[str, List[Tuple[str]]]]} + Constraints for interaction representing permitted interactions. The constraints must be specified in the form of a nested list, e.g. ``[[0, 1], [2, 3, 4]]``, where each inner list is a group of indices of features that are allowed to interact with each other. See :doc:`tutorial ` for more information - importance_type: Optional[str] + + importance_type: {Optional[str]} + The feature importance type for the feature_importances\\_ property: * For tree model, it's either "gain", "weight", "cover", "total_gain" or "total_cover". - * For linear model, only "weight" is defined and it's the normalized coefficients - without bias. + * For linear model, only "weight" is defined and it's the normalized + coefficients without bias. - device : Optional[str] + device : {Optional[str]} .. versionadded:: 2.0.0 Device ordinal, available options are `cpu`, `cuda`, and `gpu`. - validate_parameters : Optional[bool] + validate_parameters : {Optional[bool]} Give warnings for unknown parameter. @@ -283,14 +364,14 @@ def task(i: int) -> float: See the same parameter of :py:class:`DMatrix` for details. - feature_types : Optional[FeatureTypes] + feature_types : {Optional[FeatureTypes]} .. versionadded:: 1.7.0 Used for specifying feature types without constructing a dataframe. See :py:class:`DMatrix` for details. 
- max_cat_to_onehot : Optional[int] + max_cat_to_onehot : {Optional[int]} .. versionadded:: 1.6.0 @@ -303,7 +384,7 @@ def task(i: int) -> float: categorical feature support. See :doc:`Categorical Data ` and :ref:`cat-param` for details. - max_cat_threshold : Optional[int] + max_cat_threshold : {Optional[int]} .. versionadded:: 1.7.0 @@ -314,7 +395,7 @@ def task(i: int) -> float: needs to be set to have categorical feature support. See :doc:`Categorical Data ` and :ref:`cat-param` for details. - multi_strategy : Optional[str] + multi_strategy : {Optional[str]} .. versionadded:: 2.0.0 @@ -327,7 +408,7 @@ def task(i: int) -> float: - ``one_output_per_tree``: One model for each target. - ``multi_output_tree``: Use multi-target trees. - eval_metric : Optional[Union[str, List[str], Callable]] + eval_metric : {Optional[Union[str, List[str], Callable]]} .. versionadded:: 1.6.0 @@ -360,7 +441,7 @@ def task(i: int) -> float: ) reg.fit(X, y, eval_set=[(X, y)]) - early_stopping_rounds : Optional[int] + early_stopping_rounds : {Optional[int]} .. versionadded:: 1.6.0 @@ -383,7 +464,8 @@ def task(i: int) -> float: early stopping. If there's more than one metric in **eval_metric**, the last metric will be used for early stopping. - callbacks : Optional[List[TrainingCallback]] + callbacks : {Optional[List[TrainingCallback]]} + List of callback functions that are applied at end of each iteration. It is possible to use predefined callbacks by using :ref:`Callback API `. @@ -402,7 +484,8 @@ def task(i: int) -> float: reg = xgboost.XGBRegressor(**params, callbacks=callbacks) reg.fit(X, y) - kwargs : dict, optional + kwargs : {Optional[Any]} + Keyword arguments for XGBoost Booster object. Full documentation of parameters can be found :doc:`here `. Attempting to set a parameter via the constructor args and \\*\\*kwargs @@ -419,13 +502,16 @@ def task(i: int) -> float: .. note:: Custom objective function A custom objective function can be provided for the ``objective`` - parameter. In this case, it should have the signature - ``objective(y_true, y_pred) -> grad, hess``: + parameter. In this case, it should have the signature ``objective(y_true, + y_pred) -> [grad, hess]`` or ``objective(y_true, y_pred, *, sample_weight) + -> [grad, hess]``: y_true: array_like of shape [n_samples] The target values y_pred: array_like of shape [n_samples] The predicted values + sample_weight : + Optional sample weights. grad: array_like of shape [n_samples] The value of the gradient for each sample point. 
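The end-to-end behaviour of the new signature can be sketched as follows. This is a minimal sketch on synthetic data; the weighted squared-error objective mirrors ``tm.ls_obj`` from the testing module changed below:

.. code-block:: python

    import numpy as np
    import xgboost as xgb

    def ls_obj(y_true, y_pred, sample_weight=None):
        """Weighted squared error: grad = w * (pred - y), hess = w."""
        grad = y_pred - y_true
        hess = np.ones(y_true.shape[0])
        if sample_weight is not None:
            grad *= sample_weight
            hess *= sample_weight
        return grad, hess

    rng = np.random.default_rng(1994)
    X = rng.normal(size=(128, 4))
    y = X @ rng.normal(size=4)
    w = rng.uniform(0.1, 1.0, size=128)

    reg = xgb.XGBRegressor(objective=ls_obj, n_estimators=8)
    # the wrapper inspects ls_obj's signature and, because it accepts
    # sample_weight, forwards the per-sample weights on every boosting round
    reg.fit(X, y, sample_weight=w)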
diff --git a/python-package/xgboost/testing/__init__.py b/python-package/xgboost/testing/__init__.py index 389066f0e7d1..f7d9510faea6 100644 --- a/python-package/xgboost/testing/__init__.py +++ b/python-package/xgboost/testing/__init__.py @@ -815,10 +815,15 @@ def objective( return objective -def ls_obj(y_true: np.ndarray, y_pred: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: +def ls_obj( + y_true: np.ndarray, y_pred: np.ndarray, sample_weight: Optional[np.ndarray] = None +) -> Tuple[np.ndarray, np.ndarray]: """Least squared error.""" grad = y_pred - y_true hess = np.ones(len(y_true)) + if sample_weight is not None: + grad *= sample_weight + hess *= sample_weight return grad, hess diff --git a/tests/ci_build/lint_python.py b/tests/ci_build/lint_python.py index 91b748b4c44c..741ef7558f13 100644 --- a/tests/ci_build/lint_python.py +++ b/tests/ci_build/lint_python.py @@ -100,6 +100,7 @@ class LintersPaths: # demo "demo/json-model/json_parser.py", "demo/guide-python/external_memory.py", + "demo/guide-python/sklearn_examples.py", "demo/guide-python/continuation.py", "demo/guide-python/callbacks.py", "demo/guide-python/cat_in_the_dat.py", diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py index ede70bb8bb90..5074707241ba 100644 --- a/tests/python/test_with_sklearn.py +++ b/tests/python/test_with_sklearn.py @@ -517,6 +517,12 @@ def test_regression_with_custom_objective(): labels = y[test_index] assert mean_squared_error(preds, labels) < 25 + w = rng.uniform(low=0.0, high=1.0, size=X.shape[0]) + reg = xgb.XGBRegressor(objective=tm.ls_obj, n_estimators=25) + reg.fit(X, y, sample_weight=w) + y_pred = reg.predict(X) + assert mean_squared_error(y_true=y, y_pred=y_pred, sample_weight=w) < 25 + # Test that the custom objective function is actually used class XGBCustomObjectiveException(Exception): pass diff --git a/tests/test_distributed/test_with_dask/test_with_dask.py b/tests/test_distributed/test_with_dask/test_with_dask.py index fdf0d64c4129..ffea1d058bf9 100644 --- a/tests/test_distributed/test_with_dask/test_with_dask.py +++ b/tests/test_distributed/test_with_dask/test_with_dask.py @@ -1750,9 +1750,20 @@ def sqr( ) tm.non_increasing(results_native["validation_0"]["rmse"]) + reg = xgb.dask.DaskXGBRegressor( + n_estimators=rounds, objective=tm.ls_obj, tree_method="hist" + ) + rng = da.random.RandomState(1994) + w = rng.uniform(low=0.0, high=1.0, size=y.shape[0]) + reg.fit( + X, y, sample_weight=w, eval_set=[(X, y)], sample_weight_eval_set=[w] + ) + results_custom = reg.evals_result() + tm.non_increasing(results_custom["validation_0"]["rmse"]) + def test_no_duplicated_partition(self) -> None: - """Assert each worker has the correct amount of data, and DMatrix initialization doesn't - generate unnecessary copies of data. + """Assert each worker has the correct amount of data, and DMatrix initialization + doesn't generate unnecessary copies of data. """ with LocalCluster(n_workers=2, dashboard_address=":0") as cluster: