Merge branch 'dmlc:master' into SecureBoost

ZiyueXu77 · Feb 20, 2024 · 04cd1cb · 04cd1cb
2 parents 967e307 + 8ea705e
commit 04cd1cb
Show file tree

Hide file tree

Showing 42 changed files with 912 additions and 168 deletions.
diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION
@@ -56,7 +56,8 @@ Suggests:
     testthat,
     igraph (>= 1.0.1),
     float,
-    titanic
+    titanic,
+    RhpcBLASctl
 Depends:
     R (>= 4.3.0)
 Imports:

diff --git a/R-package/R/xgb.DMatrix.save.R b/R-package/R/xgb.DMatrix.save.R
@@ -6,6 +6,7 @@
 #' @param fname the name of the file to write.
 #'
 #' @examples
+#' \dontshow{RhpcBLASctl::omp_set_num_threads(1)}
 #' data(agaricus.train, package='xgboost')
 #' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
 #' fname <- file.path(tempdir(), "xgb.DMatrix.data")

diff --git a/R-package/R/xgb.config.R b/R-package/R/xgb.config.R
@@ -4,7 +4,14 @@
 #' values of one or more global-scope parameters. Use \code{xgb.get.config} to fetch the current
 #' values of all global-scope parameters (listed in
 #' \url{https://xgboost.readthedocs.io/en/stable/parameter.html}).
+#' @details
+#' Note that serialization-related functions might use a globally-configured number of threads,
+#' which is managed by the system's OpenMP (OMP) configuration instead. Typically, XGBoost methods
+#' accept an `nthread` parameter, but some methods like `readRDS` might get executed before such a
+#' parameter can be supplied.
 #'
+#' The number of OMP threads can in turn be configured for example through an environment variable
+#' `OMP_NUM_THREADS` (needs to be set before R is started), or through `RhpcBLASctl::omp_set_num_threads`.
 #' @rdname xgbConfig
 #' @title Set and get global configuration
 #' @name xgb.set.config, xgb.get.config

diff --git a/R-package/R/xgb.dump.R b/R-package/R/xgb.dump.R
@@ -24,6 +24,7 @@
 #' as a \code{character} vector. Otherwise it will return \code{TRUE}.
 #'
 #' @examples
+#' \dontshow{RhpcBLASctl::omp_set_num_threads(1)}
 #' data(agaricus.train, package='xgboost')
 #' data(agaricus.test, package='xgboost')
 #' train <- agaricus.train

diff --git a/R-package/R/xgb.load.R b/R-package/R/xgb.load.R
@@ -20,6 +20,7 @@
 #' \code{\link{xgb.save}}
 #'
 #' @examples
+#' \dontshow{RhpcBLASctl::omp_set_num_threads(1)}
 #' data(agaricus.train, package='xgboost')
 #' data(agaricus.test, package='xgboost')
 #'

diff --git a/R-package/R/xgb.save.R b/R-package/R/xgb.save.R
@@ -35,6 +35,7 @@
 #' \code{\link{xgb.load}}
 #'
 #' @examples
+#' \dontshow{RhpcBLASctl::omp_set_num_threads(1)}
 #' data(agaricus.train, package='xgboost')
 #' data(agaricus.test, package='xgboost')
 #'

diff --git a/R-package/R/xgb.save.raw.R b/R-package/R/xgb.save.raw.R
@@ -12,6 +12,7 @@
 #' }
 #'
 #' @examples
+#' \dontshow{RhpcBLASctl::omp_set_num_threads(1)}
 #' data(agaricus.train, package='xgboost')
 #' data(agaricus.test, package='xgboost')
 #'

diff --git a/R-package/demo/basic_walkthrough.R b/R-package/demo/basic_walkthrough.R
@@ -55,6 +55,8 @@ print(paste("test-error=", err))
 # save model to binary local file
 xgb.save(bst, "xgboost.model")
 # load binary model to R
+# 'xgb.load' doesn't take an 'nthread' argument, but the number of OpenMP threads can be set like this:
+RhpcBLASctl::omp_set_num_threads(1)
 bst2 <- xgb.load("xgboost.model")
 pred2 <- predict(bst2, test$data)
 # pred2 should be identical to pred

diff --git a/R-package/man/xgb.DMatrix.save.Rd b/R-package/man/xgb.DMatrix.save.Rd
diff --git a/R-package/man/xgb.dump.Rd b/R-package/man/xgb.dump.Rd
diff --git a/R-package/man/xgb.load.Rd b/R-package/man/xgb.load.Rd
diff --git a/R-package/man/xgb.save.Rd b/R-package/man/xgb.save.Rd
diff --git a/R-package/man/xgb.save.raw.Rd b/R-package/man/xgb.save.raw.Rd
diff --git a/R-package/man/xgbConfig.Rd b/R-package/man/xgbConfig.Rd
diff --git a/R-package/tests/helper_scripts/install_deps.R b/R-package/tests/helper_scripts/install_deps.R
@@ -20,6 +20,7 @@ pkgs <- c(
   "igraph",
   "float",
   "titanic",
+  "RhpcBLASctl",
   ## imports
   "Matrix",
   "methods",

diff --git a/R-package/tests/testthat.R b/R-package/tests/testthat.R
@@ -2,3 +2,4 @@ library(testthat)
 library(xgboost)
 
 test_check("xgboost", reporter = ProgressReporter)
+RhpcBLASctl::omp_set_num_threads(1)
diff --git a/R-package/vignettes/xgboostPresentation.Rmd b/R-package/vignettes/xgboostPresentation.Rmd
@@ -496,6 +496,9 @@ An interesting test to see how identical our saved model is to the original one
 
 ```{r loadModel, message=F, warning=F}
 # load binary model to R
+# Note that the number of threads for 'xgb.load' is taken from the global config,
+# and it can be modified like this:
+RhpcBLASctl::omp_set_num_threads(1)
 bst2 <- xgb.load(fname)
 xgb.parameters(bst2) <- list(nthread = 2)
 pred2 <- predict(bst2, test$data)

diff --git a/doc/python/python_intro.rst b/doc/python/python_intro.rst
@@ -63,7 +63,7 @@ The input data is stored in a :py:class:`DMatrix <xgboost.DMatrix>` object. For
 
   .. code-block:: python
 
-    dtrain = xgb.DMatrix('train.svm.txt')
+    dtrain = xgb.DMatrix('train.svm.txt?format=libsvm')
     dtrain.save_binary('train.buffer')
 
 * Missing values can be replaced by a default value in the :py:class:`DMatrix <xgboost.DMatrix>` constructor:
@@ -86,7 +86,7 @@ to number of groups.
 
   .. code-block:: python
 
-    dtrain = xgb.DMatrix('train.svm.txt')
+    dtrain = xgb.DMatrix('train.svm.txt?format=libsvm')
     dtest = xgb.DMatrix('test.svm.buffer')
 
   The parser in XGBoost has limited functionality. When using Python interface, it's
@@ -176,7 +176,6 @@ Support Matrix
 +-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+
 | pyarrow.Table           | NPA       | NPA               | NPA       | NPA       | NPA                | NPA         |
 +-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+
-+-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+
 | _\_array\_\_            | NPA       | F                 | NPA       | NPA       | H                  |             |
 +-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+
 | Others                  | SciCSR    | F                 |           | F         | F                  |             |
@@ -240,7 +239,7 @@ A saved model can be loaded as follows:
 .. code-block:: python
 
   bst = xgb.Booster({'nthread': 4})  # init model
-  bst.load_model('model.bin')  # load data
+  bst.load_model('model.bin')  # load model data
 
 Methods including `update` and `boost` from `xgboost.Booster` are designed for
 internal usage only.  The wrapper function `xgboost.train` does some

diff --git a/plugin/CMakeLists.txt b/plugin/CMakeLists.txt
@@ -1,10 +1,7 @@
 if(PLUGIN_SYCL)
   set(CMAKE_CXX_COMPILER "icpx")
-  add_library(plugin_sycl OBJECT
-    ${xgboost_SOURCE_DIR}/plugin/sycl/objective/regression_obj.cc
-    ${xgboost_SOURCE_DIR}/plugin/sycl/objective/multiclass_obj.cc
-    ${xgboost_SOURCE_DIR}/plugin/sycl/device_manager.cc
-    ${xgboost_SOURCE_DIR}/plugin/sycl/predictor/predictor.cc)
+  file(GLOB_RECURSE SYCL_SOURCES "sycl/*.cc")
+  add_library(plugin_sycl OBJECT ${SYCL_SOURCES})
   target_include_directories(plugin_sycl
     PRIVATE
     ${xgboost_SOURCE_DIR}/include

diff --git a/plugin/sycl/data.h b/plugin/sycl/data.h
@@ -26,8 +26,13 @@
 
 namespace xgboost {
 namespace sycl {
-enum class MemoryType { shared, on_device};
+template <typename T>  // Shorthand for a ::sycl::atomic_ref over T with the fixed settings listed below.
+using AtomicRef = ::sycl::atomic_ref<T,
+                                    ::sycl::memory_order::relaxed,  // default memory order: relaxed
+                                    ::sycl::memory_scope::device,  // default memory scope: device
+                                    ::sycl::access::address_space::ext_intel_global_device_space>;  // Intel extension address space
 
+enum class MemoryType { shared, on_device};
 
 template <typename T>
 class USMDeleter {

diff --git a/plugin/sycl/data/gradient_index.cc b/plugin/sycl/data/gradient_index.cc
@@ -0,0 +1,177 @@
+/*!
+ * Copyright 2017-2024 by Contributors
+ * \file gradient_index.cc
+ */
+#include <vector>
+#include <limits>
+#include <algorithm>
+
+#include "gradient_index.h"
+
+#include <CL/sycl.hpp>
+
+namespace xgboost {
+namespace sycl {
+namespace common {
+
+uint32_t SearchBin(const bst_float* cut_values, const uint32_t* cut_ptrs, Entry const& e) {  // Map entry e to an index into cut_values (the global bin index).
+  auto beg = cut_ptrs[e.index];  // start of the cut range belonging to feature e.index
+  auto end = cut_ptrs[e.index + 1];  // one past the end of that feature's cut range
+  auto it = std::upper_bound(cut_values + beg, cut_values + end, e.fvalue);  // first cut strictly greater than e.fvalue
+  uint32_t idx = it - cut_values;  // position relative to the start of cut_values, not to beg
+  if (idx == end) {  // no cut in the range is greater than e.fvalue
+    idx -= 1;  // step back to the last position of the range
+  }
+  return idx;
+}
+
+template <typename BinIdxType>  // Bottom-up (iterative) merge sort of [begin, end) in ascending order.
+void mergeSort(BinIdxType* begin, BinIdxType* end, BinIdxType* buf) {  // buf is scratch space indexed like begin, so it needs end - begin elements
+  const size_t total_len = end - begin;
+  for (size_t block_len = 1; block_len < total_len; block_len <<= 1) {  // width of the already sorted runs, doubled every pass
+    for (size_t cur_block = 0; cur_block + block_len < total_len; cur_block += 2 * block_len) {  // pairs of adjacent runs; a trailing run without a partner stays in place
+      size_t start = cur_block;
+      size_t mid = start + block_len;  // first element of the right run
+      size_t finish = mid + block_len < total_len ? mid + block_len : total_len;  // the right run may be shorter at the tail
+      size_t left_pos = start;
+      size_t right_pos = mid;
+      size_t pos = start;
+      while (left_pos < mid || right_pos < finish) {  // merge both runs into buf[start, finish)
+        if (left_pos < mid && (right_pos == finish || begin[left_pos] < begin[right_pos])) {  // left is taken when right is exhausted or left is strictly smaller
+          buf[pos++] = begin[left_pos++];
+        } else {
+          buf[pos++] = begin[right_pos++];
+        }
+      }
+      for (size_t i = start; i < finish; i++) begin[i] = buf[i];  // copy the merged span back into the input
+    }
+  }
+}
+
+template <typename BinIdxType>  // Fill index_data with one bin index per entry of dmat and accumulate per-bin hit counts.
+void GHistIndexMatrix::SetIndexData(::sycl::queue qu,
+                                    BinIdxType* index_data,
+                                    const DeviceMatrix &dmat,
+                                    size_t nbins,
+                                    size_t row_stride,
+                                    uint32_t* offsets) {  // offsets: per-position values subtracted from the bin index; null in the sparse case
+  if (nbins == 0) return;  // no bins, nothing to index
+  const xgboost::Entry *data_ptr = dmat.data.DataConst();
+  const bst_row_t *offset_vec = dmat.row_ptr.DataConst();  // row i spans entries [offset_vec[i], offset_vec[i + 1])
+  const size_t num_rows = dmat.row_ptr.Size() - 1;  // row_ptr holds one more element than there are rows
+  const bst_float* cut_values = cut_device.Values().DataConst();
+  const uint32_t* cut_ptrs = cut_device.Ptrs().DataConst();
+  size_t* hit_count_ptr = hit_count_buff.Data();
+
+  // Sparse case only
+  if (!offsets) {
+    // sort_buff has type uint8_t
+    sort_buff.Resize(&qu, num_rows * row_stride * sizeof(BinIdxType));  // size is given in bytes
+  }
+  BinIdxType* sort_data = reinterpret_cast<BinIdxType*>(sort_buff.Data());  // scratch space handed to mergeSort below
+
+  auto event = qu.submit([&](::sycl::handler& cgh) {
+    cgh.parallel_for<>(::sycl::range<1>(num_rows), [=](::sycl::item<1> pid) {  // one work item per row
+      const size_t i = pid.get_id(0);
+      const size_t ibegin = offset_vec[i];
+      const size_t iend = offset_vec[i + 1];
+      const size_t size = iend - ibegin;  // number of entries in row i
+      const size_t start = i * row_stride;  // each row owns row_stride consecutive slots in index_data
+      for (bst_uint j = 0; j < size; ++j) {
+        uint32_t idx = SearchBin(cut_values, cut_ptrs, data_ptr[ibegin + j]);
+        index_data[start + j] = offsets ? idx - offsets[j] : idx;  // with offsets: store idx relative to offsets[j]; otherwise store idx unchanged
+        AtomicRef<size_t> hit_count_ref(hit_count_ptr[idx]);
+        hit_count_ref.fetch_add(1);  // atomic increment: several rows may hit the same bin
+      }
+      if (!offsets) {
+        // Sparse case only
+        mergeSort<BinIdxType>(index_data + start, index_data + start + size, sort_data + start);  // sort this row's bin indices in ascending order
+        for (bst_uint j = size; j < row_stride; ++j) {
+          index_data[start + j] = nbins;  // pad the unused tail of the row with nbins
+        }
+      }
+    });
+  });
+  qu.memcpy(hit_count.data(), hit_count_ptr, nbins * sizeof(size_t), event);  // copy the counters into hit_count; depends on the kernel's event
+  qu.wait();  // block until the kernel and the copy have finished
+}
+
+void GHistIndexMatrix::ResizeIndex(size_t n_index, bool isDense) {  // Pick the bin index width from max_num_bins and isDense, then size index for n_index entries.
+  if ((max_num_bins - 1 <= static_cast<int>(std::numeric_limits<uint8_t>::max())) && isDense) {  // dense and max_num_bins - 1 fits in uint8_t
+    index.SetBinTypeSize(BinTypeSize::kUint8BinsTypeSize);
+    index.Resize((sizeof(uint8_t)) * n_index);  // Resize takes a byte count
+  } else if ((max_num_bins - 1 > static_cast<int>(std::numeric_limits<uint8_t>::max())  &&
+    max_num_bins - 1 <= static_cast<int>(std::numeric_limits<uint16_t>::max())) && isDense) {  // dense and max_num_bins - 1 fits in uint16_t
+    index.SetBinTypeSize(BinTypeSize::kUint16BinsTypeSize);
+    index.Resize((sizeof(uint16_t)) * n_index);
+  } else {  // not dense, or max_num_bins - 1 exceeds the uint16_t range
+    index.SetBinTypeSize(BinTypeSize::kUint32BinsTypeSize);
+    index.Resize((sizeof(uint32_t)) * n_index);
+  }
+}
+
+void GHistIndexMatrix::Init(::sycl::queue qu,  // Compute the cuts, size the index storage and fill it from the given device matrix.
+                            Context const * ctx,
+                            const DeviceMatrix& p_fmat_device,
+                            int max_bins) {
+  nfeatures = p_fmat_device.p_mat->Info().num_col_;
+
+  cut = xgboost::common::SketchOnDMatrix(ctx, p_fmat_device.p_mat, max_bins);  // cuts come from the underlying DMatrix
+  cut_device.Init(qu, cut);  // initialise cut_device from cut on queue qu
+
+  max_num_bins = max_bins;
+  const uint32_t nbins = cut.Ptrs().back();  // the last cut pointer is used as the total number of bins
+  this->nbins = nbins;
+  hit_count.resize(nbins, 0);
+  hit_count_buff.Resize(&qu, nbins, 0);  // counters that SetIndexData increments and copies into hit_count
+
+  this->p_fmat = p_fmat_device.p_mat;
+  const bool isDense = p_fmat_device.p_mat->IsDense();
+  this->isDense_ = isDense;
+
+  index.setQueue(qu);
+
+  row_stride = 0;  // becomes the largest number of entries found in any row
+  for (const auto& batch : p_fmat_device.p_mat->GetBatches<SparsePage>()) {
+    const auto& row_offset = batch.offset.ConstHostVector();
+    for (auto i = 1ull; i < row_offset.size(); i++) {
+      row_stride = std::max(row_stride, static_cast<size_t>(row_offset[i] - row_offset[i - 1]));  // length of row i - 1
+    }
+  }
+
+  const size_t n_offsets = cut_device.Ptrs().Size() - 1;  // every cut pointer except the last one
+  const size_t n_rows = p_fmat_device.row_ptr.Size() - 1;
+  const size_t n_index = n_rows * row_stride;  // every row gets row_stride index slots
+  ResizeIndex(n_index, isDense);
+
+  CHECK_GT(cut_device.Values().Size(), 0U);
+
+  uint32_t* offsets = nullptr;  // stays null for a sparse matrix
+  if (isDense) {
+    index.ResizeOffset(n_offsets);
+    offsets = index.Offset();
+    qu.memcpy(offsets, cut_device.Ptrs().DataConst(),
+              sizeof(uint32_t) * n_offsets).wait_and_throw();  // copy the first n_offsets cut pointers into the index offsets and wait
+  }
+
+  if (isDense) {
+    BinTypeSize curent_bin_size = index.GetBinTypeSize();  // width selected by ResizeIndex above
+    if (curent_bin_size == BinTypeSize::kUint8BinsTypeSize) {
+      SetIndexData(qu, index.data<uint8_t>(), p_fmat_device, nbins, row_stride, offsets);
+
+    } else if (curent_bin_size == BinTypeSize::kUint16BinsTypeSize) {
+      SetIndexData(qu, index.data<uint16_t>(), p_fmat_device, nbins, row_stride, offsets);
+    } else {
+      CHECK_EQ(curent_bin_size, BinTypeSize::kUint32BinsTypeSize);
+      SetIndexData(qu, index.data<uint32_t>(), p_fmat_device, nbins, row_stride, offsets);
+    }
+  /* For a sparse DMatrix we have to store the index of the feature for each bin
+     in the index field to choose the right offset. So offsets is nullptr and the index is not reduced */
+  } else {
+    SetIndexData(qu, index.data<uint32_t>(), p_fmat_device, nbins, row_stride, offsets);  // sparse data always uses 32-bit indices
+  }
+}
+
+}  // namespace common
+}  // namespace sycl
+}  // namespace xgboost
Original file line number	Diff line number	Diff line change
Expand Up		@@ -2,3 +2,4 @@ library(testthat)
		library(xgboost)

		test_check("xgboost", reporter = ProgressReporter)
		RhpcBLASctl::omp_set_num_threads(1)