Skip to content

[WIP] Remove global random engine. #10354

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 5 commits into
base: master
Choose a base branch
from
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 0 additions & 7 deletions include/xgboost/base.h
Original file line number Diff line number Diff line change
@@ -31,13 +31,6 @@
#define XGBOOST_LOG_WITH_TIME 1
#endif // XGBOOST_LOG_WITH_TIME

/*!
* \brief Whether to customize global PRNG.
*/
#ifndef XGBOOST_CUSTOMIZE_GLOBAL_PRNG
#define XGBOOST_CUSTOMIZE_GLOBAL_PRNG 0
#endif // XGBOOST_CUSTOMIZE_GLOBAL_PRNG

/*!
* \brief Check if alignas(*) keyword is supported. (g++ 4.8 or higher)
*/
16 changes: 12 additions & 4 deletions include/xgboost/context.h
Original file line number Diff line number Diff line change
@@ -15,8 +15,11 @@
#include <type_traits> // for invoke_result_t, is_same_v, underlying_type_t

namespace xgboost {

class Json;
struct CUDAContext;
namespace common {
class RandomEngine;
} // namespace common

// symbolic names
struct DeviceSym {
@@ -46,9 +49,7 @@ struct DeviceOrd {
[[nodiscard]] bool IsSyclDefault() const { return device == kSyclDefault; }
[[nodiscard]] bool IsSyclCPU() const { return device == kSyclCPU; }
[[nodiscard]] bool IsSyclGPU() const { return device == kSyclGPU; }
[[nodiscard]] bool IsSycl() const { return (IsSyclDefault() ||
IsSyclCPU() ||
IsSyclGPU()); }
[[nodiscard]] bool IsSycl() const { return (IsSyclDefault() || IsSyclCPU() || IsSyclGPU()); }

constexpr DeviceOrd() = default;
constexpr DeviceOrd(Type type, bst_d_ordinal_t ord) : device{type}, ordinal{ord} {}
@@ -296,6 +297,11 @@ struct Context : public XGBoostParameter<Context> {
.describe("Enable checking whether parameters are used or not.");
}

[[nodiscard]] auto& Rng() const { return *rng_; }

void SaveConfig(Json* out) const;
void LoadConfig(Json const& in);

private:
void SetDeviceOrdinal(Args const& kwargs);
Context& SetDevice(DeviceOrd d) {
@@ -307,6 +313,8 @@ struct Context : public XGBoostParameter<Context> {
// shared_ptr is used instead of unique_ptr as with unique_ptr it's difficult to define
// p_impl while trying to hide CUDA code from the host compiler.
mutable std::shared_ptr<CUDAContext> cuctx_;
// mutable for random engine. The rng is shared by child contexts, if there are any.
mutable std::shared_ptr<common::RandomEngine> rng_;
// cached value for CFS CPU limit. (used in containerized env)
std::int32_t cfs_cpu_count_; // NOLINT
};
14 changes: 1 addition & 13 deletions src/common/common.cc
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright 2015-2023 by Contributors
* Copyright 2015-2024 by Contributors
*/
#include "common.h"

@@ -9,19 +9,7 @@
#include <cstdio> // for snprintf, size_t
#include <string> // for string

#include "./random.h" // for GlobalRandomEngine, GlobalRandom

namespace xgboost::common {
/*! \brief thread local entry for random. */
struct RandomThreadLocalEntry {
/*! \brief the random engine instance. */
GlobalRandomEngine engine;
};

using RandomThreadLocalStore = dmlc::ThreadLocalStore<RandomThreadLocalEntry>;

GlobalRandomEngine &GlobalRandom() { return RandomThreadLocalStore::Get()->engine; }

void EscapeU8(std::string const &string, std::string *p_buffer) {
auto &buffer = *p_buffer;
for (size_t i = 0; i < string.length(); i++) {
6 changes: 3 additions & 3 deletions src/common/random.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright 2023, XGBoost Contributors
* Copyright 2023-2024, XGBoost Contributors
*/
#include <thrust/shuffle.h> // for shuffle

@@ -19,7 +19,7 @@ void WeightedSamplingWithoutReplacement(Context const *ctx, common::Span<bst_fea
common::Span<float const> weights,
common::Span<bst_feature_t> results,
HostDeviceVector<bst_feature_t> *sorted_idx,
GlobalRandomEngine *grng) {
RandomEngine *grng) {
CUDAContext const *cuctx = ctx->CUDACtx();
CHECK_EQ(array.size(), weights.size());
// Sampling keys
@@ -61,7 +61,7 @@ void SampleFeature(Context const *ctx, bst_feature_t n_features,
std::shared_ptr<HostDeviceVector<bst_feature_t>> p_new_features,
HostDeviceVector<float> const &feature_weights,
HostDeviceVector<float> *weight_buffer,
HostDeviceVector<bst_feature_t> *idx_buffer, GlobalRandomEngine *grng) {
HostDeviceVector<bst_feature_t> *idx_buffer, RandomEngine *grng) {
CUDAContext const *cuctx = ctx->CUDACtx();
auto &new_features = *p_new_features;
new_features.SetDevice(ctx->Device());
78 changes: 11 additions & 67 deletions src/common/random.h
Original file line number Diff line number Diff line change
@@ -7,80 +7,24 @@
#ifndef XGBOOST_COMMON_RANDOM_H_
#define XGBOOST_COMMON_RANDOM_H_

#include <xgboost/logging.h>

#include <algorithm>
#include <functional>
#include <limits>
#include <map>
#include <memory>
#include <numeric>
#include <random>
#include <utility>
#include <vector>

#include "../collective/broadcast.h" // for Broadcast
#include "../collective/communicator-inl.h"
#include "algorithm.h" // ArgSort
#include "common.h"
#include "xgboost/context.h" // Context
#include "xgboost/host_device_vector.h"
#include "xgboost/linalg.h"
#include "../collective/broadcast.h" // for Broadcast
#include "algorithm.h" // ArgSort
#include "xgboost/context.h" // Context
#include "xgboost/host_device_vector.h" // for HostDeviceVector
#include "xgboost/linalg.h" // for MakeVec
#include "xgboost/logging.h"

namespace xgboost::common {
/*!
* \brief Define mt19937 as default type Random Engine.
*/
using RandomEngine = std::mt19937;

#if defined(XGBOOST_CUSTOMIZE_GLOBAL_PRNG) && XGBOOST_CUSTOMIZE_GLOBAL_PRNG == 1
/*!
* \brief A customized random engine, used to plug in a PRNG from other systems.
* The implementation of this library is not provided by xgboost core library.
* Instead the other library can implement this class, which will be used as GlobalRandomEngine
* Used when XGBOOST_CUSTOMIZE_GLOBAL_PRNG == 1; this is switched off by default.
*/
class CustomGlobalRandomEngine {
public:
/*! \brief The result type */
using result_type = uint32_t;
/*! \brief The minimum of random numbers generated */
inline static constexpr result_type min() {
return 0;
}
/*! \brief The maximum random numbers generated */
inline static constexpr result_type max() {
return std::numeric_limits<result_type>::max();
}
/*!
* \brief seed function, to be implemented
* \param val The value of the seed.
*/
void seed(result_type val);
/*!
* \return next random number.
*/
result_type operator()();
};

/*!
* \brief global random engine
*/
typedef CustomGlobalRandomEngine GlobalRandomEngine;

#else
/*!
* \brief global random engine
*/
using GlobalRandomEngine = RandomEngine;
#endif // XGBOOST_CUSTOMIZE_GLOBAL_PRNG

/*!
* \brief global singleton of a random engine.
* This random engine is thread-local and
* only visible to current thread.
*/
GlobalRandomEngine& GlobalRandom(); // NOLINT(*)
class RandomEngine : public std::mt19937 {};

/*
* Original paper:
@@ -96,7 +40,7 @@ std::vector<T> WeightedSamplingWithoutReplacement(Context const* ctx, std::vecto
CHECK_EQ(array.size(), weights.size());
std::vector<float> keys(weights.size());
std::uniform_real_distribution<float> dist;
auto& rng = GlobalRandom();
auto& rng = ctx->Rng();
for (size_t i = 0; i < array.size(); ++i) {
auto w = std::max(weights.at(i), kRtEps);
auto u = dist(rng);
@@ -120,7 +64,7 @@ void SampleFeature(Context const* ctx, bst_feature_t n_features,
std::shared_ptr<HostDeviceVector<bst_feature_t>> p_new_features,
HostDeviceVector<float> const& feature_weights,
HostDeviceVector<float>* weight_buffer,
HostDeviceVector<bst_feature_t>* idx_buffer, GlobalRandomEngine* grng);
HostDeviceVector<bst_feature_t>* idx_buffer, RandomEngine* grng);

void InitFeatureSet(Context const* ctx,
std::shared_ptr<HostDeviceVector<bst_feature_t>> p_features);
@@ -140,7 +84,7 @@ class ColumnSampler {
float colsample_bylevel_{1.0f};
float colsample_bytree_{1.0f};
float colsample_bynode_{1.0f};
GlobalRandomEngine rng_;
RandomEngine rng_;
Context const* ctx_;

// Used for weighted sampling.
@@ -230,7 +174,7 @@ class ColumnSampler {
};

inline auto MakeColumnSampler(Context const* ctx) {
std::uint32_t seed = common::GlobalRandomEngine()();
std::uint32_t seed = ctx->Rng()();
auto rc = collective::Broadcast(ctx, linalg::MakeVec(&seed, 1), 0);
collective::SafeColl(rc);
auto cs = std::make_shared<common::ColumnSampler>(seed);
30 changes: 27 additions & 3 deletions src/context.cc
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright 2014-2023 by XGBoost Contributors
* Copyright 2014-2024, XGBoost Contributors
*
* \brief Context object used for controlling runtime parameters.
*/
@@ -8,21 +8,27 @@
#include <algorithm> // for find_if
#include <charconv> // for from_chars
#include <iterator> // for distance
#include <locale> // for locale
#include <optional> // for optional
#include <regex> // for regex_replace, regex_match
#include <sstream> // for stringstream

#include "common/common.h" // AssertGPUSupport
#include "common/error_msg.h" // WarnDeprecatedGPUId
#include "common/threading_utils.h"
#include "xgboost/json.h" // for Json
#include "xgboost/string_view.h"
#include "common/random.h" // for RandomEngine

namespace xgboost {

DMLC_REGISTER_PARAMETER(Context);

std::int64_t constexpr Context::kDefaultSeed;

Context::Context() : cfs_cpu_count_{common::GetCfsCPUCount()} {}
// Construct a Context that owns its own PRNG (replacing the removed
// thread-local global engine). Seeding with kDefaultSeed makes a freshly
// constructed Context deterministic; LearnerConfiguration later re-seeds
// the engine from the user-supplied `seed` parameter.
Context::Context()
: rng_{std::make_shared<common::RandomEngine>()}, cfs_cpu_count_{common::GetCfsCPUCount()} {
rng_->seed(kDefaultSeed);
}

namespace {
inline constexpr char const* kDevice = "device";
@@ -219,6 +225,24 @@ void Context::Init(Args const& kwargs) {
}
}

// Serialize the context parameters plus the PRNG state into `out`, so that
// a save/load round trip reproduces the same random sequence.
void Context::SaveConfig(Json* out) const {
(*out) = ToJson(*this);
// std::mt19937's stream insertion writes the engine state as formatted
// integers, which is locale-sensitive; a fixed locale is imbued so the
// text matches what LoadConfig() expects regardless of the global locale.
// NOTE(review): std::locale{"en_US.UTF8"} throws if that locale is not
// installed on the host -- TODO confirm this is acceptable on all targets.
std::stringstream ss;
ss.imbue(std::locale{"en_US.UTF8"});
ss << this->Rng();
(*out)["rng"] = ss.str();
}

// Restore the context parameters and PRNG state previously written by
// SaveConfig().
void Context::LoadConfig(Json const& in) {
FromJson(in, this);
std::stringstream ss;
// Must use the same fixed locale as SaveConfig() for a faithful round trip.
ss.imbue(std::locale{"en_US.UTF8"});
// NOTE(review): `in["rng"]` requires the "rng" key to exist; models saved
// before this change have a "generic_param" without it -- verify backward
// compatibility for loading old models.
ss << get<String const>(in["rng"]);
ss >> this->Rng();
// Make sure the GPU ID is valid in the new environment before running configure.
this->ConfigureGpuId(false);
}

void Context::ConfigureGpuId(bool require_gpu) {
if (this->IsCPU() && require_gpu) {
this->UpdateAllowUnknown(Args{{kDevice, DeviceSym::CUDA()}});
5 changes: 2 additions & 3 deletions src/gbm/gbtree.cc
Original file line number Diff line number Diff line change
@@ -10,8 +10,7 @@
#include <dmlc/parameter.h>

#include <algorithm> // for equal
#include <cinttypes> // for uint32_t
#include <limits>
#include <cstdint> // for uint32_t
#include <memory>
#include <string>
#include <utility>
@@ -928,7 +927,7 @@ class Dart : public GBTree {
idx_drop_.clear();

std::uniform_real_distribution<> runif(0.0, 1.0);
auto& rnd = common::GlobalRandom();
auto& rnd = ctx_->Rng();
bool skip = false;
if (dparam_.skip_drop > 0.0) skip = (runif(rnd) < dparam_.skip_drop);
// sample some trees to drop
17 changes: 7 additions & 10 deletions src/learner.cc
Original file line number Diff line number Diff line change
@@ -23,13 +23,11 @@
#include <limits> // for numeric_limits
#include <memory> // for allocator, unique_ptr, shared_ptr, operator==
#include <mutex> // for mutex, lock_guard
#include <set> // for set
#include <sstream> // for operator<<, basic_ostream, basic_ostream::opera...
#include <stack> // for stack
#include <string> // for basic_string, char_traits, operator<, string
#include <system_error> // for errc
#include <tuple> // for get
#include <unordered_map> // for operator!=, unordered_map
#include <utility> // for pair, as_const, move, swap
#include <vector> // for vector

@@ -41,7 +39,7 @@
#include "common/error_msg.h" // for MaxFeatureSize, WarnOldSerialization, ...
#include "common/io.h" // for PeekableInStream, ReadAll, FixedSizeStream, Mem...
#include "common/observer.h" // for TrainingObserver
#include "common/random.h" // for GlobalRandom
#include "common/random.h" // for RandomEngine
#include "common/timer.h" // for Monitor
#include "common/version.h" // for Version
#include "dmlc/endian.h" // for ByteSwap, DMLC_IO_NO_ENDIAN_SWAP
@@ -476,7 +474,7 @@ class LearnerConfiguration : public Learner {

// set seed only before the model is initialized
if (!initialized || ctx_.seed != old_seed) {
common::GlobalRandom().seed(ctx_.seed);
ctx_.Rng().seed(ctx_.seed);
}

// must precede configure gbm since num_features is required for gbm
@@ -556,9 +554,7 @@ class LearnerConfiguration : public Learner {
}
}

FromJson(learner_parameters.at("generic_param"), &ctx_);
// make sure the GPU ID is valid in new environment before start running configure.
ctx_.ConfigureGpuId(false);
ctx_.LoadConfig(learner_parameters.at("generic_param"));

this->need_configuration_ = true;
}
@@ -588,7 +584,8 @@ class LearnerConfiguration : public Learner {
}
learner_parameters["metrics"] = Array(std::move(metrics));

learner_parameters["generic_param"] = ToJson(ctx_);
learner_parameters["generic_param"] = Object{};
ctx_.SaveConfig(&learner_parameters["generic_param"]);
}

void SetParam(const std::string& key, const std::string& value) override {
@@ -1271,7 +1268,7 @@ class LearnerImpl : public LearnerIO {
this->InitBaseScore(train.get());

if (ctx_.seed_per_iteration) {
common::GlobalRandom().seed(ctx_.seed * kRandSeedMagic + iter);
ctx_.Rng().seed(ctx_.seed * kRandSeedMagic + iter);
}

this->ValidateDMatrix(train.get(), true);
@@ -1298,7 +1295,7 @@ class LearnerImpl : public LearnerIO {
this->Configure();

if (ctx_.seed_per_iteration) {
common::GlobalRandom().seed(ctx_.seed * kRandSeedMagic + iter);
ctx_.Rng().seed(ctx_.seed * kRandSeedMagic + iter);
}

this->ValidateDMatrix(train.get(), true);
8 changes: 4 additions & 4 deletions src/linear/coordinate_common.h
Original file line number Diff line number Diff line change
@@ -278,13 +278,13 @@ class CyclicFeatureSelector : public FeatureSelector {
class ShuffleFeatureSelector : public FeatureSelector {
public:
using FeatureSelector::FeatureSelector;
void Setup(Context const *, const gbm::GBLinearModel &model, const std::vector<GradientPair> &,
void Setup(Context const *ctx, const gbm::GBLinearModel &model, const std::vector<GradientPair> &,
DMatrix *, float, float, int) override {
if (feat_index_.size() == 0) {
feat_index_.resize(model.learner_model_param->num_feature);
std::iota(feat_index_.begin(), feat_index_.end(), 0);
}
std::shuffle(feat_index_.begin(), feat_index_.end(), common::GlobalRandom());
std::shuffle(feat_index_.begin(), feat_index_.end(), ctx->Rng());
}

int NextFeature(Context const *, int iteration, const gbm::GBLinearModel &model, int,
@@ -303,9 +303,9 @@ class ShuffleFeatureSelector : public FeatureSelector {
class RandomFeatureSelector : public FeatureSelector {
public:
using FeatureSelector::FeatureSelector;
int NextFeature(Context const *, int, const gbm::GBLinearModel &model, int,
int NextFeature(Context const *ctx, int, const gbm::GBLinearModel &model, int,
const std::vector<GradientPair> &, DMatrix *, float, float) override {
return common::GlobalRandom()() % model.learner_model_param->num_feature;
return ctx->Rng()() % model.learner_model_param->num_feature;
}
};

14 changes: 7 additions & 7 deletions src/tree/gpu_hist/gradient_based_sampler.cu
Original file line number Diff line number Diff line change
@@ -187,7 +187,7 @@ GradientBasedSample UniformSampling::Sample(Context const* ctx, common::Span<Gra
auto cuctx = ctx->CUDACtx();
thrust::replace_if(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair),
thrust::counting_iterator<std::size_t>(0),
BernoulliTrial(common::GlobalRandom()(), subsample_), GradientPair());
BernoulliTrial(ctx->Rng()(), subsample_), GradientPair());
auto page = (*dmat->GetBatches<EllpackPage>(ctx, batch_param_).begin()).Impl();
return {dmat->Info().num_row_, page, gpair};
}
@@ -206,7 +206,7 @@ GradientBasedSample ExternalMemoryUniformSampling::Sample(Context const* ctx,
// Set gradient pair to 0 with p = 1 - subsample
thrust::replace_if(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair),
thrust::counting_iterator<std::size_t>(0),
BernoulliTrial(common::GlobalRandom()(), subsample_), GradientPair{});
BernoulliTrial(ctx->Rng()(), subsample_), GradientPair{});

// Count the sampled rows.
size_t sample_rows =
@@ -260,7 +260,7 @@ GradientBasedSample GradientBasedSampling::Sample(Context const* ctx,
thrust::transform(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair),
thrust::counting_iterator<size_t>(0), dh::tbegin(gpair),
PoissonSampling(dh::ToSpan(threshold_), threshold_index,
RandomWeight(common::GlobalRandom()())));
RandomWeight(ctx->Rng()())));
return {n_rows, page, gpair};
}

@@ -282,10 +282,10 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(Context const* c
gpair, dh::ToSpan(threshold_), dh::ToSpan(grad_sum_), n_rows * subsample_);

// Perform Poisson sampling in place.
thrust::transform(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair),
thrust::counting_iterator<size_t>(0), dh::tbegin(gpair),
PoissonSampling(dh::ToSpan(threshold_), threshold_index,
RandomWeight(common::GlobalRandom()())));
thrust::transform(
cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair), thrust::counting_iterator<size_t>(0),
dh::tbegin(gpair),
PoissonSampling(dh::ToSpan(threshold_), threshold_index, RandomWeight(ctx->Rng()())));

// Count the sampled rows.
size_t sample_rows = thrust::count_if(dh::tbegin(gpair), dh::tend(gpair), IsNonZero());
27 changes: 14 additions & 13 deletions src/tree/hist/evaluate_splits.h
Original file line number Diff line number Diff line change
@@ -12,19 +12,20 @@
#include <utility> // for move
#include <vector> // for vector

#include "../../collective/allgather.h"
#include "../../common/categorical.h" // for CatBitField
#include "../../common/hist_util.h" // for GHistRow, HistogramCuts
#include "../../common/linalg_op.h" // for cbegin, cend, begin
#include "../../common/random.h" // for ColumnSampler
#include "../constraints.h" // for FeatureInteractionConstraintHost
#include "../param.h" // for TrainParam
#include "../split_evaluator.h" // for TreeEvaluator
#include "expand_entry.h" // for MultiExpandEntry
#include "hist_cache.h" // for BoundedHistCollection
#include "xgboost/base.h" // for bst_node_t, bst_target_t, bst_feature_t
#include "xgboost/context.h" // for Context
#include "xgboost/linalg.h" // for Constants, Vector
#include "../../collective/allgather.h" // for VectorAllgatherV
#include "../../collective/communicator-inl.h" // for GetWorldSize
#include "../../common/categorical.h" // for CatBitField
#include "../../common/hist_util.h" // for GHistRow, HistogramCuts
#include "../../common/linalg_op.h" // for cbegin, cend, begin
#include "../../common/random.h" // for ColumnSampler
#include "../constraints.h" // for FeatureInteractionConstraintHost
#include "../param.h" // for TrainParam
#include "../split_evaluator.h" // for TreeEvaluator
#include "expand_entry.h" // for MultiExpandEntry
#include "hist_cache.h" // for BoundedHistCollection
#include "xgboost/base.h" // for bst_node_t, bst_target_t, bst_feature_t
#include "xgboost/context.h" // for Context
#include "xgboost/linalg.h" // for Constants, Vector

namespace xgboost::tree {
/**
28 changes: 9 additions & 19 deletions src/tree/hist/sampler.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright 2020-2023 by XGBoost Contributors
* Copyright 2020-2024, XGBoost Contributors
*/
#ifndef XGBOOST_TREE_HIST_SAMPLER_H_
#define XGBOOST_TREE_HIST_SAMPLER_H_
@@ -8,12 +8,12 @@
#include <cstdint> // std::uint64_t
#include <random> // bernoulli_distribution, linear_congruential_engine

#include "../../common/random.h" // GlobalRandom
#include "../param.h" // TrainParam
#include "xgboost/base.h" // GradientPair
#include "xgboost/context.h" // Context
#include "xgboost/data.h" // MetaInfo
#include "xgboost/linalg.h" // TensorView
#include "../../common/random.h" // for RandomEngine
#include "../param.h" // for TrainParam
#include "xgboost/base.h" // for GradientPair
#include "xgboost/context.h" // for Context
#include "xgboost/data.h" // for MetaInfo
#include "xgboost/linalg.h" // for TensorView

namespace xgboost {
namespace tree {
@@ -55,18 +55,9 @@ inline void SampleGradient(Context const* ctx, TrainParam param,
return;
}
bst_idx_t n_samples = out.Shape(0);
auto& rnd = common::GlobalRandom();
auto& rng = ctx->Rng();

#if XGBOOST_CUSTOMIZE_GLOBAL_PRNG
std::bernoulli_distribution coin_flip(param.subsample);
CHECK_EQ(out.Shape(1), 1) << "Multi-target with sampling for R is not yet supported.";
for (size_t i = 0; i < n_samples; ++i) {
if (!(out(i, 0).GetHess() >= 0.0f && coin_flip(rnd)) || out(i, 0).GetGrad() == 0.0f) {
out(i, 0) = GradientPair(0);
}
}
#else
std::uint64_t initial_seed = rnd();
std::uint64_t initial_seed = rng();

auto n_threads = static_cast<size_t>(ctx->Threads());
std::size_t const discard_size = n_samples / n_threads;
@@ -102,7 +93,6 @@ inline void SampleGradient(Context const* ctx, TrainParam param,
});
}
exc.Rethrow();
#endif // XGBOOST_CUSTOMIZE_GLOBAL_PRNG
}
} // namespace tree
} // namespace xgboost
7 changes: 4 additions & 3 deletions src/tree/updater_colmaker.cc
Original file line number Diff line number Diff line change
@@ -8,7 +8,8 @@
#include <cmath>
#include <vector>

#include "../common/error_msg.h" // for NoCategorical
#include "../collective/communicator-inl.h" // for IsDistributed
#include "../common/error_msg.h" // for NoCategorical
#include "../common/random.h"
#include "constraints.h"
#include "param.h"
@@ -224,8 +225,8 @@ class ColMaker: public TreeUpdater {
<< "Only uniform sampling is supported, "
<< "gradient-based sampling is only support by GPU Hist.";
std::bernoulli_distribution coin_flip(param_.subsample);
auto& rnd = common::GlobalRandom();
for (size_t ridx = 0; ridx < position_.size(); ++ridx) {
auto &rnd = ctx_->Rng();
for (bst_idx_t ridx = 0; ridx < position_.size(); ++ridx) {
if (gpair[ridx].GetHess() < 0.0f) continue;
if (!coin_flip(rnd)) position_[ridx] = ~position_[ridx];
}
4 changes: 2 additions & 2 deletions src/tree/updater_gpu_hist.cu
Original file line number Diff line number Diff line change
@@ -867,7 +867,7 @@ class GPUHistMaker : public TreeUpdater {
info_ = &dmat->Info();

// Synchronise the column sampling seed
uint32_t column_sampling_seed = common::GlobalRandom()();
std::uint32_t column_sampling_seed = ctx_->Rng()();
auto rc = collective::Broadcast(
ctx_, linalg::MakeVec(&column_sampling_seed, sizeof(column_sampling_seed)), 0);
SafeColl(rc);
@@ -1011,7 +1011,7 @@ class GPUGlobalApproxMaker : public TreeUpdater {

monitor_.Start(__func__);
CHECK(ctx_->IsCUDA()) << error::InvalidCUDAOrdinal();
uint32_t column_sampling_seed = common::GlobalRandom()();
std::uint32_t column_sampling_seed = ctx_->Rng()();
this->column_sampler_ = std::make_shared<common::ColumnSampler>(column_sampling_seed);

p_last_fmat_ = p_fmat;
2 changes: 1 addition & 1 deletion tests/cpp/test_learner.cc
Original file line number Diff line number Diff line change
@@ -399,7 +399,7 @@ TEST(Learner, ConstantSeed) {
learner->Configure(); // seed the global random

std::uniform_real_distribution<float> dist;
auto& rng = common::GlobalRandom();
auto& rng = learner->Ctx()->Rng();
float v_0 = dist(rng);

learner->SetParam("", "");
11 changes: 5 additions & 6 deletions tests/cpp/test_serialization.cc
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright (c) 2019-2023, XGBoost Contributors
* Copyright (c) 2019-2024, XGBoost Contributors
*/
#include <gtest/gtest.h>
#include <xgboost/base.h>
@@ -148,13 +148,12 @@ void TestLearnerSerialization(Args args, FeatureMap const& fmap, std::shared_ptr
std::string continued_model;
{
// Continue the previous training with another kIters
std::unique_ptr<dmlc::Stream> fi(
dmlc::Stream::Create(fname.c_str(), "r"));
std::unique_ptr<dmlc::Stream> fi(dmlc::Stream::Create(fname.c_str(), "r"));
std::unique_ptr<Learner> learner{Learner::Create({p_dmat})};
learner->Load(fi.get());
learner->Configure();

// verify the loaded model doesn't change.
// Verify the loaded model doesn't change.
std::string serialised_model_tmp;
common::MemoryBufferStream mem_out(&serialised_model_tmp);
learner->Save(&mem_out);
@@ -484,7 +483,7 @@ class LogitSerializationTest : public SerializationTest {
auto& h_labels = p_dmat->Info().labels.Data()->HostVector();

std::bernoulli_distribution flip(0.5);
auto& rnd = common::GlobalRandom();
auto& rnd = p_dmat->Ctx()->Rng();
rnd.seed(0);

for (auto& v : h_labels) { v = flip(rnd); }
@@ -608,7 +607,7 @@ class MultiClassesSerializationTest : public SerializationTest {
auto &h_labels = p_dmat->Info().labels.Data()->HostVector();

std::uniform_int_distribution<size_t> categorical(0, kClasses - 1);
auto& rnd = common::GlobalRandom();
auto& rnd = p_dmat->Ctx()->Rng();
rnd.seed(0);

for (auto& v : h_labels) { v = categorical(rnd); }
28 changes: 16 additions & 12 deletions tests/cpp/tree/test_gpu_hist.cu
Original file line number Diff line number Diff line change
@@ -258,7 +258,6 @@ TEST(GpuHist, UniformSampling) {
constexpr size_t kRows = 4096;
constexpr size_t kCols = 2;
constexpr float kSubsample = 0.9999;
common::GlobalRandom().seed(1994);

// Create an in-memory DMatrix.
std::unique_ptr<DMatrix> dmat(CreateSparsePageDMatrixWithRC(kRows, kCols, 0, true));
@@ -274,8 +273,11 @@ TEST(GpuHist, UniformSampling) {
// Build another tree using sampling.
RegTree tree_sampling;
HostDeviceVector<bst_float> preds_sampling(kRows, 0.0, DeviceOrd::CUDA(0));
UpdateTree(&ctx, &gpair, dmat.get(), 0, &tree_sampling, &preds_sampling, kSubsample, "uniform",
kRows);
{
auto ctx = MakeCUDACtx(0);
UpdateTree(&ctx, &gpair, dmat.get(), 0, &tree_sampling, &preds_sampling, kSubsample, "uniform",
kRows);
}

// Make sure the predictions are the same.
auto preds_h = preds.ConstHostVector();
@@ -289,7 +291,6 @@ TEST(GpuHist, GradientBasedSampling) {
constexpr size_t kRows = 4096;
constexpr size_t kCols = 2;
constexpr float kSubsample = 0.9999;
common::GlobalRandom().seed(1994);

// Create an in-memory DMatrix.
std::unique_ptr<DMatrix> dmat(CreateSparsePageDMatrixWithRC(kRows, kCols, 0, true));
@@ -306,8 +307,11 @@ TEST(GpuHist, GradientBasedSampling) {
// Build another tree using sampling.
RegTree tree_sampling;
HostDeviceVector<bst_float> preds_sampling(kRows, 0.0, DeviceOrd::CUDA(0));
UpdateTree(&ctx, &gpair, dmat.get(), 0, &tree_sampling, &preds_sampling, kSubsample,
"gradient_based", kRows);
{
auto ctx = MakeCUDACtx(0);
UpdateTree(&ctx, &gpair, dmat.get(), 0, &tree_sampling, &preds_sampling, kSubsample,
"gradient_based", kRows);
}

// Make sure the predictions are the same.
auto preds_h = preds.ConstHostVector();
@@ -358,7 +362,6 @@ TEST(GpuHist, ExternalMemoryWithSampling) {
constexpr size_t kPageSize = 1024;
constexpr float kSubsample = 0.5;
const std::string kSamplingMethod = "gradient_based";
common::GlobalRandom().seed(0);

dmlc::TemporaryDirectory tmpdir;

@@ -374,18 +377,19 @@ TEST(GpuHist, ExternalMemoryWithSampling) {
gpair.Data()->Copy(GenerateRandomGradients(kRows));

// Build a tree using the in-memory DMatrix.
auto rng = common::GlobalRandom();

RegTree tree;
HostDeviceVector<bst_float> preds(kRows, 0.0, DeviceOrd::CUDA(0));
UpdateTree(&ctx, &gpair, dmat.get(), 0, &tree, &preds, kSubsample, kSamplingMethod, kRows);

// Build another tree using multiple ELLPACK pages.
common::GlobalRandom() = rng;

RegTree tree_ext;
HostDeviceVector<bst_float> preds_ext(kRows, 0.0, DeviceOrd::CUDA(0));
UpdateTree(&ctx, &gpair, dmat_ext.get(), kPageSize, &tree_ext, &preds_ext, kSubsample,
kSamplingMethod, kRows);
{
Context ctx(MakeCUDACtx(0));
UpdateTree(&ctx, &gpair, dmat_ext.get(), kPageSize, &tree_ext, &preds_ext, kSubsample,
kSamplingMethod, kRows);
}

// Make sure the predictions are the same.
auto preds_h = preds.ConstHostVector();