Make the span class trivially copyable. (#10878)
trivialfis authored Oct 9, 2024
1 parent 56d155c commit f7cb8e0
Showing 15 changed files with 25 additions and 36 deletions.
17 changes: 6 additions & 11 deletions include/xgboost/span.h
@@ -433,7 +433,7 @@ class Span {
using const_reverse_iterator = const std::reverse_iterator<const_iterator>; // NOLINT

// constructors
-constexpr Span() __span_noexcept = default;
+constexpr Span() = default;

XGBOOST_DEVICE Span(pointer _ptr, index_type _count) :
size_(_count), data_(_ptr) {
@@ -480,16 +480,11 @@ class Span {
__span_noexcept : size_(_other.size()),
data_(_other.data()) {}

-XGBOOST_DEVICE constexpr Span(const Span& _other)
-__span_noexcept : size_(_other.size()), data_(_other.data()) {}

-XGBOOST_DEVICE Span& operator=(const Span& _other) __span_noexcept {
-size_ = _other.size();
-data_ = _other.data();
-return *this;
-}

-XGBOOST_DEVICE ~Span() __span_noexcept {}; // NOLINT
+constexpr Span(Span const& _other) noexcept(true) = default;
+constexpr Span& operator=(Span const& _other) noexcept(true) = default;
+constexpr Span(Span&& _other) noexcept(true) = default;
+constexpr Span& operator=(Span&& _other) noexcept(true) = default;
+~Span() noexcept(true) = default;

XGBOOST_DEVICE constexpr iterator begin() const __span_noexcept { // NOLINT
return {this, 0};
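A note on what the defaulted special members above buy: a type whose copy/move constructors, copy/move assignment operators, and destructor are all trivial is trivially copyable, so it may legally be copied byte-for-byte, which is what happens when a kernel argument or a by-value lambda capture is materialized on the device. The sketch below is a deliberately simplified stand-in, not the real xgboost::common::Span, and only demonstrates the property:

```cpp
#include <cstddef>
#include <cstring>
#include <type_traits>

// Simplified pointer+size view with every special member defaulted, analogous
// in spirit to the Span change above (names here are illustrative only).
template <typename T>
class SpanSketch {
 public:
  constexpr SpanSketch() = default;
  constexpr SpanSketch(T* ptr, std::size_t size) : size_{size}, data_{ptr} {}
  constexpr SpanSketch(SpanSketch const&) = default;
  constexpr SpanSketch& operator=(SpanSketch const&) = default;
  ~SpanSketch() = default;

  constexpr T* data() const { return data_; }
  constexpr std::size_t size() const { return size_; }

 private:
  std::size_t size_{0};
  T* data_{nullptr};
};

static_assert(std::is_trivially_copyable_v<SpanSketch<int>>);

int main() {
  int values[3] = {1, 2, 3};
  SpanSketch<int> src{values, 3};
  SpanSketch<int> dst;
  // A plain byte copy is well defined precisely because the type is trivially copyable.
  std::memcpy(&dst, &src, sizeof(src));
  return dst.size() == 3 ? 0 : 1;
}
```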
1 change: 0 additions & 1 deletion src/c_api/c_api.cc
@@ -1098,7 +1098,6 @@ XGB_DLL int XGBoosterTrainOneIter(BoosterHandle handle, DMatrixHandle dtrain, in
auto ctx = learner->Ctx();
if (!grad_is_cuda) {
gpair.Reshape(i_grad.Shape(0), i_grad.Shape(1));
-auto const shape = gpair.Shape();
auto h_gpair = gpair.HostView();
DispatchDType(i_grad, DeviceOrd::CPU(), [&](auto &&t_grad) {
DispatchDType(i_hess, DeviceOrd::CPU(), [&](auto &&t_hess) {
4 changes: 1 addition & 3 deletions src/common/hist_util.cu
@@ -180,10 +180,9 @@ void RemoveDuplicatedCategories(Context const* ctx, MetaInfo const& info,
sorted_entries.resize(n_uniques);

// Renew the column scan and cut scan based on categorical data.
-auto d_old_column_sizes_scan = dh::ToSpan(column_sizes_scan);
dh::caching_device_vector<SketchContainer::OffsetT> new_cuts_size(info.num_col_ + 1);
CHECK_EQ(new_column_scan.size(), new_cuts_size.size());
-dh::LaunchN(new_column_scan.size(),
+dh::LaunchN(new_column_scan.size(), ctx->CUDACtx()->Stream(),
[=, d_new_cuts_size = dh::ToSpan(new_cuts_size),
d_old_column_sizes_scan = dh::ToSpan(column_sizes_scan),
d_new_columns_ptr = dh::ToSpan(new_column_scan)] __device__(size_t idx) {
@@ -277,7 +276,6 @@ void ProcessWeightedBatch(Context const* ctx, const SparsePage& page, MetaInfo c
HostDeviceVector<float>* p_out_weight) {
if (hessian.empty()) {
if (info.IsRanking() && !info.weights_.Empty()) {
-common::Span<float const> group_weight = info.weights_.ConstDeviceSpan();
dh::device_vector<bst_group_t> group_ptr(info.group_ptr_);
auto d_group_ptr = dh::ToSpan(group_ptr);
CHECK_GE(d_group_ptr.size(), 2) << "Must have at least 1 group for ranking.";
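Besides dropping the unused span, the hunk above now passes ctx->CUDACtx()->Stream() into dh::LaunchN so the kernel is ordered on the context's stream rather than the legacy default stream. Roughly, a LaunchN-style helper looks like the sketch below; this is an assumed shape for illustration, not the actual signature in src/common/device_helpers.cuh. Note that the functor travels to the kernel by value, which is one practical reason the spans it captures benefit from being trivially copyable:

```cu
#include <cstddef>
#include <cstdint>
#include <cuda_runtime.h>

// Grid-stride loop: each thread handles indices i, i + stride, i + 2*stride, ...
template <typename Fn>
__global__ void LaunchNKernel(std::size_t n, Fn fn) {
  std::size_t stride = static_cast<std::size_t>(gridDim.x) * blockDim.x;
  for (std::size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += stride) {
    fn(i);
  }
}

// Hypothetical LaunchN-like wrapper: enqueue the kernel on the caller's stream.
template <typename Fn>
void LaunchNSketch(std::size_t n, cudaStream_t stream, Fn fn) {
  if (n == 0) {
    return;
  }
  constexpr std::uint32_t kBlockThreads = 256;
  auto n_blocks = static_cast<std::uint32_t>((n + kBlockThreads - 1) / kBlockThreads);
  LaunchNKernel<<<n_blocks, kBlockThreads, 0, stream>>>(n, fn);
}
```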
1 change: 0 additions & 1 deletion src/gbm/gbtree.cc
@@ -167,7 +167,6 @@ void CopyGradient(Context const* ctx, linalg::Matrix<GradientPair> const* in_gpa
GPUCopyGradient(ctx, in_gpair, group_id, out_gpair);
} else {
auto const& in = *in_gpair;
-auto target_gpair = in.Slice(linalg::All(), group_id);
auto h_tmp = out_gpair->HostView();
auto h_in = in.HostView().Slice(linalg::All(), group_id);
CHECK_EQ(h_tmp.Size(), h_in.Size());
2 changes: 0 additions & 2 deletions src/metric/rank_metric.cc
@@ -329,7 +329,6 @@ class EvalPrecision : public EvalRankWithCache<ltr::PreCache> {

auto gptr = p_cache->DataGroupPtr(ctx_);
auto h_label = info.labels.HostView().Slice(linalg::All(), 0);
-auto h_predt = linalg::MakeTensorView(ctx_, &predt, predt.Size());
auto rank_idx = p_cache->SortedIdx(ctx_, predt.ConstHostSpan());

auto weight = common::MakeOptionalWeights(ctx_, info.weights_);
@@ -433,7 +432,6 @@ class EvalMAPScore : public EvalRankWithCache<ltr::MAPCache> {

auto gptr = p_cache->DataGroupPtr(ctx_);
auto h_label = info.labels.HostView().Slice(linalg::All(), 0);
-auto h_predt = linalg::MakeTensorView(ctx_, &predt, predt.Size());

auto map_gloc = p_cache->Map(ctx_);
std::fill_n(map_gloc.data(), map_gloc.size(), 0.0);
3 changes: 0 additions & 3 deletions src/objective/lambdarank_obj.cu
@@ -488,9 +488,6 @@ void LambdaRankGetGradientPairwise(Context const* ctx, std::int32_t iter,
info.labels.SetDevice(device);
predt.SetDevice(device);

-auto d_predt = predt.ConstDeviceSpan();
-auto const d_sorted_idx = p_cache->SortedIdx(ctx, d_predt);

auto delta = [] XGBOOST_DEVICE(float, float, std::size_t, std::size_t, bst_group_t) {
return 1.0;
};
5 changes: 3 additions & 2 deletions src/tree/gpu_hist/evaluate_splits.cu
@@ -7,6 +7,7 @@
#include "../../collective/allgather.h"
#include "../../collective/communicator-inl.h" // for GetWorldSize, GetRank
#include "../../common/categorical.h"
#include "../../common/cuda_context.cuh" // for CUDAContext
#include "evaluate_splits.cuh"
#include "expand_entry.cuh"

@@ -472,8 +473,8 @@ void GPUHistEvaluator::EvaluateSplits(Context const *ctx, const std::vector<bst_

GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit(Context const *ctx, EvaluateSplitInputs input,
EvaluateSplitSharedInputs shared_inputs) {
-dh::device_vector<EvaluateSplitInputs> inputs(1);
-dh::safe_cuda(cudaMemcpyAsync(inputs.data().get(), &input, sizeof(input), cudaMemcpyDefault));
+dh::CachingDeviceUVector<EvaluateSplitInputs> inputs(1);
+dh::safe_cuda(cudaMemcpyAsync(inputs.data(), &input, sizeof(input), cudaMemcpyDefault));

dh::TemporaryArray<GPUExpandEntry> out_entries(1);
this->EvaluateSplits(ctx, {input.nidx}, input.feature_set.size(), dh::ToSpan(inputs),
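The .data().get() → .data() change follows from the container swap: thrust::device_vector<T>::data() returns a thrust::device_ptr<T> that must be unwrapped with .get() before the CUDA runtime can use it, whereas dh::CachingDeviceUVector is assumed here (from the call site) to hand back a raw T* directly. The thrust side in isolation:

```cpp
#include <cstddef>
#include <cuda_runtime.h>
#include <thrust/device_vector.h>

// device_vector::data() yields thrust::device_ptr<int>; .get() extracts the raw
// device pointer that cudaMemcpyAsync expects. Error checking omitted for brevity.
void CopyToDevice(thrust::device_vector<int>* out, int const* host, std::size_t n,
                  cudaStream_t stream) {
  int* raw = out->data().get();
  cudaMemcpyAsync(raw, host, n * sizeof(int), cudaMemcpyDefault, stream);
}
```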
1 change: 0 additions & 1 deletion src/tree/gpu_hist/evaluate_splits.cuh
@@ -10,7 +10,6 @@
#include "../split_evaluator.h"
#include "../updater_gpu_common.cuh"
#include "expand_entry.cuh"
#include "histogram.cuh"

namespace xgboost {
namespace common {
3 changes: 1 addition & 2 deletions src/tree/gpu_hist/evaluator.cu
@@ -7,6 +7,7 @@
#include <thrust/logical.h> // thrust::any_of
#include <thrust/sort.h> // thrust::stable_sort

#include "../../common/cuda_context.cuh" // for CUDAContext
#include "../../common/device_helpers.cuh"
#include "../../common/hist_util.h" // common::HistogramCuts
#include "evaluate_splits.cuh"
@@ -53,9 +54,7 @@ void GPUHistEvaluator::Reset(Context const *ctx, common::HistogramCuts const &cu
* cache feature index binary search result
*/
feature_idx_.resize(cat_sorted_idx_.size());
-auto d_fidxes = dh::ToSpan(feature_idx_);
auto it = thrust::make_counting_iterator(0ul);
-auto values = cuts.cut_values_.ConstDeviceSpan();
thrust::transform(ctx->CUDACtx()->CTP(), it, it + feature_idx_.size(), feature_idx_.begin(),
[=] XGBOOST_DEVICE(size_t i) {
auto fidx = dh::SegmentId(ptrs, i);
2 changes: 1 addition & 1 deletion tests/cpp/common/test_host_device_vector.cu
@@ -146,7 +146,7 @@ TEST(HostDeviceVector, SetDevice) {

vec.SetDevice(device);
ASSERT_EQ(vec.Size(), h_vec.size());
-auto span = vec.DeviceSpan(); // sync to device
+vec.DeviceSpan(); // sync to device

vec.SetDevice(DeviceOrd::CPU()); // pull back to cpu.
ASSERT_EQ(vec.Size(), h_vec.size());
5 changes: 2 additions & 3 deletions tests/cpp/common/test_linalg.cc
@@ -117,9 +117,8 @@ TEST(Linalg, TensorView) {
{
// Don't assign the initial dimension, tensor should be able to deduce the correct dim
// for Slice.
-auto t = MakeTensorView(&ctx, data, 2, 3, 4);
-auto s = t.Slice(1, 2, All());
-static_assert(decltype(s)::kDimension == 1);
+static_assert(decltype(MakeTensorView(&ctx, data, 2, 3, 4).Slice(1, 2, All()))::kDimension ==
+              1);
}
{
auto t = MakeTensorView(&ctx, data, 2, 3, 4);
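Folding the expression into decltype keeps the dimension check entirely at compile time, leaving no named temporaries for the compiler to flag as unused. The same idiom in isolation, with made-up names (ViewSketch and MakeViewSketch are not XGBoost APIs):

```cpp
#include <cstddef>

// Slicing drops one dimension; the check below queries the deduced type's
// static member without constructing anything at run time.
template <std::size_t kDim>
struct ViewSketch {
  static constexpr std::size_t kDimension = kDim;
  ViewSketch<kDim - 1> Slice(std::size_t) const { return {}; }
};

ViewSketch<3> MakeViewSketch();  // declaration only; decltype never evaluates the call

static_assert(decltype(MakeViewSketch().Slice(0))::kDimension == 2);
```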
9 changes: 9 additions & 0 deletions tests/cpp/common/test_span.cc
@@ -11,6 +11,15 @@
#include "../../../src/common/transform_iterator.h" // for MakeIndexTransformIter

namespace xgboost::common {
+namespace {
+using ST = common::Span<int, dynamic_extent>;
+static_assert(std::is_trivially_copyable_v<ST>);
+static_assert(std::is_trivially_move_assignable_v<ST>);
+static_assert(std::is_trivially_move_constructible_v<ST>);
+static_assert(std::is_trivially_copy_assignable_v<ST>);
+static_assert(std::is_trivially_copy_constructible_v<ST>);
+} // namespace

TEST(Span, TestStatus) {
int status = 1;
TestTestStatus {&status}();
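These trait checks are the point of the commit: with the special members defaulted, every assertion holds, whereas the previous user-provided copy constructor, copy assignment, and destructor made Span non-trivial. A standalone contrast, with illustrative struct names:

```cpp
#include <type_traits>

// Defaulted copy operations stay trivial; a user-provided copy constructor,
// even one that copies member-by-member, makes the type non-trivial.
struct Defaulted {
  int* p{nullptr};
  Defaulted() = default;
  Defaulted(Defaulted const&) = default;
};

struct UserProvided {
  int* p{nullptr};
  UserProvided() = default;
  UserProvided(UserProvided const& o) : p{o.p} {}  // user-provided -> non-trivial
};

static_assert(std::is_trivially_copy_constructible_v<Defaulted>);
static_assert(std::is_trivially_copyable_v<Defaulted>);
static_assert(!std::is_trivially_copy_constructible_v<UserProvided>);
static_assert(!std::is_trivially_copyable_v<UserProvided>);
```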
4 changes: 2 additions & 2 deletions tests/cpp/data/test_extmem_quantile_dmatrix.cu
@@ -23,8 +23,8 @@ class ExtMemQuantileDMatrixGpu : public ::testing::TestWithParam<std::tuple<floa
ASSERT_EQ(orig_cuts.Ptrs(), sparse_cuts.Ptrs());

std::vector<common::CompressedByteT> h_orig, h_sparse;
-auto orig_acc = orig.Impl()->GetHostAccessor(ctx, &h_orig, {});
-auto sparse_acc = sparse.Impl()->GetHostAccessor(ctx, &h_sparse, {});
+[[maybe_unused]] auto orig_acc = orig.Impl()->GetHostAccessor(ctx, &h_orig, {});
+[[maybe_unused]] auto sparse_acc = sparse.Impl()->GetHostAccessor(ctx, &h_sparse, {});
ASSERT_EQ(h_orig.size(), h_sparse.size());

auto equal = std::equal(h_orig.cbegin(), h_orig.cend(), h_sparse.cbegin());
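[[maybe_unused]] keeps the two accessor calls, whose side effect of filling h_orig and h_sparse is what the assertions rely on, while suppressing unused-variable warnings for the otherwise unread results. The same pattern in a self-contained form (FillAndCount is a hypothetical stand-in):

```cpp
#include <cstddef>
#include <vector>

// Hypothetical helper: the useful work is the side effect on *out.
std::size_t FillAndCount(std::vector<int>* out) {
  out->assign(4, 0);
  return out->size();
}

int main() {
  std::vector<int> buf;
  // The call is kept for its side effect; [[maybe_unused]] silences the
  // unused-variable warning for the unreferenced result under -Wall -Werror.
  [[maybe_unused]] auto n = FillAndCount(&buf);
  return buf.size() == 4 ? 0 : 1;
}
```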
2 changes: 0 additions & 2 deletions tests/cpp/data/test_simple_dmatrix.cu
@@ -360,8 +360,6 @@ TEST(SimpleDMatrix, FromCupySparse){
auto& batch = *dmat.GetBatches<SparsePage>().begin();
auto page = batch.GetView();

-auto inst0 = page[0];
-auto inst1 = page[1];
EXPECT_EQ(page[0].size(), 1);
EXPECT_EQ(page[1].size(), 1);
EXPECT_EQ(page[0][0].fvalue, 0.0f);
2 changes: 0 additions & 2 deletions tests/cpp/tree/gpu_hist/test_histogram.cu
@@ -104,7 +104,6 @@ void TestBuildHist(bool use_shared_memory_histograms) {
auto ctx = MakeCUDACtx(0);

auto page = BuildEllpackPage(&ctx, kNRows, kNCols);
-BatchParam batch_param{};

xgboost::SimpleLCG gen;
xgboost::SimpleRealUniformDistribution<bst_float> dist(0.0f, 1.0f);
@@ -448,7 +447,6 @@ class HistogramExternalMemoryTest : public ::testing::TestWithParam<std::tuple<f
auto impl = page.Impl();
if (k == 0) {
// Initialization
-auto d_matrix = impl->GetDeviceAccessor(&ctx);
fg = std::make_unique<FeatureGroups>(impl->Cuts());
auto init = GradientPairInt64{0, 0};
multi_hist = decltype(multi_hist)(impl->Cuts().TotalBins(), init);
