Work around issues with cccl main #15552

Merged: 49 commits, May 28, 2024
Changes from 48 commits
Commits (49)
b74a236
Always use `cuda::proclaim_return_type` with device lambdas
miscco Apr 17, 2024
942291c
Work around issue with `thrust::pair` that prevents CTAD
miscco Apr 17, 2024
e172942
Apply suggestions from code review
bdice Apr 17, 2024
83bc096
Merge branch 'branch-24.06' into fix_cccl_compat
ttnghia Apr 19, 2024
32eab67
Merge branch 'fix_cccl_compat' of github.com:miscco/cudf into miscco-…
trxcllnt Apr 22, 2024
dec2915
Merge branch 'branch-24.06' of github.com:rapidsai/cudf into miscco-f…
trxcllnt Apr 22, 2024
6ca0c2a
fix lint
trxcllnt Apr 22, 2024
9b09a9e
Merge branch 'branch-24.06' of github.com:rapidsai/cudf into miscco-f…
trxcllnt May 8, 2024
c39db41
Merge branch 'branch-24.06' into fix_cccl_compat
trxcllnt May 8, 2024
4bda146
Merge branch 'fix_cccl_compat' of github.com:miscco/cudf into miscco-…
trxcllnt May 8, 2024
04f1990
use std::min instead of cuda::std::min
trxcllnt May 8, 2024
586f502
use cuda::proclaim_return_type
trxcllnt May 8, 2024
3598c8f
remove test for int16_t key type that's unsupported by cuda::atomic_ref
trxcllnt May 8, 2024
69796d5
regenerate and add patches that apply to CCCL main
trxcllnt May 8, 2024
8c43425
don't modify whitespace in patches
trxcllnt May 8, 2024
991c789
don't run clang-format on files in cpp/build/*
trxcllnt May 8, 2024
60aecd8
Merge branch 'branch-24.06' of github.com:rapidsai/cudf into fix/cccl…
trxcllnt May 9, 2024
9320968
update devcontainer workflow to use NVIDIA/cccl#pull-request/1667
trxcllnt May 9, 2024
69bf346
Merge branch 'branch-24.06' into fix_cccl_compat
trxcllnt May 13, 2024
6a758bf
Merge branch 'branch-24.06' into fix_cccl_compat
trxcllnt May 14, 2024
bb67523
Merge branch 'branch-24.06' into fix_cccl_compat
trxcllnt May 14, 2024
4ff0c59
test rapids-cmake with CCCL 2.5
trxcllnt May 14, 2024
f122905
pass cuco::cuda_stream_ref
trxcllnt May 14, 2024
434600e
revert changes to pr.yaml
trxcllnt May 14, 2024
75a1606
Merge branch 'branch-24.06' into fix_cccl_compat
trxcllnt May 15, 2024
9872c7c
Merge branch 'branch-24.06' into fix_cccl_compat
trxcllnt May 15, 2024
d85c763
Merge branch 'branch-24.06' into fix_cccl_compat
trxcllnt May 15, 2024
7c1abf8
Merge branch 'branch-24.06' into fix_cccl_compat
trxcllnt May 16, 2024
b14899f
Merge branch 'branch-24.06' into fix_cccl_compat
trxcllnt May 21, 2024
ffdab59
Update cpp/src/join/distinct_hash_join.cu
trxcllnt May 21, 2024
57d9eea
fix lint
trxcllnt May 21, 2024
7b0e75e
Merge branch 'branch-24.06' into fix_cccl_compat
trxcllnt May 21, 2024
9b5bc7a
Merge branch 'branch-24.06' into fix_cccl_compat
trxcllnt May 21, 2024
a011739
Apply suggestions from reviewers
trxcllnt May 21, 2024
78c1a89
revert more thrust::pair changes
trxcllnt May 21, 2024
68c5997
fix lint
trxcllnt May 22, 2024
a4e123a
Merge branch 'branch-24.06' into fix_cccl_compat
trxcllnt May 22, 2024
114db08
Apply suggestions from code review
trxcllnt May 22, 2024
64264a7
fix lint
trxcllnt May 23, 2024
48f22e2
Merge branch 'branch-24.06' of github.com:rapidsai/cudf into fix/cccl…
trxcllnt May 23, 2024
902f06b
include cuda/std/functional
trxcllnt May 23, 2024
3b29345
cuda::std::min -> std::min
trxcllnt May 23, 2024
264dda7
Merge branch 'branch-24.06' into fix_cccl_compat
trxcllnt May 23, 2024
3fce393
Merge branch 'branch-24.06' into fix_cccl_compat
trxcllnt May 23, 2024
d19d41d
Merge branch 'branch-24.06' into fix_cccl_compat
trxcllnt May 24, 2024
422538e
fix orc tests
trxcllnt May 24, 2024
774520c
Merge branch 'branch-24.06' of github.com:rapidsai/cudf into fix/cccl…
trxcllnt May 24, 2024
ef42695
compute and compare num_blocks instead of num_streams
trxcllnt May 24, 2024
d9a4947
revert changes to use rapids-cmake CCCL 2.5 branch
trxcllnt May 24, 2024
2 changes: 2 additions & 0 deletions .pre-commit-config.yaml
@@ -7,11 +7,13 @@ repos:
      - id: trailing-whitespace
        exclude: |
          (?x)^(
+           ^cpp/cmake/thirdparty/patches/.*|
            ^python/cudf/cudf/tests/data/subword_tokenizer_data/.*
          )
      - id: end-of-file-fixer
        exclude: |
          (?x)^(
+           ^cpp/cmake/thirdparty/patches/.*|
            ^python/cudf/cudf/tests/data/subword_tokenizer_data/.*
          )
  - repo: https://github.com/PyCQA/isort
20 changes: 20 additions & 0 deletions cpp/cmake/thirdparty/patches/cccl_override.json
@@ -18,6 +18,11 @@
      "issue" : "thrust::copy introduced a change in behavior that causes failures with cudaErrorInvalidValue.",
      "fixed_in" : ""
    },
+   {
+     "file" : "${current_json_dir}/revert_pr_211_cccl_2.5.0.diff",
+     "issue" : "thrust::copy introduced a change in behavior that causes failures with cudaErrorInvalidValue.",
+     "fixed_in" : ""
+   },
    {
      "file": "cccl/kernel_pointer_hiding.diff",
      "issue": "Hide APIs that accept kernel pointers [https://github.com/NVIDIA/cccl/pull/1395]",
@@ -28,15 +33,30 @@
      "issue" : "Remove 64bit dispatching as not needed by libcudf and results in compiling twice as many kernels [https://github.com/rapidsai/cudf/pull/11437]",
      "fixed_in" : ""
    },
+   {
+     "file" : "${current_json_dir}/thrust_disable_64bit_dispatching_cccl_2.5.0.diff",
+     "issue" : "Remove 64bit dispatching as not needed by libcudf and results in compiling twice as many kernels [https://github.com/rapidsai/cudf/pull/11437]",
+     "fixed_in" : ""
+   },
    {
      "file" : "${current_json_dir}/thrust_faster_sort_compile_times.diff",
      "issue" : "Improve Thrust sort compile times by not unrolling loops for inlined comparators [https://github.com/rapidsai/cudf/pull/10577]",
      "fixed_in" : ""
    },
+   {
+     "file" : "${current_json_dir}/thrust_faster_sort_compile_times_cccl_2.5.0.diff",
+     "issue" : "Improve Thrust sort compile times by not unrolling loops for inlined comparators [https://github.com/rapidsai/cudf/pull/10577]",
+     "fixed_in" : ""
+   },
    {
      "file" : "${current_json_dir}/thrust_faster_scan_compile_times.diff",
      "issue" : "Improve Thrust scan compile times by reducing the number of kernels generated [https://github.com/rapidsai/cudf/pull/8183]",
      "fixed_in" : ""
    },
+   {
+     "file" : "${current_json_dir}/thrust_faster_scan_compile_times_cccl_2.5.0.diff",
+     "issue" : "Improve Thrust scan compile times by reducing the number of kernels generated [https://github.com/rapidsai/cudf/pull/8183]",
+     "fixed_in" : ""
+   }
  ]
}
47 changes: 47 additions & 0 deletions cpp/cmake/thirdparty/patches/revert_pr_211_cccl_2.5.0.diff
@@ -0,0 +1,47 @@
diff --git a/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h b/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h
index 046eb83c0..8047c9701 100644
--- a/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h
+++ b/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h
@@ -53,41 +53,15 @@ namespace cuda_cub

namespace __copy
{
-template <class Derived, class InputIt, class OutputIt>
-OutputIt THRUST_RUNTIME_FUNCTION device_to_device(
- execution_policy<Derived>& policy, InputIt first, InputIt last, OutputIt result, thrust::detail::true_type)
-{
- typedef typename thrust::iterator_traits<InputIt>::value_type InputTy;
- const auto n = thrust::distance(first, last);
- if (n > 0)
- {
- cudaError status;
- status = trivial_copy_device_to_device(
- policy,
- reinterpret_cast<InputTy*>(thrust::raw_pointer_cast(&*result)),
- reinterpret_cast<InputTy const*>(thrust::raw_pointer_cast(&*first)),
- n);
- cuda_cub::throw_on_error(status, "__copy:: D->D: failed");
- }
-
- return result + n;
-}

template <class Derived, class InputIt, class OutputIt>
OutputIt THRUST_RUNTIME_FUNCTION device_to_device(
- execution_policy<Derived>& policy, InputIt first, InputIt last, OutputIt result, thrust::detail::false_type)
+ execution_policy<Derived>& policy, InputIt first, InputIt last, OutputIt result)
{
typedef typename thrust::iterator_traits<InputIt>::value_type InputTy;
return cuda_cub::transform(policy, first, last, result, thrust::identity<InputTy>());
}

-template <class Derived, class InputIt, class OutputIt>
-OutputIt THRUST_RUNTIME_FUNCTION
-device_to_device(execution_policy<Derived>& policy, InputIt first, InputIt last, OutputIt result)
-{
- return device_to_device(
- policy, first, last, result, typename is_indirectly_trivially_relocatable_to<InputIt, OutputIt>::type());
-}
} // namespace __copy

} // namespace cuda_cub
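For context on what this patch restores: it deletes the dispatch through `is_indirectly_trivially_relocatable_to`, so `device_to_device` always lowers to `cuda_cub::transform` rather than a raw device memcpy; per the `cccl_override.json` entry above, the fast path caused `cudaErrorInvalidValue` failures. A minimal sketch of a call that exercises this path (hypothetical reproducer, not part of the PR):

```cuda
// Hypothetical reproducer: a device-to-device thrust::copy of a trivially
// copyable type. With the fast path, this becomes a raw memcpy; with the
// patch applied, it routes through cuda_cub::transform.
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>

int main()
{
  thrust::device_vector<int> src(1 << 20, 42);
  thrust::device_vector<int> dst(1 << 20);
  thrust::copy(thrust::device, src.begin(), src.end(), dst.begin());
  return dst[0] == 42 ? 0 : 1;  // reads back through a device reference
}
```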
25 changes: 25 additions & 0 deletions cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching_cccl_2.5.0.diff
@@ -0,0 +1,25 @@
diff --git a/thrust/thrust/system/cuda/detail/dispatch.h b/thrust/thrust/system/cuda/detail/dispatch.h
index 2a3cc4e33..8fb337b26 100644
--- a/thrust/thrust/system/cuda/detail/dispatch.h
+++ b/thrust/thrust/system/cuda/detail/dispatch.h
@@ -44,8 +44,7 @@
} \
else \
{ \
- auto THRUST_PP_CAT2(count, _fixed) = static_cast<thrust::detail::int64_t>(count); \
- status = call arguments; \
+ throw std::runtime_error("THRUST_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \
}

/**
@@ -66,9 +65,7 @@
} \
else \
{ \
- auto THRUST_PP_CAT2(count1, _fixed) = static_cast<thrust::detail::int64_t>(count1); \
- auto THRUST_PP_CAT2(count2, _fixed) = static_cast<thrust::detail::int64_t>(count2); \
- status = call arguments; \
+ throw std::runtime_error("THRUST_DOUBLE_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \
}
/**
* Dispatch between 32-bit and 64-bit index based versions of the same algorithm
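The patch replaces the 64-bit branch of Thrust's index-type dispatch with a throw, so each algorithm is compiled only for 32-bit counts. A simplified sketch of the resulting logic (`THRUST_INDEX_TYPE_DISPATCH` is really a preprocessor macro; this template is illustrative only):

```cuda
// Illustrative only: libcudf never exceeds 32-bit element counts, so the
// 64-bit instantiations the macro would otherwise emit are dead weight that
// roughly doubles the number of kernels compiled per algorithm.
#include <cstdint>
#include <limits>
#include <stdexcept>

template <typename Call>
void index_type_dispatch(std::int64_t count, Call&& call)
{
  if (count <= std::numeric_limits<std::int32_t>::max()) {
    call(static_cast<std::int32_t>(count));  // only the 32-bit path is instantiated
  } else {
    throw std::runtime_error("64-bit count is unsupported in libcudf");
  }
}
```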
39 changes: 39 additions & 0 deletions cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times_cccl_2.5.0.diff
@@ -0,0 +1,39 @@
diff --git a/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_radix_sort.cuh
index 0606485bb..dbb99ff13 100644
--- a/cub/cub/device/dispatch/dispatch_radix_sort.cuh
+++ b/cub/cub/device/dispatch/dispatch_radix_sort.cuh
@@ -1085,7 +1085,7 @@ struct DeviceRadixSortPolicy
};

/// SM60 (GP100)
- struct Policy600 : ChainedPolicy<600, Policy600, Policy500>
+ struct Policy600 : ChainedPolicy<600, Policy600, Policy600>
{
enum
{
diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh
index f39613adb..75bd16ff9 100644
--- a/cub/cub/device/dispatch/dispatch_reduce.cuh
+++ b/cub/cub/device/dispatch/dispatch_reduce.cuh
@@ -488,7 +488,7 @@ struct DeviceReducePolicy
};

/// SM60
- struct Policy600 : ChainedPolicy<600, Policy600, Policy350>
+ struct Policy600 : ChainedPolicy<600, Policy600, Policy600>
{
static constexpr int threads_per_block = 256;
static constexpr int items_per_thread = 16;
diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh
index 419908c4e..6ab0840e1 100644
--- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh
@@ -339,7 +339,7 @@ struct DeviceScanPolicy
/// SM600
struct Policy600
: DefaultTuning
- , ChainedPolicy<600, Policy600, Policy520>
+ , ChainedPolicy<600, Policy600, Policy600>
{};

/// SM800
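Each hunk retargets a CUB `Policy600` to chain to itself instead of an older policy. A rough sketch of the `ChainedPolicy` idea (simplified, not CUB's actual implementation): dispatch walks the chain down to the first policy whose PTX version the device satisfies, and a policy that names itself as its predecessor terminates the chain, so tunings for pre-SM60 architectures are never instantiated:

```cuda
// Simplified model of CUB's chained policy selection; names are illustrative.
template <int Ptx, typename Active, typename Prev>
struct chained_policy_sketch {
  template <typename F>
  static void invoke(int device_ptx, F f)
  {
    if (device_ptx >= Ptx) { f(Active{}); }
    else { Prev::invoke(device_ptx, f); }  // fall back to the older policy
  }
};

// Terminator: Active == Prev means "use this policy for everything older, too".
template <int Ptx, typename Active>
struct chained_policy_sketch<Ptx, Active, Active> {
  template <typename F>
  static void invoke(int, F f) { f(Active{}); }
};

// Mirrors the patch: Policy600 chains to itself, ending the chain at SM60.
struct policy600 : chained_policy_sketch<600, policy600, policy600> {};
```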
39 changes: 39 additions & 0 deletions cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times_cccl_2.5.0.diff
@@ -0,0 +1,39 @@
diff --git a/cub/cub/block/block_merge_sort.cuh b/cub/cub/block/block_merge_sort.cuh
index eb76ebb0b..c6c529a50 100644
--- a/cub/cub/block/block_merge_sort.cuh
+++ b/cub/cub/block/block_merge_sort.cuh
@@ -95,7 +95,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void SerialMerge(
KeyT key1 = keys_shared[keys1_beg];
KeyT key2 = keys_shared[keys2_beg];

-#pragma unroll
+#pragma unroll 1
for (int item = 0; item < ITEMS_PER_THREAD; ++item)
{
bool p = (keys2_beg < keys2_end) && ((keys1_beg >= keys1_end) || compare_op(key2, key1));
@@ -376,7 +376,7 @@ public:
//
KeyT max_key = oob_default;

-#pragma unroll
+#pragma unroll 1
for (int item = 1; item < ITEMS_PER_THREAD; ++item)
{
if (ITEMS_PER_THREAD * linear_tid + item < valid_items)
diff --git a/cub/cub/thread/thread_sort.cuh b/cub/cub/thread/thread_sort.cuh
index 7d9e8622f..da5627306 100644
--- a/cub/cub/thread/thread_sort.cuh
+++ b/cub/cub/thread/thread_sort.cuh
@@ -87,10 +87,10 @@ StableOddEvenSort(KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&items)[ITEMS_PER_THRE
{
constexpr bool KEYS_ONLY = ::cuda::std::is_same<ValueT, NullType>::value;

-#pragma unroll
+#pragma unroll 1
for (int i = 0; i < ITEMS_PER_THREAD; ++i)
{
-#pragma unroll
+#pragma unroll 1
for (int j = 1 & i; j < ITEMS_PER_THREAD - 1; j += 2)
{
if (compare_op(keys[j + 1], keys[j]))
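The entire patch is `#pragma unroll` → `#pragma unroll 1`: the sort loops stay rolled instead of emitting one copy of the body per iteration, which shrinks the generated code for heavily inlined comparators at some runtime cost. A sketch of the affected loop shape (adapted from the `StableOddEvenSort` hunk above; illustrative, not CUB code):

```cuda
// Rolled odd-even transposition sort over a per-thread array. With plain
// "#pragma unroll" both loops are fully unrolled (trip counts are known at
// compile time), replicating the inlined comparator body many times;
// "#pragma unroll 1" keeps one copy and compiles much faster.
template <int ITEMS_PER_THREAD, typename KeyT, typename CompareOp>
__device__ void odd_even_sort_sketch(KeyT (&keys)[ITEMS_PER_THREAD], CompareOp compare_op)
{
#pragma unroll 1
  for (int i = 0; i < ITEMS_PER_THREAD; ++i) {
#pragma unroll 1
    for (int j = 1 & i; j < ITEMS_PER_THREAD - 1; j += 2) {
      if (compare_op(keys[j + 1], keys[j])) {
        KeyT tmp    = keys[j];
        keys[j]     = keys[j + 1];
        keys[j + 1] = tmp;
      }
    }
  }
}
```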
9 changes: 5 additions & 4 deletions cpp/src/io/comp/statistics.cu
@@ -18,6 +18,7 @@

#include <rmm/exec_policy.hpp>

+#include <cuda/functional>
#include <thrust/transform_reduce.h>

namespace cudf::io {
@@ -32,9 +33,9 @@ writer_compression_statistics collect_compression_statistics(
    rmm::exec_policy(stream),
    results.begin(),
    results.end(),
-   [] __device__(auto& res) {
+   cuda::proclaim_return_type<size_t>([] __device__(compression_result const& res) {
      return res.status == compression_status::SUCCESS ? res.bytes_written : 0;
-   },
+   }),
    0ul,
    thrust::plus<size_t>());

@@ -47,9 +48,9 @@ writer_compression_statistics collect_compression_statistics(
    rmm::exec_policy(stream),
    zipped_begin,
    zipped_end,
-   [status] __device__(auto tup) {
+   cuda::proclaim_return_type<size_t>([status] __device__(auto tup) {
      return thrust::get<1>(tup).status == status ? thrust::get<0>(tup).size() : 0;
-   },
+   }),
    0ul,
    thrust::plus<size_t>());
};
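This hunk shows the pattern the PR applies throughout: wrap device lambdas in `cuda::proclaim_return_type` so host-side code can name the lambda's return type, which nvcc cannot always deduce for extended `__device__` lambdas. A self-contained sketch (assumes CCCL's `<cuda/functional>` and nvcc's `--extended-lambda` flag; the names below are illustrative, not from libcudf):

```cuda
// Sums device-side values with a proclaimed-return-type lambda.
// Compile with: nvcc --extended-lambda proclaim.cu
#include <cuda/functional>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>
#include <thrust/transform_reduce.h>

int main()
{
  thrust::device_vector<int> bytes_written(4, 100);
  auto const total = thrust::transform_reduce(
    thrust::device,
    bytes_written.begin(),
    bytes_written.end(),
    // Without proclaim_return_type, the extended lambda's result type is not
    // visible to this host-side call, which can fail to compile.
    cuda::proclaim_return_type<size_t>([] __device__(int n) { return static_cast<size_t>(n); }),
    size_t{0},
    thrust::plus<size_t>());
  return total == 400 ? 0 : 1;
}
```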
3 changes: 1 addition & 2 deletions cpp/src/io/orc/reader_impl_decode.cu
@@ -692,8 +692,7 @@ std::vector<range> find_table_splits(table_view const& input,
     d_sizes = d_segmented_sizes->view().begin<size_type>()] __device__(auto const segment_idx) {
      // Since the number of rows may not divisible by segment_length,
      // the last segment may be shorter than the others.
-     auto const current_length =
-       cuda::std::min(segment_length, num_rows - segment_length * segment_idx);
+     auto const current_length = min(segment_length, num_rows - segment_length * segment_idx);
      auto const size = d_sizes[segment_idx] / CHAR_BIT;  // divide by CHAR_BIT to get size in bytes
      return cumulative_size{static_cast<std::size_t>(current_length),
                             static_cast<std::size_t>(size)};
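The commit history above churns through `cuda::std::min`, `std::min`, and the unqualified `min` that landed. A sketch of the final form (assumption: inside device code the unqualified call resolves to CUDA's built-in integer `min` overload, sidestepping the `cuda::std::min` overload set that clashed with CCCL main):

```cuda
// Hypothetical standalone version of the segment-length computation above.
__device__ int segment_rows(int segment_length, int num_rows, int segment_idx)
{
  // The last segment may be shorter when num_rows % segment_length != 0;
  // unqualified min picks CUDA's built-in device overload for ints.
  return min(segment_length, num_rows - segment_length * segment_idx);
}
```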
22 changes: 14 additions & 8 deletions cpp/src/io/orc/stripe_init.cu
@@ -561,20 +561,26 @@ void __host__ ParseCompressedStripeData(CompressedStreamInfo* strm_info,
                                        uint32_t log2maxcr,
                                        rmm::cuda_stream_view stream)
{
-  dim3 dim_block(128, 1);
-  dim3 dim_grid((num_streams + 3) >> 2, 1);  // 1 stream per warp, 4 warps per block
-  gpuParseCompressedStripeData<<<dim_grid, dim_block, 0, stream.value()>>>(
-    strm_info, num_streams, compression_block_size, log2maxcr);
+  auto const num_blocks = (num_streams + 3) >> 2;  // 1 stream per warp, 4 warps per block
+  if (num_blocks > 0) {
+    dim3 dim_block(128, 1);
+    dim3 dim_grid(num_blocks, 1);
+    gpuParseCompressedStripeData<<<dim_grid, dim_block, 0, stream.value()>>>(
+      strm_info, num_streams, compression_block_size, log2maxcr);
+  }
}

void __host__ PostDecompressionReassemble(CompressedStreamInfo* strm_info,
                                          int32_t num_streams,
                                          rmm::cuda_stream_view stream)
{
-  dim3 dim_block(128, 1);
-  dim3 dim_grid((num_streams + 3) >> 2, 1);  // 1 stream per warp, 4 warps per block
-  gpuPostDecompressionReassemble<<<dim_grid, dim_block, 0, stream.value()>>>(strm_info,
-                                                                             num_streams);
+  auto const num_blocks = (num_streams + 3) >> 2;  // 1 stream per warp, 4 warps per block
+  if (num_blocks > 0) {
+    dim3 dim_block(128, 1);
+    dim3 dim_grid(num_blocks, 1);
+    gpuPostDecompressionReassemble<<<dim_grid, dim_block, 0, stream.value()>>>(strm_info,
+                                                                               num_streams);
+  }
}

void __host__ ParseRowGroupIndex(RowGroup* row_groups,
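The new guard matters because a kernel launch with a grid dimension of zero fails with `cudaErrorInvalidConfiguration`, which `num_streams == 0` would previously trigger. A minimal sketch of the same pattern (hypothetical kernel name):

```cuda
__global__ void process_streams_sketch(int num_streams) { /* ... */ }

void launch_guarded(int num_streams, cudaStream_t stream)
{
  auto const num_blocks = (num_streams + 3) >> 2;  // 1 stream per warp, 4 warps per block
  if (num_blocks > 0) {  // a zero grid dimension is an invalid launch configuration
    process_streams_sketch<<<num_blocks, 128, 0, stream>>>(num_streams);
  }
}
```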
13 changes: 8 additions & 5 deletions cpp/src/io/parquet/page_string_decode.cu
@@ -1197,14 +1197,17 @@ void ComputePageStringSizes(cudf::detail::hostdevice_span<PageInfo> pages,
  cudf::detail::join_streams(streams, stream);

  // check for needed temp space for DELTA_BYTE_ARRAY
-  auto const need_sizes = thrust::any_of(
-    rmm::exec_policy(stream), pages.device_begin(), pages.device_end(), [] __device__(auto& page) {
-      return page.temp_string_size != 0;
-    });
+  auto const need_sizes =
+    thrust::any_of(rmm::exec_policy(stream),
+                   pages.device_begin(),
+                   pages.device_end(),
+                   cuda::proclaim_return_type<bool>(
+                     [] __device__(auto& page) { return page.temp_string_size != 0; }));

  if (need_sizes) {
    // sum up all of the temp_string_sizes
-    auto const page_sizes = [] __device__(PageInfo const& page) { return page.temp_string_size; };
+    auto const page_sizes = cuda::proclaim_return_type<int64_t>(
+      [] __device__(PageInfo const& page) { return page.temp_string_size; });
    auto const total_size = thrust::transform_reduce(rmm::exec_policy(stream),
                                                     pages.device_begin(),
                                                     pages.device_end(),
4 changes: 2 additions & 2 deletions cpp/src/io/parquet/reader_impl_preprocess.cu
@@ -452,9 +452,9 @@ std::string encoding_to_string(Encoding encoding)
[[nodiscard]] std::string list_unsupported_encodings(device_span<PageInfo const> pages,
                                                     rmm::cuda_stream_view stream)
{
-  auto const to_mask = [] __device__(auto const& page) {
+  auto const to_mask = cuda::proclaim_return_type<uint32_t>([] __device__(auto const& page) {
    return is_supported_encoding(page.encoding) ? 0U : encoding_to_mask(page.encoding);
-  };
+  });
  uint32_t const unsupported = thrust::transform_reduce(
    rmm::exec_policy(stream), pages.begin(), pages.end(), to_mask, 0U, thrust::bit_or<uint32_t>());
  return encoding_bitmask_to_str(unsupported);
6 changes: 4 additions & 2 deletions cpp/src/io/utilities/data_casting.cu
@@ -34,6 +34,7 @@
#include <rmm/resource_ref.hpp>

#include <cub/cub.cuh>
+#include <cuda/functional>
#include <thrust/copy.h>
#include <thrust/functional.h>
#include <thrust/transform_reduce.h>
@@ -783,7 +784,8 @@ template <typename SymbolT>
struct to_string_view_pair {
  SymbolT const* data;
  to_string_view_pair(SymbolT const* _data) : data(_data) {}
-  __device__ auto operator()(thrust::tuple<size_type, size_type> ip)
+  __device__ thrust::pair<char const*, std::size_t> operator()(
+    thrust::tuple<size_type, size_type> ip)
  {
    return thrust::pair<char const*, std::size_t>{data + thrust::get<0>(ip),
                                                  static_cast<std::size_t>(thrust::get<1>(ip))};
@@ -805,7 +807,7 @@ static std::unique_ptr<column> parse_string(string_view_pair_it str_tuples,
    rmm::exec_policy(stream),
    str_tuples,
    str_tuples + col_size,
-   [] __device__(auto t) { return t.second; },
+   cuda::proclaim_return_type<std::size_t>([] __device__(auto t) { return t.second; }),
    size_type{0},
    thrust::maximum<size_type>{});
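The explicit return type is the workaround from commit 942291c ('Work around issue with `thrust::pair` that prevents CTAD'): the functor's `operator()` previously returned `auto`, and spelling out the `thrust::pair` type keeps it visible to host-side iterator machinery. A reduced sketch (member and parameter types simplified from the original):

```cuda
#include <thrust/pair.h>

#include <cstddef>

struct to_string_view_pair_sketch {
  char const* data;
  // Explicit return type instead of auto, mirroring the fix above.
  __device__ thrust::pair<char const*, std::size_t> operator()(int offset, int length) const
  {
    return {data + offset, static_cast<std::size_t>(length)};
  }
};
```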
2 changes: 1 addition & 1 deletion cpp/src/join/distinct_hash_join.cu
@@ -182,7 +182,7 @@ distinct_hash_join<HasNested>::inner_join(rmm::cuda_stream_view stream,
    thrust::make_transform_output_iterator(probe_indices->begin(), output_fn{});

  auto const [probe_indices_end, _] = this->_hash_table.retrieve(
-   iter, iter + probe_table_num_rows, probe_indices_begin, build_indices_begin, stream.value());
+   iter, iter + probe_table_num_rows, probe_indices_begin, build_indices_begin, {stream.value()});

  auto const actual_size = std::distance(probe_indices_begin, probe_indices_end);
  build_indices->resize(actual_size, stream);
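The added braces construct the stream-reference type that cuco's `retrieve()` now expects (commit f122905, 'pass cuco::cuda_stream_ref') from rmm's raw `cudaStream_t`. An illustration with a hypothetical stand-in type, since cuco's actual signature is not shown in this diff:

```cuda
#include <cuda_runtime_api.h>

// Hypothetical stand-in for cuco::cuda_stream_ref: implicitly constructible
// from a raw stream, so a braced argument builds it in place at the call site.
struct stream_ref_sketch {
  cudaStream_t value;
  constexpr stream_ref_sketch(cudaStream_t s) noexcept : value{s} {}
};

void retrieve_sketch(stream_ref_sketch) {}  // analogous to the patched API

void caller(cudaStream_t raw) { retrieve_sketch({raw}); }
```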
4 changes: 2 additions & 2 deletions cpp/src/strings/split/split_re.cu
@@ -219,9 +219,9 @@ std::unique_ptr<table> split_re(strings_column_view const& input,
    rmm::exec_policy(stream),
    thrust::make_counting_iterator<size_type>(0),
    thrust::make_counting_iterator<size_type>(strings_count),
-   [d_offsets] __device__(auto const idx) -> size_type {
+   cuda::proclaim_return_type<size_type>([d_offsets] __device__(auto const idx) -> size_type {
      return static_cast<size_type>(d_offsets[idx + 1] - d_offsets[idx]);
-   },
+   }),
    0,
    thrust::maximum<size_type>{});
1 change: 0 additions & 1 deletion cpp/tests/hash_map/map_test.cu
@@ -69,7 +69,6 @@ struct InsertTest : public cudf::test::BaseFixture {

using TestTypes = ::testing::Types<key_value_types<int32_t, int32_t>,
                                   key_value_types<int64_t, int64_t>,
-                                  key_value_types<int16_t, int16_t>,
                                   key_value_types<int32_t, float>,
                                   key_value_types<int64_t, double>>;
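The dropped `int16_t` case follows from the commit message: `cuda::atomic_ref` only supports types with hardware atomics, i.e. 4- and 8-byte trivially copyable types (at least as of CCCL 2.x). A sketch making the constraint explicit (hypothetical helper, assuming libcu++'s `<cuda/atomic>`):

```cuda
#include <cuda/atomic>

template <typename Key>
__device__ void atomic_store_key(Key* slot, Key key)
{
  // cuda::atomic_ref rejects 2-byte keys such as int16_t.
  static_assert(sizeof(Key) == 4 || sizeof(Key) == 8,
                "cuda::atomic_ref supports only 4- and 8-byte types");
  cuda::atomic_ref<Key, cuda::thread_scope_device> ref{*slot};
  ref.store(key, cuda::std::memory_order_relaxed);
}
```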
3 changes: 3 additions & 0 deletions rapids_config.cmake
@@ -26,6 +26,9 @@ else()
  )
endif()

+set(rapids-cmake-repo trxcllnt/rapids-cmake)
+set(rapids-cmake-branch fea/cccl-2.5)
+
if(NOT EXISTS "${CMAKE_CURRENT_BINARY_DIR}/CUDF_RAPIDS-${RAPIDS_VERSION_MAJOR_MINOR}.cmake")
  file(
    DOWNLOAD