
Commit

Merge branch 'branch-24.08' of github.com:rapidsai/cudf into pylibcudf-io-writers
lithomas1 committed Jun 29, 2024
2 parents e940e30 + 3c3edfe commit e57a677
Showing 87 changed files with 2,899 additions and 1,573 deletions.
13 changes: 6 additions & 7 deletions CONTRIBUTING.md
@@ -71,15 +71,14 @@ for a minimal build of libcudf without using conda are also listed below.

Compilers:

* `gcc` version 9.3+
* `nvcc` version 11.5+
* `cmake` version 3.26.4+
* `gcc` version 11.4+
* `nvcc` version 11.8+
* `cmake` version 3.29.6+

CUDA/GPU:
CUDA/GPU Runtime:

* CUDA 11.5+
* NVIDIA driver 450.80.02+
* Volta architecture or better (Compute Capability >=7.0)
* CUDA 11.4+
* Volta architecture or better ([Compute Capability](https://docs.nvidia.com/deploy/cuda-compatibility/) >=7.0)

You can obtain CUDA from
[https://developer.nvidia.com/cuda-downloads](https://developer.nvidia.com/cuda-downloads).
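
To verify that a GPU meets the Compute Capability requirement above, here is a minimal sketch (not part of CONTRIBUTING.md) that queries the device with the CUDA runtime API:

```c++
#include <cuda_runtime_api.h>

#include <cstdio>

int main()
{
  cudaDeviceProp prop{};
  if (cudaGetDeviceProperties(&prop, 0) != cudaSuccess) {
    std::printf("No CUDA-capable device detected\n");
    return 1;
  }
  std::printf("%s: compute capability %d.%d\n", prop.name, prop.major, prop.minor);
  return prop.major >= 7 ? 0 : 1;  // libcudf requires Volta (7.0) or newer
}
```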
50 changes: 42 additions & 8 deletions cpp/benchmarks/io/parquet/parquet_reader_input.cpp
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION.
* Copyright (c) 2022-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -59,20 +59,18 @@ void parquet_read_common(cudf::size_type num_rows_to_read,
}

template <data_type DataType>
void BM_parquet_read_data(nvbench::state& state, nvbench::type_list<nvbench::enum_type<DataType>>)
void BM_parquet_read_data_common(nvbench::state& state,
data_profile const& profile,
nvbench::type_list<nvbench::enum_type<DataType>>)
{
auto const d_type = get_type_or_group(static_cast<int32_t>(DataType));
auto const cardinality = static_cast<cudf::size_type>(state.get_int64("cardinality"));
auto const run_length = static_cast<cudf::size_type>(state.get_int64("run_length"));
auto const source_type = retrieve_io_type_enum(state.get_string("io_type"));
auto const compression = cudf::io::compression_type::SNAPPY;
cuio_source_sink_pair source_sink(source_type);

auto const num_rows_written = [&]() {
auto const tbl = create_random_table(
cycle_dtypes(d_type, num_cols),
table_size_bytes{data_size},
data_profile_builder().cardinality(cardinality).avg_run_length(run_length));
auto const tbl =
create_random_table(cycle_dtypes(d_type, num_cols), table_size_bytes{data_size}, profile);
auto const view = tbl->view();

cudf::io::parquet_writer_options write_opts =
@@ -85,6 +83,32 @@ void BM_parquet_read_data(nvbench::state& state, nvbench::type_list<nvbench::enu
parquet_read_common(num_rows_written, num_cols, source_sink, state);
}

template <data_type DataType>
void BM_parquet_read_data(nvbench::state& state,
nvbench::type_list<nvbench::enum_type<DataType>> type_list)
{
auto const cardinality = static_cast<cudf::size_type>(state.get_int64("cardinality"));
auto const run_length = static_cast<cudf::size_type>(state.get_int64("run_length"));
BM_parquet_read_data_common<DataType>(
state, data_profile_builder().cardinality(cardinality).avg_run_length(run_length), type_list);
}

template <data_type DataType>
void BM_parquet_read_fixed_width_struct(nvbench::state& state,
nvbench::type_list<nvbench::enum_type<DataType>> type_list)
{
auto const cardinality = static_cast<cudf::size_type>(state.get_int64("cardinality"));
auto const run_length = static_cast<cudf::size_type>(state.get_int64("run_length"));
std::vector<cudf::type_id> s_types{
cudf::type_id::INT32, cudf::type_id::FLOAT32, cudf::type_id::INT64};
BM_parquet_read_data_common<DataType>(state,
data_profile_builder()
.cardinality(cardinality)
.avg_run_length(run_length)
.struct_types(s_types),
type_list);
}

void BM_parquet_read_io_compression(nvbench::state& state)
{
auto const d_type = get_type_or_group({static_cast<int32_t>(data_type::INTEGRAL),
@@ -247,3 +271,13 @@ NVBENCH_BENCH(BM_parquet_read_io_small_mixed)
.add_int64_axis("cardinality", {0, 1000})
.add_int64_axis("run_length", {1, 32})
.add_int64_axis("num_string_cols", {1, 2, 3});

// a benchmark for structs that only contain fixed-width types
using d_type_list_struct_only = nvbench::enum_type_list<data_type::STRUCT>;
NVBENCH_BENCH_TYPES(BM_parquet_read_fixed_width_struct, NVBENCH_TYPE_AXES(d_type_list_struct_only))
.set_name("parquet_read_fixed_width_struct")
.set_type_axes_names({"data_type"})
.add_string_axis("io_type", {"DEVICE_BUFFER"})
.set_min_samples(4)
.add_int64_axis("cardinality", {0, 1000})
.add_int64_axis("run_length", {1, 32});
60 changes: 34 additions & 26 deletions cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md
@@ -1,4 +1,4 @@
# libcudf C++ Developer Guide
# libcudf C++ Developer Guide {#DEVELOPER_GUIDE}

This document serves as a guide for contributors to libcudf C++ code. Developers should also refer
to these additional files for further documentation of libcudf best practices.
@@ -469,7 +469,7 @@ libcudf throws under different circumstances, see the [section on error handling

# libcudf API and Implementation

## Streams
## Streams {#streams}

libcudf is in the process of adding support for asynchronous execution using
CUDA streams. In order to facilitate the usage of streams, all new libcudf APIs
@@ -486,33 +486,37 @@ use only asynchronous versions of CUDA APIs with the stream parameter.

In order to make the `detail` API callable from other libcudf functions, it should be exposed in a
header placed in the `cudf/cpp/include/detail/` directory.
The declaration is not necessary if no other libcudf functions call the `detail` function.

For example:

```c++
// cpp/include/cudf/header.hpp
void external_function(...);
void external_function(...,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());

// cpp/include/cudf/detail/header.hpp
namespace detail{
void external_function(..., rmm::cuda_stream_view stream)
void external_function(..., rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)
} // namespace detail

// cudf/src/implementation.cpp
namespace detail{
// Use the stream parameter in the detail implementation.
void external_function(..., rmm::cuda_stream_view stream){
// Implementation uses the stream with async APIs.
rmm::device_buffer buff(...,stream);
CUDF_CUDA_TRY(cudaMemcpyAsync(...,stream.value()));
kernel<<<..., stream>>>(...);
thrust::algorithm(rmm::exec_policy(stream), ...);
}
// Use the stream parameter in the detail implementation.
void external_function(..., rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr){
// Implementation uses the stream with async APIs.
rmm::device_buffer buff(..., stream, mr);
CUDF_CUDA_TRY(cudaMemcpyAsync(...,stream.value()));
kernel<<<..., stream>>>(...);
thrust::algorithm(rmm::exec_policy(stream), ...);
}
} // namespace detail

void external_function(...){
CUDF_FUNC_RANGE(); // Generates an NVTX range for the lifetime of this function.
detail::external_function(..., cudf::get_default_stream());
void external_function(..., rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE(); // Generates an NVTX range for the lifetime of this function.
detail::external_function(..., stream, mr);
}
```
@@ -703,28 +707,28 @@ The preferred style for how inputs are passed in and outputs are returned is the
- `column_view const&`
- Tables:
- `table_view const&`
- Scalar:
- `scalar const&`
- Everything else:
- Trivial or inexpensively copied types
- Pass by value
- Non-trivial or expensive to copy types
- Pass by `const&`
- Scalar:
- `scalar const&`
- Everything else:
- Trivial or inexpensively copied types
- Pass by value
- Non-trivial or expensive to copy types
- Pass by `const&`
- In/Outs
- Columns:
- `mutable_column_view&`
- Tables:
- `mutable_table_view&`
- Everything else:
- Pass by raw pointer
- Everything else:
- Pass by raw pointer
- Outputs
- Outputs should be *returned*, i.e., no output parameters
- Columns:
- `std::unique_ptr<column>`
- Tables:
- `std::unique_ptr<table>`
- Scalars:
- `std::unique_ptr<scalar>`
- Scalars:
- `std::unique_ptr<scalar>`
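
A hedged sketch (hypothetical function name, not part of libcudf) of a declaration following these conventions: column and scalar inputs by `const&`, trivially copyable values by value, and the result returned rather than passed as an output parameter.

```c++
#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/types.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/resource_ref.hpp>

#include <memory>

std::unique_ptr<cudf::column> fill_nulls_example(  // output is returned
  cudf::column_view const& input,                  // column input: view by const&
  cudf::scalar const& replacement,                 // scalar input: const&
  cudf::size_type first_n,                         // trivially copyable: by value
  rmm::cuda_stream_view stream,
  rmm::device_async_resource_ref mr);
```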


### Multiple Return Values
@@ -908,6 +912,10 @@ functions that are specific to columns of Strings. These functions reside in the
namespace. Similarly, functionality used exclusively for unit testing is in the `cudf::test::`
namespace.

The public function is expected to contain a call to `CUDF_FUNC_RANGE()` followed by a call to
a `detail` function with the same name and parameters as the public function.
See the [Streams](#streams) section for an example of this pattern.

### Internal

Many functions are not meant for public use, so place them in either the `detail` or an *anonymous*
50 changes: 50 additions & 0 deletions cpp/include/cudf/binaryop.hpp
@@ -91,6 +91,56 @@ enum class binary_operator : int32_t {
///< (null, false) is null, and (valid, valid) == LOGICAL_OR(valid, valid)
INVALID_BINARY ///< invalid operation
};

/// Binary operation common type default
template <typename L, typename R, typename = void>
struct binary_op_common_type {};

/// Binary operation common type specialization
template <typename L, typename R>
struct binary_op_common_type<L, R, std::enable_if_t<has_common_type_v<L, R>>> {
/// The common type of the template parameters
using type = std::common_type_t<L, R>;
};

/// Binary operation common type specialization
template <typename L, typename R>
struct binary_op_common_type<
L,
R,
std::enable_if_t<is_fixed_point<L>() && cuda::std::is_floating_point_v<R>>> {
/// The common type of the template parameters
using type = L;
};

/// Binary operation common type specialization
template <typename L, typename R>
struct binary_op_common_type<
L,
R,
std::enable_if_t<is_fixed_point<R>() && cuda::std::is_floating_point_v<L>>> {
/// The common type of the template parameters
using type = R;
};

/// Binary operation common type helper
template <typename L, typename R>
using binary_op_common_type_t = typename binary_op_common_type<L, R>::type;

namespace detail {
template <typename AlwaysVoid, typename L, typename R>
struct binary_op_has_common_type_impl : std::false_type {};

template <typename L, typename R>
struct binary_op_has_common_type_impl<std::void_t<binary_op_common_type_t<L, R>>, L, R>
: std::true_type {};
} // namespace detail

/// Checks if binary operation types have a common type
template <typename L, typename R>
constexpr inline bool binary_op_has_common_type_v =
detail::binary_op_has_common_type_impl<void, L, R>::value;

/**
* @brief Performs a binary operation between a scalar and a column.
*
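A minimal sketch (not part of the diff) showing what the new `binary_op_common_type` trait and `binary_op_has_common_type_v` helper resolve to; `numeric::decimal32` stands in here for any libcudf fixed-point type:

```c++
#include <cudf/binaryop.hpp>
#include <cudf/fixed_point/fixed_point.hpp>

#include <cstdint>
#include <type_traits>

// Arithmetic types fall back to std::common_type.
static_assert(std::is_same_v<cudf::binary_op_common_type_t<int32_t, int64_t>, int64_t>);
static_assert(cudf::binary_op_has_common_type_v<float, double>);

// Mixing fixed_point with floating point resolves to the fixed_point side.
static_assert(
  std::is_same_v<cudf::binary_op_common_type_t<numeric::decimal32, float>, numeric::decimal32>);
```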
6 changes: 3 additions & 3 deletions cpp/include/cudf/detail/copy_if.cuh
@@ -18,7 +18,7 @@

#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/copy.hpp>
#include <cudf/copying.hpp>
#include <cudf/detail/gather.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/utilities/cuda.cuh>
@@ -242,8 +242,8 @@ struct scatter_gather_functor {
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
auto output_column = cudf::detail::allocate_like(
input, output_size, cudf::mask_allocation_policy::RETAIN, stream, mr);
auto output_column =
cudf::allocate_like(input, output_size, cudf::mask_allocation_policy::RETAIN, stream, mr);
auto output = output_column->mutable_view();

bool has_valid = input.nullable();
22 changes: 21 additions & 1 deletion cpp/include/cudf/detail/distinct_hash_join.cuh
@@ -42,6 +42,9 @@ template <typename Equal>
struct comparator_adapter {
comparator_adapter(Equal const& d_equal) : _d_equal{d_equal} {}

// suppress "function was declared but never referenced warning"
#pragma nv_diagnostic push
#pragma nv_diag_suppress 177
__device__ constexpr auto operator()(
cuco::pair<hash_value_type, lhs_index_type> const&,
cuco::pair<hash_value_type, lhs_index_type> const&) const noexcept
@@ -50,6 +53,14 @@ struct comparator_adapter {
return false;
}

__device__ constexpr auto operator()(
cuco::pair<hash_value_type, rhs_index_type> const&,
cuco::pair<hash_value_type, rhs_index_type> const&) const noexcept
{
// All build table keys are distinct thus `false` no matter what
return false;
}

__device__ constexpr auto operator()(
cuco::pair<hash_value_type, lhs_index_type> const& lhs,
cuco::pair<hash_value_type, rhs_index_type> const& rhs) const noexcept
@@ -58,6 +69,15 @@
return _d_equal(lhs.second, rhs.second);
}

__device__ constexpr auto operator()(
cuco::pair<hash_value_type, rhs_index_type> const& lhs,
cuco::pair<hash_value_type, lhs_index_type> const& rhs) const noexcept
{
if (lhs.first != rhs.first) { return false; }
return _d_equal(lhs.second, rhs.second);
}
#pragma nv_diagnostic pop

private:
Equal _d_equal;
};
@@ -94,7 +114,7 @@ struct distinct_hash_join {
using cuco_storage_type = cuco::storage<1>;

/// Hash table type
using hash_table_type = cuco::static_set<cuco::pair<hash_value_type, lhs_index_type>,
using hash_table_type = cuco::static_set<cuco::pair<hash_value_type, rhs_index_type>,
cuco::extent<size_type>,
cuda::thread_scope_device,
comparator_adapter<d_equal_type>,
13 changes: 6 additions & 7 deletions cpp/include/cudf/detail/gather.cuh
@@ -15,7 +15,7 @@
*/
#pragma once

#include <cudf/detail/copy.hpp>
#include <cudf/copying.hpp>
#include <cudf/detail/indexalator.cuh>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/utilities/assert.cuh>
@@ -217,10 +217,9 @@ struct column_gatherer_impl<Element, std::enable_if_t<is_rep_layout_compatible<E
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
auto const num_rows = cudf::distance(gather_map_begin, gather_map_end);
auto const policy = cudf::mask_allocation_policy::NEVER;
auto destination_column =
cudf::detail::allocate_like(source_column, num_rows, policy, stream, mr);
auto const num_rows = cudf::distance(gather_map_begin, gather_map_end);
auto const policy = cudf::mask_allocation_policy::NEVER;
auto destination_column = cudf::allocate_like(source_column, num_rows, policy, stream, mr);

gather_helper(source_column.data<Element>(),
source_column.size(),
@@ -413,8 +412,8 @@ struct column_gatherer_impl<dictionary32> {
auto keys_copy = std::make_unique<column>(dictionary.keys(), stream, mr);
// Perform gather on just the indices
column_view indices = dictionary.get_indices_annotated();
auto new_indices = cudf::detail::allocate_like(
indices, output_count, cudf::mask_allocation_policy::NEVER, stream, mr);
auto new_indices =
cudf::allocate_like(indices, output_count, cudf::mask_allocation_policy::NEVER, stream, mr);
gather_helper(
cudf::detail::indexalator_factory::make_input_iterator(indices),
indices.size(),
2 changes: 0 additions & 2 deletions cpp/include/cudf/detail/stream_compaction.hpp
@@ -88,8 +88,6 @@ std::unique_ptr<table> distinct(table_view const& input,

/**
* @copydoc cudf::stable_distinct
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
std::unique_ptr<table> stable_distinct(table_view const& input,
std::vector<size_type> const& keys,