Reimplement cudf::merge for nested types without using comparators (#14250)

Part of #11844.

This PR also uses new experimental comparators for non-nested types by introducing a new device constructor for `cudf::experimental::row::lexicographic::device_row_comparator`. In the case of non-nested types, preprocessing can be skipped so comparators can be created on the fly. This solution helps us avoid creating 3 comparator types because `thrust::merge` can call the operator with indices from either side of the table.

Furthermore, the PR reworks `cudf/detail/merge.cuh` by removing any CUDA headers/components to expose a true detail API of the form `cudf/detail/merge.hpp`.

Benchmark comparison for non-nested types: see the comment on #14250.

Compilation time increases from ~6 mins to ~7 mins.

Authors:
  - Divye Gala (https://github.com/divyegala)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - MithunR (https://github.com/mythrocks)

URL: #14250
rapidsai · Oct 28, 2023 · 2bc454a · 2bc454a
1 parent 751370e
commit 2bc454a
Show file tree

Hide file tree

Showing 17 changed files with 525 additions and 186 deletions.
diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
@@ -230,6 +230,7 @@ ConfigureNVBench(HASHING_NVBENCH hashing/hash.cpp)
 # ##################################################################################################
 # * merge benchmark -------------------------------------------------------------------------------
 ConfigureBench(MERGE_BENCH merge/merge.cpp)
+ConfigureNVBench(MERGE_NVBENCH merge/merge_structs.cpp merge/merge_lists.cpp)
 
 # ##################################################################################################
 # * null_mask benchmark ---------------------------------------------------------------------------

diff --git a/cpp/benchmarks/sort/nested_types_common.hpp b/cpp/benchmarks/common/generate_nested_types.hpp
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include <benchmarks/common/generate_input.hpp>
+#include "generate_input.hpp"
 
 #include <cudf_test/column_wrapper.hpp>
 

diff --git a/cpp/benchmarks/merge/merge_lists.cpp b/cpp/benchmarks/merge/merge_lists.cpp
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_nested_types.hpp>
+
+#include <cudf/detail/merge.hpp>
+#include <cudf/detail/sorting.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+void nvbench_merge_list(nvbench::state& state)
+{
+  rmm::cuda_stream_view stream;
+
+  auto const input1 = create_lists_data(state);
+  auto const sorted_input1 =
+    cudf::detail::sort(*input1, {}, {}, stream, rmm::mr::get_current_device_resource());
+
+  auto const input2 = create_lists_data(state);
+  auto const sorted_input2 =
+    cudf::detail::sort(*input2, {}, {}, stream, rmm::mr::get_current_device_resource());
+
+  stream.synchronize();
+
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    rmm::cuda_stream_view stream_view{launch.get_stream()};
+
+    cudf::detail::merge({*sorted_input1, *sorted_input2},
+                        {0},
+                        {cudf::order::ASCENDING},
+                        {},
+                        stream_view,
+                        rmm::mr::get_current_device_resource());
+  });
+}
+
+NVBENCH_BENCH(nvbench_merge_list)
+  .set_name("merge_lists")
+  .add_int64_power_of_two_axis("size_bytes", {10, 18, 24, 28})
+  .add_int64_axis("depth", {1, 4})
+  .add_float64_axis("null_frequency", {0, 0.2});
diff --git a/cpp/benchmarks/merge/merge_structs.cpp b/cpp/benchmarks/merge/merge_structs.cpp
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_nested_types.hpp>
+
+#include <cudf/detail/merge.hpp>
+#include <cudf/detail/sorting.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+void nvbench_merge_struct(nvbench::state& state)
+{
+  rmm::cuda_stream_view stream;
+
+  auto const input1 = create_structs_data(state);
+  auto const sorted_input1 =
+    cudf::detail::sort(*input1, {}, {}, stream, rmm::mr::get_current_device_resource());
+
+  auto const input2 = create_structs_data(state);
+  auto const sorted_input2 =
+    cudf::detail::sort(*input2, {}, {}, stream, rmm::mr::get_current_device_resource());
+
+  stream.synchronize();
+
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    rmm::cuda_stream_view stream_view{launch.get_stream()};
+
+    cudf::detail::merge({*sorted_input1, *sorted_input2},
+                        {0},
+                        {cudf::order::ASCENDING},
+                        {},
+                        stream_view,
+                        rmm::mr::get_current_device_resource());
+  });
+}
+
+NVBENCH_BENCH(nvbench_merge_struct)
+  .set_name("merge_struct")
+  .add_int64_power_of_two_axis("NumRows", {10, 18, 26})
+  .add_int64_axis("Depth", {0, 1, 8})
+  .add_int64_axis("Nulls", {0, 1});
diff --git a/cpp/benchmarks/sort/rank_lists.cpp b/cpp/benchmarks/sort/rank_lists.cpp
@@ -14,9 +14,10 @@
  * limitations under the License.
  */
 
-#include "nested_types_common.hpp"
 #include "rank_types_common.hpp"
 
+#include <benchmarks/common/generate_nested_types.hpp>
+
 #include <cudf/sorting.hpp>
 
 #include <cudf_test/column_utilities.hpp>

diff --git a/cpp/benchmarks/sort/rank_structs.cpp b/cpp/benchmarks/sort/rank_structs.cpp
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#include "nested_types_common.hpp"
 #include "rank_types_common.hpp"
+#include <benchmarks/common/generate_nested_types.hpp>
 
 #include <cudf/sorting.hpp>
 

diff --git a/cpp/benchmarks/sort/sort_lists.cpp b/cpp/benchmarks/sort/sort_lists.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "nested_types_common.hpp"
+#include <benchmarks/common/generate_nested_types.hpp>
 
 #include <cudf/detail/sorting.hpp>
 

diff --git a/cpp/benchmarks/sort/sort_structs.cpp b/cpp/benchmarks/sort/sort_structs.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "nested_types_common.hpp"
+#include <benchmarks/common/generate_nested_types.hpp>
 
 #include <cudf/detail/sorting.hpp>
 

diff --git a/cpp/include/cudf/detail/merge.cuh b/cpp/include/cudf/detail/merge.cuh
diff --git a/cpp/include/cudf/detail/merge.hpp b/cpp/include/cudf/detail/merge.hpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <rmm/device_uvector.hpp>
+
+#include <thrust/pair.h>
+
+namespace cudf {
+namespace detail {
+
+/**
+ * @brief Source table identifier to copy data from.
+ */
+enum class side : bool { LEFT, RIGHT };
+
+/**
+ * @brief Tagged index type: `thrust::get<0>` indicates left/right side,
+ * `thrust::get<1>` indicates the row index
+ */
+using index_type = thrust::pair<side, cudf::size_type>;
+
+/**
+ * @brief Vector of `index_type` values.
+ */
+using index_vector = rmm::device_uvector<index_type>;
+
+/**
+ * @copydoc std::unique_ptr<cudf::table> merge(
+ *            std::vector<table_view> const& tables_to_merge,
+ *            std::vector<cudf::size_type> const& key_cols,
+ *            std::vector<cudf::order> const& column_order,
+ *            std::vector<cudf::null_order> const& null_precedence,
+ *            rmm::mr::device_memory_resource* mr)
+ *
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ */
+std::unique_ptr<cudf::table> merge(std::vector<table_view> const& tables_to_merge,
+                                   std::vector<cudf::size_type> const& key_cols,
+                                   std::vector<cudf::order> const& column_order,
+                                   std::vector<cudf::null_order> const& null_precedence,
+                                   rmm::cuda_stream_view stream,
+                                   rmm::mr::device_memory_resource* mr);
+
+}  // namespace detail
+}  // namespace cudf