Add a benchmark

ttnghia · May 25, 2022 · 031302b · 031302b
1 parent 7bd714d
commit 031302b
Showing 1 changed file with 114 additions and 53 deletions.
diff --git a/cpp/benchmarks/groupby/group_struct.cu b/cpp/benchmarks/groupby/group_struct.cu
@@ -18,78 +18,139 @@
 #include <benchmarks/fixture/benchmark_fixture.hpp>
 #include <benchmarks/synchronization/synchronization.hpp>
 
-#include <cudf/aggregation.hpp>
-#include <cudf/column/column_factories.hpp>
-#include <cudf/groupby.hpp>
-#include <cudf/structs/structs_column_view.hpp>
+#include <cudf/types.hpp>
 
-static constexpr cudf::size_type num_struct_members = 8;
-static constexpr cudf::size_type max_int            = 100;
-static constexpr cudf::size_type max_str_length     = 32;
+#include <thrust/binary_search.h>
+#include <thrust/iterator/constant_iterator.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/iterator/transform_output_iterator.h>
+#include <thrust/scan.h>
+#include <thrust/scatter.h>
+#include <thrust/uninitialized_fill.h>
 
-static auto create_data_table(cudf::size_type n_rows)
+#include <rmm/device_uvector.hpp>
+#include <rmm/exec_policy.hpp>
+
+//==================================================================================================
+// Build random group offsets for the labeling benchmarks.
+//
+// Draws `n_groups` INT32 values from a uniform distribution parameterized with 0 and `max_int`,
+// then replaces them in place with their exclusive prefix sum so that they become offsets.
+// The last offset is copied back to the host and returned as the number of labels; the last
+// drawn size therefore never contributes to that count.
+//
+// `n_groups` must be at least 1, because the last offset is read back unconditionally.
+// The scan and the device-to-host copy are both ordered on `stream`, which is synchronized
+// before returning.
+// NOTE(review): `create_random_table` is not handed `stream` here - confirm its output is ready
+// for work issued on `stream`.
+//
+// Returns a pair of (offsets column, number of labels).
+auto create_offsets(cudf::size_type n_groups, rmm::cuda_stream_view stream)
 {
-  data_profile table_profile;
+  // This is the maximum size of each group.
+  constexpr cudf::size_type max_int = 1000;
+
+  auto table_profile = data_profile{};
   table_profile.set_distribution_params(cudf::type_id::INT32, distribution_id::UNIFORM, 0, max_int);
-  table_profile.set_distribution_params(
-    cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length);
-
-  // The first two struct members are int32 and string.
-  // The first column is also used as keys in groupby.
-  // The subsequent struct members are int32 and string again.
-  return create_random_table(
-    cycle_dtypes({cudf::type_id::INT32, cudf::type_id::STRING}, num_struct_members),
-    row_count{n_rows},
-    table_profile);
+  auto sizes =
+    std::move(create_random_table({cudf::type_id::INT32}, row_count{n_groups}, table_profile)
+                ->release()
+                .front());
+  auto const sizes_view = sizes->mutable_view();
+
+  // Run the scan on `stream` so that it is ordered before the copy below, which uses `stream`.
+  thrust::exclusive_scan(rmm::exec_policy(stream),
+                         sizes_view.template begin<cudf::size_type>(),
+                         sizes_view.template end<cudf::size_type>(),
+                         sizes_view.template begin<cudf::size_type>());
+
+  // Read the last offset back to the host; it is used as the number of labels.
+  cudf::size_type n_elements;
+  CUDF_CUDA_TRY(cudaMemcpyAsync(&n_elements,
+                                sizes_view.template end<cudf::size_type>() - 1,
+                                sizeof(cudf::size_type),
+                                cudaMemcpyDeviceToHost,
+                                stream.value()));
+  // The copy is asynchronous: wait for it before `n_elements` is used on the host.
+  stream.synchronize();
+
+  return std::pair{std::move(sizes), n_elements};
+}
+
+//==================================================================================================
+template <typename InputIterator, typename OutputIterator>
+void old_way(InputIterator offsets_begin,   // first offset
+             InputIterator offsets_end,     // one past the last offset
+             OutputIterator out_begin,      // start of the label output range
+             OutputIterator out_end,        // end of the label output range
+             rmm::cuda_stream_view stream)  // stream used by every thrust call below
+{  // Labels via fill + scatter + scan: zero the output, write 1 at offset positions, prefix-sum.
+  thrust::uninitialized_fill(rmm::exec_policy(stream), out_begin, out_end, cudf::size_type{0});
+  thrust::scatter(  // writes a 1 to out_begin[offset] for each offset read from the map
+    rmm::exec_policy(stream),
+    thrust::make_constant_iterator(1, 1),  // start of the ones: value 1, index 1
+    thrust::make_constant_iterator(  // end of the ones: index (#offsets - 1), so (#offsets - 2) ones
+      1, static_cast<cudf::size_type>(thrust::distance(offsets_begin, offsets_end)) - 1),
+    offsets_begin + 1,  // map starts at the second offset; the first and last offsets are skipped
+    out_begin);
+  thrust::inclusive_scan(rmm::exec_policy(stream), out_begin, out_end, out_begin);  // in place
 }
 
-// Max aggregation/scan technically has the same performance as min.
-template <typename OpType>
-void BM_groupby_min_struct(benchmark::State& state)
+//==================================================================================================
+template <typename InputIterator, typename OutputIterator>
+void new_way(InputIterator offsets_begin,   // first offset
+             InputIterator offsets_end,     // one past the last offset
+             OutputIterator out_begin,      // start of the label output range
+             OutputIterator out_end,        // end of the label output range
+             rmm::cuda_stream_view stream)  // stream the search below runs on
 {
-  auto const n_rows = static_cast<cudf::size_type>(state.range(0));
-  auto data_cols    = create_data_table(n_rows)->release();
+  auto const zero_normalized_offsets = thrust::make_transform_iterator(  // offset - first offset
+    offsets_begin, [offsets_begin] __device__(auto const idx) { return idx - *offsets_begin; });
+
+  // The output labels from `upper_bound` will start from `1`.
+  // This will shift the result values back to start from `0`.
+  using OutputType  = typename thrust::iterator_value<OutputIterator>::type;  // label value type
+  auto const output = thrust::make_transform_output_iterator(  // subtracts 1 before each store
+    out_begin, [] __device__(auto const idx) { return idx - OutputType{1}; });
 
-  auto const keys_view = data_cols.front()->view();
-  auto const values =
-    cudf::make_structs_column(keys_view.size(), std::move(data_cols), 0, rmm::device_buffer());
+  thrust::upper_bound(rmm::exec_policy(stream),  // one search result per searched value
+                      zero_normalized_offsets,   // range searched: begin
+                      zero_normalized_offsets + thrust::distance(offsets_begin, offsets_end),
+                      thrust::make_counting_iterator<OutputType>(0),  // values searched: 0, 1, ...
+                      thrust::make_counting_iterator<OutputType>(     // ... up to the output length
+                        static_cast<OutputType>(thrust::distance(out_begin, out_end))),
+                      output);  // results are written through the minus-one adapter
+}
 
-  using RequestType = std::conditional_t<std::is_same_v<OpType, cudf::groupby_aggregation>,
-                                         cudf::groupby::aggregation_request,
-                                         cudf::groupby::scan_request>;
+//==================================================================================================
+template <bool use_old>  // true: time old_way; false: time new_way
+void BM_labeling(benchmark::State& state)  // benchmark body shared by both labeling variants
+{
+  auto const n_groups = static_cast<cudf::size_type>(state.range(0));  // first range argument
+  auto const stream   = rmm::cuda_stream_default;  // passed to every call below
 
-  auto gb_obj   = cudf::groupby::groupby(cudf::table_view({keys_view}));
-  auto requests = std::vector<RequestType>();
-  requests.emplace_back(RequestType());
-  requests.front().values = values->view();
-  requests.front().aggregations.push_back(cudf::make_min_aggregation<OpType>());
+  auto const [offsets, n_labels] = create_offsets(n_groups, stream);  // input offsets + label count
+  auto const offsets_view        = offsets->view();
+  auto labels                    = rmm::device_uvector<cudf::size_type>(n_labels, stream);
 
   for (auto _ : state) {
     [[maybe_unused]] auto const timer = cuda_event_timer(state, true);
-    if constexpr (std::is_same_v<OpType, cudf::groupby_aggregation>) {
-      [[maybe_unused]] auto const result = gb_obj.aggregate(requests);
+    if constexpr (use_old) {  // variant is selected at compile time
+      old_way(offsets_view.template begin<cudf::size_type>(),
+              offsets_view.template end<cudf::size_type>(),
+              labels.begin(),  // the same label buffer is overwritten on every iteration
+              labels.end(),
+              stream);
     } else {
-      [[maybe_unused]] auto const result = gb_obj.scan(requests);
+      new_way(offsets_view.template begin<cudf::size_type>(),
+              offsets_view.template end<cudf::size_type>(),
+              labels.begin(),  // the same label buffer is overwritten on every iteration
+              labels.end(),
+              stream);
     }
   }
 }
 
-class Groupby : public cudf::benchmark {
+//==================================================================================================
+class Labeling : public cudf::benchmark {  // fixture the labeling benchmarks are registered on
 };
 
-#define MIN_RANGE 10'000
-#define MAX_RANGE 10'000'000
-
-#define REGISTER_BENCHMARK(name, op_type)                       \
-  BENCHMARK_DEFINE_F(Groupby, name)(::benchmark::State & state) \
-  {                                                             \
-    BM_groupby_min_struct<op_type>(state);                      \
-  }                                                             \
-  BENCHMARK_REGISTER_F(Groupby, name)                           \
-    ->UseManualTime()                                           \
-    ->Unit(benchmark::kMillisecond)                             \
-    ->RangeMultiplier(4)                                        \
+#define MIN_RANGE 1'000  // lower bound given to Ranges() below; read as n_groups via state.range(0)
+#define MAX_RANGE 4'200'000  // upper bound given to Ranges() below
+
+#define REGISTER_BENCHMARK(name, use_old)                                                         \
+  BENCHMARK_DEFINE_F(Labeling, name)(::benchmark::State & state) { BM_labeling<use_old>(state); } \
+  BENCHMARK_REGISTER_F(Labeling, name)                                                            \
+    ->UseManualTime()                                                                             \
+    ->Unit(benchmark::kMillisecond)                                                               \
+    ->RangeMultiplier(4)                                                                          \
     ->Ranges({{MIN_RANGE, MAX_RANGE}});
 
-REGISTER_BENCHMARK(Aggregation, cudf::groupby_aggregation)
-REGISTER_BENCHMARK(Scan, cudf::groupby_scan_aggregation)
+REGISTER_BENCHMARK(LabelingOldWay, true)  // use_old = true: this benchmark times old_way
+// REGISTER_BENCHMARK(LabelingNewWay, false)