rapidsai · ahendriksen · May 5, 2023 · May 5, 2023 · May 5, 2023 · May 5, 2023
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Tuning benchmarks.
+//
+// Goals:
+//
+// 1. Fast compile times to maintain iteration speed.
+// 2. Create benchmarks that can inform the design of the kernels.
+//
+// Non-goals:
+//
+// 1. Measure every distance operation. Instead, measure just one distance
+// operation at a time.
+// 2. Be useful for finding performance regressions. This is handled by the
+// normal benchmarks.
+//
+// So far, both goals are partly achieved.
+//
+// RE (1), COMPILE TIMES: kernel.cu is fast to compile. This file is not.
+// When the internals of a pairwise distance kernel are changed, this file is not
+// recompiled.
+//
+// RE (2), BENCHMARKS WITH INTENT: this file contains a benchmark to check the
+// maximal throughput of a kernel. Measuring other things, like performance on
+// skinny or wide matrices, is not yet implemented.
+
+#include "kernel_cutlass.cuh" // launch_kernel
+#include <algorithm> // std::min
+#include <common/benchmark.hpp> // RAFT_BENCH_REGISTER
+#include <raft/distance/detail/pairwise_matrix/params.cuh> // pairwise_matrix_params
+#include <rmm/device_uvector.hpp> // rmm::device_uvector
+#include <vector> // std::vector
+
+namespace raft::bench::distance::tune_cutlass {
+
+// Max throughput benchmark.
+//
+// Goal: Measure the maximum distances/sec that can be computed.
+//
+// To achieve this, we make sure that:
+//
+// - Input data size is a multiple of the block tile size.
+//
+// - Perfect distribution of work between SMs, i.e. the number of block tiles is
+// a large multiple (num_waves) of the number of blocks (#SMs * occupancy).
+//
+// - Multiple iterations over Kblk are executed (num_k_iters).
+// One configuration of the throughput benchmark.
+//
+// The benchmark allocates m*k elements for x, n*k elements for y and m*n
+// elements for the output.
+struct throughput_param {
+  int m;  // number of x vectors
+  int n;  // number of y vectors
+  int k;  // length shared by every x and y vector
+  // true  -> launch with Compute_options::Fast_Reduced_Precision
+  // false -> launch with Compute_options::Fast_Similar_Precision
+  bool use_1x_tfloat;
+};
+
+// Configurations fed to the throughput benchmark: m is fixed at 2^10, n is
+// either 2^10 or 2^14, and k sweeps 2^10 .. 2^13. The full sweep is listed
+// once per precision mode.
+const std::vector<throughput_param> throughput_params{
+  // use_1x_tfloat == true
+  {1 << 10, 1 << 10, 1 << 10, true},
+  {1 << 10, 1 << 10, 1 << 11, true},
+  {1 << 10, 1 << 10, 1 << 12, true},
+  {1 << 10, 1 << 10, 1 << 13, true},
+  {1 << 10, 1 << 14, 1 << 10, true},
+  {1 << 10, 1 << 14, 1 << 11, true},
+  {1 << 10, 1 << 14, 1 << 12, true},
+  {1 << 10, 1 << 14, 1 << 13, true},
+
+  // use_1x_tfloat == false
+  {1 << 10, 1 << 10, 1 << 10, false},
+  {1 << 10, 1 << 10, 1 << 11, false},
+  {1 << 10, 1 << 10, 1 << 12, false},
+  {1 << 10, 1 << 10, 1 << 13, false},
+  {1 << 10, 1 << 14, 1 << 10, false},
+  {1 << 10, 1 << 14, 1 << 11, false},
+  {1 << 10, 1 << 14, 1 << 12, false},
+  {1 << 10, 1 << 14, 1 << 13, false},
+};
+
+// Benchmark fixture that times launch_kernel() for one throughput_param
+// configuration and reports size and throughput counters.
+//
+// The input, norm and output buffers are allocated here but never filled by
+// this fixture: only the run time of the launch is measured, not its result.
+//
+// NOTE(review): `stream` and `loop_on_state` are not declared in this struct;
+// presumably they are inherited from `fixture` (common/benchmark.hpp).
+struct throughput_cutlass : public fixture {
+  // Problem size and precision mode of this benchmark instance.
+  const throughput_param p;
+
+  throughput_cutlass(const throughput_param& p_) : p(p_) {}
+
+  // Allocates the buffers, runs launch_kernel inside loop_on_state and fills
+  // in the counters of `state`.
+  void run_benchmark(::benchmark::State& state) override
+  {
+    // Widen to size_t so the element counts below are computed in 64 bits.
+    size_t m = p.m;
+    size_t n = p.n;
+    size_t k = p.k;
+
+    // DataT, OutT, IdxT, etc, are defined in kernel_cutlass.cuh
+    rmm::device_uvector<DataT> x_vec(m * k, stream);
+    rmm::device_uvector<DataT> y_vec(n * k, stream);
+    rmm::device_uvector<DataT> x_norm_vec(m, stream);
+    rmm::device_uvector<DataT> y_norm_vec(n, stream);
+    rmm::device_uvector<OutT> out_vec(m * n, stream);
+
+    auto x      = x_vec.data();
+    auto y      = y_vec.data();
+    auto x_norm = x_norm_vec.data();
+    auto y_norm = y_norm_vec.data();
+    auto out    = out_vec.data();
+    FinOpT fin_op{};
+
+    // Create kernel parameter struct. The leading dimensions depend on the
+    // memory layout selected by row_major. The size_t -> IdxT conversions are
+    // spelled out, matching the explicit IdxT(...) casts used below.
+    IdxT ldx    = static_cast<IdxT>(row_major ? k : m);
+    IdxT ldy    = static_cast<IdxT>(row_major ? k : n);
+    IdxT ld_out = static_cast<IdxT>(row_major ? n : m);
+
+    // Template parameters of pairwise_matrix_params are defined in
+    // kernel_cutlass.cuh
+    pairwise_matrix_params kparams{
+      IdxT(m), IdxT(n), IdxT(k), ldx, ldy, ld_out, x, y, x_norm, y_norm, out, fin_op, row_major};
+
+    // Run benchmark
+    loop_on_state(state, [&]() { launch_kernel(kparams, p.use_1x_tfloat, stream); });
+
+    // Report metrics. We don't report flop/s because we do not know for each
+    // distance operation how many flops it costs. For L2_unexp and l1, we can
+    // double this number to get the flop/s. For l2 expanded, 2*core_ops/s should
+    // equal flop/s (ignoring the sqrt and subtracting from the norm).
+    size_t num_core_ops = m * n * k;
+    // Element counts used for the bandwidth counter. The norm vectors are not
+    // included.
+    size_t read_elts  = n * k + m * k;
+    size_t write_elts = m * n;
+
+    state.counters["m"]        = benchmark::Counter(m);
+    state.counters["n"]        = benchmark::Counter(n);
+    state.counters["k"]        = benchmark::Counter(k);
+    state.counters["1xtfloat"] = benchmark::Counter(p.use_1x_tfloat);
+
+    state.counters["core_ops/s"] = benchmark::Counter(num_core_ops,
+                                                      benchmark::Counter::kIsIterationInvariantRate,
+                                                      benchmark::Counter::OneK::kIs1000);
+
+    // Bytes read plus bytes written per launch, reported as a rate.
+    state.counters["BW"] = benchmark::Counter(write_elts * sizeof(OutT) + read_elts * sizeof(DataT),
+                                              benchmark::Counter::kIsIterationInvariantRate,
+                                              benchmark::Counter::OneK::kIs1000);
+  }
+};
+
+// Hand the throughput_cutlass fixture and the throughput_params list to the
+// benchmark registration macro.
+// NOTE(review): RAFT_BENCH_REGISTER is defined in common/benchmark.hpp, which
+// is not visible here; presumably it registers one benchmark per list entry
+// and the empty string is a name/suffix argument -- confirm against the macro.
+RAFT_BENCH_REGISTER(throughput_cutlass, "", throughput_params);
+
+} // namespace raft::bench::distance::tune_cutlass
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_cutlass.cuh"
+#include <raft/distance/detail/distance_ops/all_ops.cuh> // distance_op
+#include <raft/distance/detail/pairwise_matrix/dispatch_sm80.cuh>
+#include <raft/distance/detail/pairwise_matrix/params.cuh>
+#include <raft/distance/distance_types.hpp> // Compute_options
+#include <raft/util/arch.cuh> // raft::util::arch::SM_compute_arch
+
+namespace raft::bench::distance::tune_cutlass {
+
+// Distance op: the single distance operation used by launch_kernel in this
+// file. DataT, AccT and IdxT are the aliases declared in kernel_cutlass.cuh.
+using OpT = raft::distance::detail::ops::l2_exp_distance_op<DataT, AccT, IdxT>;
+
+// Constructor argument of the distance op.
+// NOTE(review): presumably disables the final square root (name-based) --
+// confirm in l2_exp_distance_op.
+constexpr bool perform_sqrt = false;
+// Namespace-scope instance that launch_kernel passes to the dispatch call.
+OpT distance_op{perform_sqrt};
+
+// Architecture: SM range built from SM_80 and SM_future and passed to the
+// dispatch call in launch_kernel.
+namespace arch = raft::util::arch;
+constexpr auto sm_compat_range = arch::SM_range(arch::SM_80(), arch::SM_future());
+
+// Run the SM80 pairwise-matrix dispatch with the distance op and SM range
+// defined in this file.
+//
+// params        : kernel parameter struct, forwarded unchanged.
+// use_1x_tfloat : true selects Compute_options::Fast_Reduced_Precision,
+//                 false selects Compute_options::Fast_Similar_Precision.
+// stream        : CUDA stream forwarded to the dispatch call.
+//
+// After the dispatch call returns, cudaGetLastError() is checked through
+// RAFT_CUDA_TRY.
+void launch_kernel(pairwise_matrix_params params, bool use_1x_tfloat, cudaStream_t stream)
+{
+  using raft::distance::Compute_options;
+
+  // Map the boolean switch onto the precision option expected by the dispatch.
+  const auto compute_option = use_1x_tfloat ? Compute_options::Fast_Reduced_Precision
+                                            : Compute_options::Fast_Similar_Precision;
+
+  raft::distance::detail::pairwise_matrix_sm80_dispatch(
+    distance_op, compute_option, params, sm_compat_range, stream);
+
+  RAFT_CUDA_TRY(cudaGetLastError());
+}
+
+} // namespace raft::bench::distance::tune_cutlass
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/operators.hpp> // raft::identity_op
+#include <raft/distance/detail/pairwise_matrix/params.cuh> // pairwise_matrix_params
+
+namespace raft::bench::distance::tune_cutlass {
+
+// Launch one specific kernel with the following template parameters.
+// These aliases are shared between the benchmark driver and the translation
+// unit that defines launch_kernel, both of which include this header.
+
+// Memory layout flag; the benchmark uses it to pick the leading dimensions and
+// stores it in the kernel parameter struct.
+constexpr bool row_major = true;
+using DataT = float;  // element type of the x, y and norm buffers
+using AccT = float;   // second template argument of the distance op
+using OutT = DataT;   // element type of the output buffer
+using IdxT = int;     // type of sizes and leading dimensions in the params
+
+// Type of the fin_op member passed into the kernel parameter struct.
+using FinOpT = raft::identity_op;
+
+// Kernel parameter struct, instantiated with the aliases above.
+using pairwise_matrix_params =
+ raft::distance::detail::pairwise_matrix_params<IdxT, DataT, OutT, FinOpT>;
+
+// Launches the kernel for `params` on `stream`; `use_1x_tfloat` selects the
+// precision mode. Declared here and defined in a separate translation unit.
+void launch_kernel(pairwise_matrix_params params, bool use_1x_tfloat, cudaStream_t stream);
+
+} // namespace raft::bench::distance::tune_cutlass