|
| 1 | +/** |
| 2 | + * traccc library, part of the ACTS project (R&D line) |
| 3 | + * |
| 4 | + * (c) 2024 CERN for the benefit of the ACTS project |
| 5 | + * |
| 6 | + * Mozilla Public License Version 2.0 |
| 7 | + */ |
| 8 | + |
| 9 | +#pragma once |
| 10 | + |
| 11 | +// Project include(s). |
| 12 | +#include "../utils/cuda_error_handling.hpp" |
| 13 | +#include "traccc/cuda/utils/stream.hpp" |
| 14 | +#include "traccc/definitions/concepts.hpp" |
| 15 | + |
| 16 | +// VecMem include(s). |
| 17 | +#include <vecmem/containers/data/vector_view.hpp> |
| 18 | +#include <vecmem/containers/device_vector.hpp> |
| 19 | +#include <vecmem/memory/memory_resource.hpp> |
| 20 | +#include <vecmem/memory/unique_ptr.hpp> |
| 21 | +#include <vecmem/utils/copy.hpp> |
| 22 | + |
| 23 | +// CUDA include |
| 24 | +#include <cuda_runtime.h> |
| 25 | + |
| 26 | +// System include |
| 27 | +#if __cpp_concepts >= 201907L |
| 28 | +#include <concepts> |
| 29 | +#endif |
| 30 | + |
| 31 | +namespace traccc::cuda { |
| 32 | +namespace kernels { |
/// Kernel compressing each run of adjacent, equal projection values into a
/// single output element.
///
/// Expects a 1D launch covering at least `_in.size()` threads, `*out_size`
/// pre-initialised to zero, and `out` sized to hold up to `_in.size()`
/// values. Output elements are appended with an atomic counter, so their
/// order is unspecified.
template <TRACCC_CONSTRAINT(std::semiregular) P, typename T, typename S>
#if __cpp_concepts >= 201907L
requires std::regular_invocable<P, T>
#endif
    __global__ void compress_adjacent(P projection,
                                      vecmem::data::vector_view<T> _in, S* out,
                                      uint32_t* out_size) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;

    const vecmem::device_vector<T> in(_in);

    if (idx == 0) {
        // The first element always begins a new run.
        out[atomicAdd(out_size, 1u)] = projection(in.at(idx));
    } else if (idx < in.size()) {
        const std::invoke_result_t<P, T> prev = projection(in.at(idx - 1));
        const std::invoke_result_t<P, T> curr = projection(in.at(idx));

        // Emit a value only where the projection changes, i.e. where a new
        // run begins.
        if (prev != curr) {
            out[atomicAdd(out_size, 1u)] = curr;
        }
    }
}
| 55 | + |
/// Kernel setting `*out` to false iff the first `n` elements of `in` contain
/// a duplicate.
///
/// Expects a 2D launch covering an n-by-n grid of threads and `*out`
/// pre-initialised to true. Threads only ever write the single value `false`,
/// so concurrent writes to `*out` are benign.
template <TRACCC_CONSTRAINT(std::equality_comparable) T>
__global__ void all_unique(const T* in, const size_t n, bool* out) {
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Each thread compares one ordered pair of distinct indices; any equal
    // pair disproves uniqueness.
    if (col < n && row < n && col != row && in[col] == in[row]) {
        *out = false;
    }
}
| 65 | +} // namespace kernels |
| 66 | + |
/**
 * @brief Sanity check that a given vector is contiguous on a given projection.
 *
 * For a vector $v$ to be contiguous on a projection $\pi$, it must be the case
 * that for all indices $i$ and $j$, if $\pi(v_i) = \pi(v_j)$, then for all
 * indices $k$ between $i$ and $j$, $\pi(v_i) = \pi(v_j) = \pi(v_k)$.
 *
 * @note This function runs in O(n^2) time.
 *
 * @tparam P The type of projection $\pi$, a callable which returns some
 * comparable type.
 * @tparam T The type of the vector.
 * @param projection A projection object of type `P`.
 * @param mr A memory resource used for allocating intermediate memory.
 * @param copy A copy object used to query the size of the vector view.
 * @param stream The CUDA stream on which all device work is scheduled.
 * @param vector The vector which to check for contiguity.
 * @return true If the vector is contiguous on `P`.
 * @return false Otherwise.
 */
template <TRACCC_CONSTRAINT(std::semiregular) P,
          TRACCC_CONSTRAINT(std::equality_comparable) T>
#if __cpp_concepts >= 201907L
requires std::regular_invocable<P, T>
#endif
    bool is_contiguous_on(P&& projection, vecmem::memory_resource& mr,
                          vecmem::copy& copy, stream& stream,
                          vecmem::data::vector_view<T> vector) {
    // This should never be a performance-critical step, so we can keep the
    // block size fixed.
    constexpr int block_size = 512;
    constexpr int block_size_2d = 32;

    cudaStream_t cuda_stream =
        reinterpret_cast<cudaStream_t>(stream.cudaStream());

    // Grab the number of elements in our vector.
    const uint32_t n = copy.get_size(vector);

    // An empty vector is trivially contiguous. Returning early also avoids
    // launching the first kernel with a zero-sized grid, which is an invalid
    // launch configuration in CUDA.
    if (n == 0) {
        return true;
    }

    // Get the output type of the projection.
    using projection_t = std::invoke_result_t<P, T>;

    // Allocate memory for intermediate values and outputs, then set them up.
    vecmem::unique_alloc_ptr<projection_t[]> iout =
        vecmem::make_unique_alloc<projection_t[]>(mr, n);
    vecmem::unique_alloc_ptr<uint32_t> iout_size =
        vecmem::make_unique_alloc<uint32_t>(mr);
    vecmem::unique_alloc_ptr<bool> out = vecmem::make_unique_alloc<bool>(mr);

    uint32_t initial_iout_size = 0;
    bool initial_out = true;

    TRACCC_CUDA_ERROR_CHECK(
        cudaMemcpyAsync(iout_size.get(), &initial_iout_size, sizeof(uint32_t),
                        cudaMemcpyHostToDevice, cuda_stream));
    TRACCC_CUDA_ERROR_CHECK(
        cudaMemcpyAsync(out.get(), &initial_out, sizeof(bool),
                        cudaMemcpyHostToDevice, cuda_stream));

    // Launch the first kernel, which will squash consecutive equal elements
    // into one element.
    kernels::compress_adjacent<P, T, projection_t>
        <<<(n + block_size - 1) / block_size, block_size, 0, cuda_stream>>>(
            projection, vector, iout.get(), iout_size.get());

    TRACCC_CUDA_ERROR_CHECK(cudaGetLastError());

    // Copy the total number of squashed elements, e.g. the size of the
    // resulting vector.
    uint32_t host_iout_size;

    TRACCC_CUDA_ERROR_CHECK(
        cudaMemcpyAsync(&host_iout_size, iout_size.get(), sizeof(uint32_t),
                        cudaMemcpyDeviceToHost, cuda_stream));

    // The copy above is asynchronous, so the host must wait for it to
    // complete before `host_iout_size` can be used to size the next launch;
    // reading it without this synchronization races against the device.
    stream.synchronize();

    // Launch the second kernel, which will check if the values are unique.
    uint32_t grid_size_rd =
        (host_iout_size + block_size_2d - 1) / block_size_2d;
    dim3 all_unique_grid_size(grid_size_rd, grid_size_rd);
    dim3 all_unique_block_size(block_size_2d, block_size_2d);

    kernels::all_unique<<<all_unique_grid_size, all_unique_block_size, 0,
                          cuda_stream>>>(iout.get(), host_iout_size, out.get());

    TRACCC_CUDA_ERROR_CHECK(cudaGetLastError());

    // Get the result from the device and return it.
    bool host_out;

    TRACCC_CUDA_ERROR_CHECK(cudaMemcpyAsync(&host_out, out.get(), sizeof(bool),
                                            cudaMemcpyDeviceToHost,
                                            cuda_stream));

    stream.synchronize();

    return host_out;
}
| 162 | +} // namespace traccc::cuda |
0 commit comments