diff --git a/device/alpaka/src/clusterization/clusterization_algorithm.cpp b/device/alpaka/src/clusterization/clusterization_algorithm.cpp index 6e5f116bc9..89617fa39b 100644 --- a/device/alpaka/src/clusterization/clusterization_algorithm.cpp +++ b/device/alpaka/src/clusterization/clusterization_algorithm.cpp @@ -38,11 +38,8 @@ struct CCLKernel { traccc::alpaka::thread_id1 thread_id(acc); - auto& partition_start = - ::alpaka::declareSharedVar(acc); - auto& partition_end = - ::alpaka::declareSharedVar(acc); - auto& outi = ::alpaka::declareSharedVar(acc); + auto& shared = ::alpaka::declareSharedVar< + device::details::ccl_kernel_static_smem_parcel, __COUNTER__>(acc); device::details::index_t* const shared_v = ::alpaka::getDynSharedMem(acc); @@ -56,9 +53,8 @@ struct CCLKernel { alpaka::barrier barry_r(&acc); - device::ccl_kernel(cfg, thread_id, cells_view, modules_view, - partition_start, partition_end, outi, f_view, - gf_view, f_backup_view, gf_backup_view, + device::ccl_kernel(cfg, thread_id, cells_view, modules_view, shared, + f_view, gf_view, f_backup_view, gf_backup_view, adjc_backup_view, adjv_backup_view, backup_mutex, barry_r, measurements_view, cell_links); } diff --git a/device/common/include/traccc/clusterization/device/ccl_kernel.hpp b/device/common/include/traccc/clusterization/device/ccl_kernel.hpp index 58d709ee42..f9756b01aa 100644 --- a/device/common/include/traccc/clusterization/device/ccl_kernel.hpp +++ b/device/common/include/traccc/clusterization/device/ccl_kernel.hpp @@ -26,6 +26,13 @@ #include namespace traccc::device { +namespace details { +struct ccl_kernel_static_smem_parcel { + std::size_t partition_start; + std::size_t partition_end; + uint32_t outi; +}; +} // namespace details /// Function which reads raw detector cells and turns them into measurements. /// @@ -59,7 +66,7 @@ TRACCC_DEVICE inline void ccl_kernel( const clustering_config cfg, const thread_id_t& thread_id, const cell_collection_types::const_view cells_view, const cell_module_collection_types::const_view modules_view, - std::size_t& partition_start, std::size_t& partition_end, std::size_t& outi, + details::ccl_kernel_static_smem_parcel& smem, vecmem::data::vector_view f_view, vecmem::data::vector_view gf_view, vecmem::data::vector_view f_backup_view, diff --git a/device/common/include/traccc/clusterization/device/impl/ccl_kernel.ipp b/device/common/include/traccc/clusterization/device/impl/ccl_kernel.ipp index 773261b344..b5bb18aadb 100644 --- a/device/common/include/traccc/clusterization/device/impl/ccl_kernel.ipp +++ b/device/common/include/traccc/clusterization/device/impl/ccl_kernel.ipp @@ -11,17 +11,19 @@ #include "traccc/clusterization/clustering_config.hpp" #include "traccc/clusterization/device/aggregate_cluster.hpp" +#include "traccc/clusterization/device/ccl_kernel.hpp" #include "traccc/clusterization/device/ccl_kernel_definitions.hpp" #include "traccc/clusterization/device/reduce_problem_cell.hpp" #include "traccc/device/concepts/barrier.hpp" #include "traccc/device/concepts/thread_id.hpp" #include "traccc/device/mutex.hpp" +#include "traccc/device/sort.hpp" #include "traccc/device/unique_lock.hpp" #include "traccc/edm/cell.hpp" +#include "traccc/edm/measurement.hpp" #include "vecmem/memory/device_atomic_ref.hpp" namespace traccc::device { - /// Implementation of a FastSV algorithm with the following steps: /// 1) mix of stochastic and aggressive hooking /// 2) shortcutting @@ -136,15 +138,15 @@ TRACCC_DEVICE void fast_sv_1(const thread_id_t& thread_id, template TRACCC_DEVICE inline void ccl_core( - const thread_id_t& thread_id, std::size_t& partition_start, - std::size_t& partition_end, vecmem::device_vector f, + const thread_id_t& thread_id, details::ccl_kernel_static_smem_parcel& smem, + vecmem::device_vector f, vecmem::device_vector gf, vecmem::data::vector_view cell_links, details::index_t* adjv, unsigned char* adjc, const cell_collection_types::const_device cells_device, const cell_module_collection_types::const_device modules_device, measurement_collection_types::device measurements_device, barrier_t& barrier) { - const details::index_t size = partition_end - partition_start; + const details::index_t size = smem.partition_end - smem.partition_start; assert(size <= f.size()); assert(size <= gf.size()); @@ -160,8 +162,8 @@ TRACCC_DEVICE inline void ccl_core( const details::index_t cid = tst * thread_id.getBlockDimX() + thread_id.getLocalThreadIdX(); adjc[tst] = 0; - reduce_problem_cell(cells_device, cid, partition_start, partition_end, - adjc[tst], &adjv[8 * tst]); + reduce_problem_cell(cells_device, cid, smem.partition_start, + smem.partition_end, adjc[tst], &adjv[8 * tst]); } for (details::index_t tst = 0; tst < thread_cell_count; ++tst) { @@ -189,20 +191,96 @@ TRACCC_DEVICE inline void ccl_core( barrier.blockBarrier(); + /* + * We will now repurpose the `gf` shared vector to store the cells which + * are parents. For the time being, `outi` will be the size of `gf`. + */ + if (thread_id.getLocalThreadIdX() == 0) { + smem.outi = 0; + } + + barrier.blockBarrier(); + + /* + * We now collect the parents into the `gf` array. + */ for (details::index_t tst = 0; tst < thread_cell_count; ++tst) { const details::index_t cid = tst * thread_id.getBlockDimX() + thread_id.getLocalThreadIdX(); + if (f.at(cid) == cid) { - // Add a new measurement to the output buffer. Remembering its - // position inside of the container. - const measurement_collection_types::device::size_type meas_pos = - measurements_device.push_back({}); - // Set up the measurement under the appropriate index. - aggregate_cluster( - cells_device, modules_device, f, partition_start, partition_end, - cid, measurements_device.at(meas_pos), cell_links, meas_pos); + vecmem::device_atomic_ref + atom(smem.outi); + gf.at(atom.fetch_add(1)) = cid; + } + } + + barrier.blockBarrier(); + + uint32_t n_measurements = smem.outi; + + barrier.blockBarrier(); + + /* + * The lead threat resizes the measurement vector to contain the total + * number of measurements. The `outi` variable is now repurposed to contain + * the start of this block's region in the output array. + */ + if (thread_id.getLocalThreadIdX() == 0) { + smem.outi = measurements_device.bulk_append_implicit(n_measurements); + } + + barrier.blockBarrier(); + + /* + * Sort the measurements so that the modules are ordered, which guarantees + * that they will be contiguous. + * + * TODO: Can this be done more cleverly? Perhaps while collecting the + * elements into `gf`. + */ + blockOddEvenSort( + thread_id, barrier, gf.data(), n_measurements, + [&](const unsigned short lhs, const unsigned short rhs) { + return cells_device.at(smem.partition_start + f.at(lhs)) + .module_link < + cells_device.at(smem.partition_start + f.at(rhs)) + .module_link; + }); + + barrier.blockBarrier(); + + /* + * Aggregate the clusters and write them to global memory. + */ + for (uint32_t i = thread_id.getLocalThreadIdX(); i < n_measurements; + i += thread_id.getBlockDimX()) { + uint32_t meas_pos = smem.outi + i; + // Set up the measurement under the appropriate index. + aggregate_cluster(cells_device, modules_device, f, smem.partition_start, + smem.partition_end, gf.at(i), + measurements_device.at(meas_pos), cell_links, + meas_pos); + } +} + +TRACCC_DEVICE inline std::size_t ccl_partition_find_helper( + const cell_collection_types::const_device cells, std::size_t begin, + std::size_t target_end) { + bool found_good = false; + std::size_t last_good; + + for (std::size_t i = begin; i < target_end || !found_good; ++i) { + if (i == cells.size() || + cells.at(i - 1).module_link != cells.at(i).module_link) { + found_good = true; + last_good = i; } } + + assert(found_good); + return last_good; } template f_view, vecmem::data::vector_view gf_view, vecmem::data::vector_view f_backup_view, @@ -236,8 +314,6 @@ TRACCC_DEVICE inline void ccl_kernel( mutex mutex(backup_mutex); unique_lock lock(mutex, std::defer_lock); - const std::size_t num_cells = cells_device.size(); - /* * First, we determine the exact range of cells that is to be examined * by this block of threads. We start from an initial range determined @@ -247,12 +323,19 @@ TRACCC_DEVICE inline void ccl_kernel( * amounts. */ if (thread_id.getLocalThreadIdX() == 0) { - std::size_t start = - thread_id.getBlockIdX() * cfg.target_partition_size(); - assert(start < num_cells); - std::size_t end = - std::min(num_cells, start + cfg.target_partition_size()); - outi = 0; + const std::size_t num_cells = cells_device.size(); + + smem.outi = 0; + + std::size_t boundary_beg = + std::max(1lu, std::min(num_cells, thread_id.getBlockIdX() * + cfg.target_partition_size())); + std::size_t boundary_mid = + std::min(num_cells, (thread_id.getBlockIdX() + 1) * + cfg.target_partition_size()); + std::size_t boundary_end = + std::min(num_cells, (thread_id.getBlockIdX() + 2) * + cfg.target_partition_size()); /* * Next, shift the starting point to a position further in the @@ -260,12 +343,11 @@ TRACCC_DEVICE inline void ccl_kernel( * on any cells that have been claimed by the previous block (if * any). */ - while (start != 0 && - cells_device[start - 1].module_link == - cells_device[start].module_link && - cells_device[start].channel1 <= - cells_device[start - 1].channel1 + 1) { - ++start; + if (thread_id.getBlockIdX() == 0) { + smem.partition_start = 0; + } else { + smem.partition_start = ccl_partition_find_helper( + cells_device, boundary_beg, boundary_mid); } /* @@ -273,16 +355,10 @@ TRACCC_DEVICE inline void ccl_kernel( * current block to ensure that we do not end our partition on a * cell that is not a possible boundary! */ - while (end < num_cells && - cells_device[end - 1].module_link == - cells_device[end].module_link && - cells_device[end].channel1 <= - cells_device[end - 1].channel1 + 1) { - ++end; - } - partition_start = start; - partition_end = end; - assert(partition_start <= partition_end); + smem.partition_end = + ccl_partition_find_helper(cells_device, boundary_mid, boundary_end); + + assert(smem.partition_start <= smem.partition_end); } barrier.blockBarrier(); @@ -303,7 +379,7 @@ TRACCC_DEVICE inline void ccl_kernel( // into a return. As such, we cannot use returns in this kernel. // Get partition for this thread group - const details::index_t size = partition_end - partition_start; + const details::index_t size = smem.partition_end - smem.partition_start; // If the size is zero, we can just retire the whole block. if (size == 0) { @@ -342,8 +418,7 @@ TRACCC_DEVICE inline void ccl_kernel( use_scratch = false; } - ccl_core(thread_id, partition_start, partition_end, - use_scratch ? f_backup : f_primary, + ccl_core(thread_id, smem, use_scratch ? f_backup : f_primary, use_scratch ? gf_backup : gf_primary, cell_links, adjv, adjc, cells_device, modules_device, measurements_device, barrier); diff --git a/device/common/include/traccc/device/sort.hpp b/device/common/include/traccc/device/sort.hpp new file mode 100644 index 0000000000..ecfda6d112 --- /dev/null +++ b/device/common/include/traccc/device/sort.hpp @@ -0,0 +1,86 @@ +/** + * traccc library, part of the ACTS project (R&D line) + * + * (c) 2024 CERN for the benefit of the ACTS project + * + * Mozilla Public License Version 2.0 + */ + +#pragma once + +#include +#include + +#include "traccc/definitions/qualifiers.hpp" +#include "traccc/device/concepts/barrier.hpp" +#include "traccc/device/concepts/thread_id.hpp" + +namespace traccc::device { +/** + * @brief Swap two values of arbitrary type. + * + * @tparam T The type of values to swap. + * + * @param a The first object in the swap (will take the value of b). + * @param b The second object in the swap (will take the value of a). + */ +template +TRACCC_DEVICE void swap(T& a, T& b) { + T t = std::move(a); + a = std::move(b); + b = std::move(t); +} + +/** + * @brief Perform a block-wide odd-even key sorting. + * + * This function performs a sorting operation across the entire block, assuming + * that all the threads in the block are currently active. + * + * @warning The behaviour of this function is ill-defined if any of the threads + * in the block have exited. + * + * @warning This method is efficient for sorting small arrays, preferably in + * shared memory, but given the O(n^2) worst-case performance this should not + * be used on larger arrays. + * + * @tparam T The thread identifier type. + * @tparam B The barrier type + * @tparam K The type of keys to sort. + * @tparam C The type of the comparison function. + * + * @param thread_id The thread identifier object. + * @param barrier The barrier to use for block synchronization. + * @param keys An array of keys to sort. + * @param num_keys The number of keys in the array to sort. + * @param comparison A comparison function. + */ +template C> +TRACCC_DEVICE void blockOddEvenSort(T& thread_id, B& barrier, K* keys, + uint32_t num_keys, C&& comparison) { + bool sorted; + + do { + sorted = true; + + for (uint32_t j = 2 * thread_id.getLocalThreadIdX() + 1; + j < num_keys - 1; j += 2 * thread_id.getBlockDimX()) { + if (comparison(keys[j + 1], keys[j])) { + swap(keys[j + 1], keys[j]); + sorted = false; + } + } + + barrier.blockBarrier(); + + for (uint32_t j = 2 * thread_id.getLocalThreadIdX(); j < num_keys - 1; + j += 2 * thread_id.getBlockDimX()) { + if (comparison(keys[j + 1], keys[j])) { + swap(keys[j + 1], keys[j]); + sorted = false; + } + } + } while (barrier.blockOr(!sorted)); +} +} // namespace traccc::device diff --git a/device/cuda/src/clusterization/clusterization_algorithm.cu b/device/cuda/src/clusterization/clusterization_algorithm.cu index 75304bd0a3..57479aa054 100644 --- a/device/cuda/src/clusterization/clusterization_algorithm.cu +++ b/device/cuda/src/clusterization/clusterization_algorithm.cu @@ -42,8 +42,7 @@ __global__ void ccl_kernel( vecmem::data::vector_view adjv_backup_view, unsigned int* backup_mutex_ptr) { - __shared__ std::size_t partition_start, partition_end; - __shared__ std::size_t outi; + __shared__ device::details::ccl_kernel_static_smem_parcel shared; extern __shared__ device::details::index_t shared_v[]; vecmem::device_atomic_ref backup_mutex(*backup_mutex_ptr); @@ -58,9 +57,8 @@ __global__ void ccl_kernel( traccc::cuda::barrier barry_r; const cuda::thread_id1 thread_id; - device::ccl_kernel(cfg, thread_id, cells_view, modules_view, - partition_start, partition_end, outi, f_view, gf_view, - f_backup_view, gf_backup_view, adjc_backup_view, + device::ccl_kernel(cfg, thread_id, cells_view, modules_view, shared, f_view, + gf_view, f_backup_view, gf_backup_view, adjc_backup_view, adjv_backup_view, backup_mutex, barry_r, measurements_view, cell_links); } @@ -140,6 +138,12 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()( m_gf_backup, m_adjc_backup, m_adjv_backup, m_backup_mutex.get()); TRACCC_CUDA_ERROR_CHECK(cudaGetLastError()); +#ifndef NDEBUG + TRACCC_CUDA_ERROR_CHECK(cudaStreamSynchronize(stream)); + assert(is_contiguous_on(measurement_module_projection(), m_mr.main, m_copy, + m_stream, measurements)); +#endif + // Return the reconstructed measurements. return measurements; } diff --git a/device/sycl/src/clusterization/clusterization_algorithm.sycl b/device/sycl/src/clusterization/clusterization_algorithm.sycl index b1ef6f01d8..96d7e63feb 100644 --- a/device/sycl/src/clusterization/clusterization_algorithm.sycl +++ b/device/sycl/src/clusterization/clusterization_algorithm.sycl @@ -117,14 +117,16 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()( details::get_queue(m_queue) .submit([&](::sycl::handler& h) { // Allocate shared memory for the kernel. - vecmem::sycl::local_accessor shared_uint(3, h); + vecmem::sycl::local_accessor< + device::details::ccl_kernel_static_smem_parcel> + shared(1, h); vecmem::sycl::local_accessor shared_idx( 2 * m_config.max_partition_size(), h); // Launch the kernel. h.parallel_for( cclKernelRange, - [shared_uint, shared_idx, cells_view, modules_view, + [shared, shared_idx, cells_view, modules_view, measurements_view, cell_links_view, f_backup_view = vecmem::get_data(m_f_backup), gf_backup_view = vecmem::get_data(m_gf_backup), @@ -139,9 +141,6 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()( vecmem::data::vector_view gf_view{ static_cast(cfg.max_partition_size()), &shared_idx[cfg.max_partition_size()]}; - std::size_t& partition_start = shared_uint[0]; - std::size_t& partition_end = shared_uint[1]; - std::size_t& outi = shared_uint[2]; // Mutex for scratch space vecmem::device_atomic_ref backup_mutex( @@ -152,16 +151,18 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()( const sycl::thread_id1 thread_id(item); // Run the algorithm for this thread. - device::ccl_kernel(cfg, thread_id, cells_view, modules_view, - partition_start, partition_end, outi, - f_view, gf_view, f_backup_view, - gf_backup_view, adjc_backup_view, - adjv_backup_view, backup_mutex, barry_r, - measurements_view, cell_links_view); + device::ccl_kernel( + cfg, thread_id, cells_view, modules_view, shared[0], + f_view, gf_view, f_backup_view, gf_backup_view, + adjc_backup_view, adjv_backup_view, backup_mutex, + barry_r, measurements_view, cell_links_view); }); }) .wait_and_throw(); + assert(is_contiguous_on(measurement_module_projection(), m_mr.main, m_copy, + m_queue, measurements)); + // Return the reconstructed measurements. return measurements; } diff --git a/tests/cuda/CMakeLists.txt b/tests/cuda/CMakeLists.txt index 6b53c7bfa4..458592d5a5 100644 --- a/tests/cuda/CMakeLists.txt +++ b/tests/cuda/CMakeLists.txt @@ -44,6 +44,7 @@ traccc_add_test( test_unique_lock.cu test_sanity_contiguous_on.cu test_sanity_ordered_on.cu + test_sort.cu LINK_LIBRARIES CUDA::cudart diff --git a/tests/cuda/test_sort.cu b/tests/cuda/test_sort.cu new file mode 100644 index 0000000000..dc657070a8 --- /dev/null +++ b/tests/cuda/test_sort.cu @@ -0,0 +1,46 @@ +/** + * traccc library, part of the ACTS project (R&D line) + * + * (c) 2024 CERN for the benefit of the ACTS project + * + * Mozilla Public License Version 2.0 + */ + +#include + +#include +#include + +#include "../../cuda/src/utils/barrier.hpp" +#include "traccc/cuda/utils/thread_id.hpp" +#include "traccc/device/sort.hpp" + +__global__ void testBlockSortKernel(uint32_t *keys, uint32_t n_keys) { + traccc::cuda::thread_id1 thread_id; + traccc::cuda::barrier barrier; + traccc::device::blockOddEvenSort(thread_id, barrier, keys, n_keys, + std::less()); +} + +TEST(CUDASort, BlockOddEvenSort) { + vecmem::cuda::managed_memory_resource mr; + + uint32_t n = 2803; + vecmem::unique_alloc_ptr arr = + vecmem::make_unique_alloc(mr, n); + + // As long as 13 and n_keys are coprime, this will generate a big, + // non-sorted array containing every element. + for (uint32_t i = 0; i < n; i++) { + arr[i] = (13 * 500 * i) % n; + } + + testBlockSortKernel<<<1, 1024u>>>(arr.get(), n); + + ASSERT_EQ(cudaPeekAtLastError(), cudaSuccess); + ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess); + + for (uint32_t i = 0; i < n; ++i) { + ASSERT_EQ(arr[i], i); + } +} diff --git a/tests/sycl/CMakeLists.txt b/tests/sycl/CMakeLists.txt index 931310d20c..63dbf9ff43 100644 --- a/tests/sycl/CMakeLists.txt +++ b/tests/sycl/CMakeLists.txt @@ -22,6 +22,7 @@ traccc_add_test( test_cca.sycl test_sanity_contiguous_on.sycl test_sanity_ordered_on.sycl + test_sort.sycl LINK_LIBRARIES GTest::gtest_main diff --git a/tests/sycl/test_sort.sycl b/tests/sycl/test_sort.sycl new file mode 100644 index 0000000000..eb4a4ef724 --- /dev/null +++ b/tests/sycl/test_sort.sycl @@ -0,0 +1,51 @@ +/** + * traccc library, part of the ACTS project (R&D line) + * + * (c) 2024 CERN for the benefit of the ACTS project + * + * Mozilla Public License Version 2.0 + */ + +#include + +#include +#include +#include + +#include "../../sycl/src/utils/barrier.hpp" +#include "traccc/device/sort.hpp" +#include "traccc/sycl/utils/thread_id.hpp" + +TEST(SYCLSort, BlockOddEvenSort) { + vecmem::sycl::shared_memory_resource mr; + cl::sycl::queue queue; + + uint32_t n = 2803; + vecmem::unique_alloc_ptr arr = + vecmem::make_unique_alloc(mr, n); + + // As long as 13 and n_keys are coprime, this will generate a big, + // non-sorted array containing every element. + for (uint32_t i = 0; i < n; i++) { + arr[i] = (13 * 500 * i) % n; + } + + cl::sycl::nd_range test_range(cl::sycl::range<1>(128), + cl::sycl::range<1>(128)); + + queue + .submit([&, keys = arr.get()](cl::sycl::handler &h) { + h.parallel_for( + test_range, [=](cl::sycl::nd_item<1> item) { + traccc::sycl::thread_id1 thread_id(item); + traccc::sycl::barrier barrier(item); + traccc::device::blockOddEvenSort(thread_id, barrier, keys, + n, std::less()); + }); + }) + .wait_and_throw(); + + for (uint32_t i = 0; i < n; ++i) { + ASSERT_EQ(arr[i], i); + } +}