From e32233f5517ce5f6839510ff5fa4cb517db93ab4 Mon Sep 17 00:00:00 2001 From: Stephen Nicholas Swatman Date: Thu, 16 Mar 2023 13:52:46 +0100 Subject: [PATCH 1/3] Implement block-wide odd-even sort Sorting small arrays is a relatively common problem in GPGPU programming. Many useful algorithms exist, and some are provided by libraries like CUB. An algorithm close to my heart is odd-even sort because it is exceedingly simply, relatively efficient for small arrays and, importantly, it uses O(1) space. This commit adds new implementations of this sorting algorithm for block-wide odd-even sort in a portable way. --- device/common/include/traccc/device/sort.hpp | 86 ++++++++++++++++++++ tests/cuda/CMakeLists.txt | 1 + tests/cuda/test_sort.cu | 46 +++++++++++ tests/sycl/CMakeLists.txt | 1 + tests/sycl/test_sort.sycl | 51 ++++++++++++ 5 files changed, 185 insertions(+) create mode 100644 device/common/include/traccc/device/sort.hpp create mode 100644 tests/cuda/test_sort.cu create mode 100644 tests/sycl/test_sort.sycl diff --git a/device/common/include/traccc/device/sort.hpp b/device/common/include/traccc/device/sort.hpp new file mode 100644 index 0000000000..ecfda6d112 --- /dev/null +++ b/device/common/include/traccc/device/sort.hpp @@ -0,0 +1,86 @@ +/** + * traccc library, part of the ACTS project (R&D line) + * + * (c) 2024 CERN for the benefit of the ACTS project + * + * Mozilla Public License Version 2.0 + */ + +#pragma once + +#include +#include + +#include "traccc/definitions/qualifiers.hpp" +#include "traccc/device/concepts/barrier.hpp" +#include "traccc/device/concepts/thread_id.hpp" + +namespace traccc::device { +/** + * @brief Swap two values of arbitrary type. + * + * @tparam T The type of values to swap. + * + * @param a The first object in the swap (will take the value of b). + * @param b The second object in the swap (will take the value of a). + */ +template +TRACCC_DEVICE void swap(T& a, T& b) { + T t = std::move(a); + a = std::move(b); + b = std::move(t); +} + +/** + * @brief Perform a block-wide odd-even key sorting. + * + * This function performs a sorting operation across the entire block, assuming + * that all the threads in the block are currently active. + * + * @warning The behaviour of this function is ill-defined if any of the threads + * in the block have exited. + * + * @warning This method is efficient for sorting small arrays, preferably in + * shared memory, but given the O(n^2) worst-case performance this should not + * be used on larger arrays. + * + * @tparam T The thread identifier type. + * @tparam B The barrier type + * @tparam K The type of keys to sort. + * @tparam C The type of the comparison function. + * + * @param thread_id The thread identifier object. + * @param barrier The barrier to use for block synchronization. + * @param keys An array of keys to sort. + * @param num_keys The number of keys in the array to sort. + * @param comparison A comparison function. + */ +template C> +TRACCC_DEVICE void blockOddEvenSort(T& thread_id, B& barrier, K* keys, + uint32_t num_keys, C&& comparison) { + bool sorted; + + do { + sorted = true; + + for (uint32_t j = 2 * thread_id.getLocalThreadIdX() + 1; + j < num_keys - 1; j += 2 * thread_id.getBlockDimX()) { + if (comparison(keys[j + 1], keys[j])) { + swap(keys[j + 1], keys[j]); + sorted = false; + } + } + + barrier.blockBarrier(); + + for (uint32_t j = 2 * thread_id.getLocalThreadIdX(); j < num_keys - 1; + j += 2 * thread_id.getBlockDimX()) { + if (comparison(keys[j + 1], keys[j])) { + swap(keys[j + 1], keys[j]); + sorted = false; + } + } + } while (barrier.blockOr(!sorted)); +} +} // namespace traccc::device diff --git a/tests/cuda/CMakeLists.txt b/tests/cuda/CMakeLists.txt index 6b53c7bfa4..458592d5a5 100644 --- a/tests/cuda/CMakeLists.txt +++ b/tests/cuda/CMakeLists.txt @@ -44,6 +44,7 @@ traccc_add_test( test_unique_lock.cu test_sanity_contiguous_on.cu test_sanity_ordered_on.cu + test_sort.cu LINK_LIBRARIES CUDA::cudart diff --git a/tests/cuda/test_sort.cu b/tests/cuda/test_sort.cu new file mode 100644 index 0000000000..dc657070a8 --- /dev/null +++ b/tests/cuda/test_sort.cu @@ -0,0 +1,46 @@ +/** + * traccc library, part of the ACTS project (R&D line) + * + * (c) 2024 CERN for the benefit of the ACTS project + * + * Mozilla Public License Version 2.0 + */ + +#include + +#include +#include + +#include "../../cuda/src/utils/barrier.hpp" +#include "traccc/cuda/utils/thread_id.hpp" +#include "traccc/device/sort.hpp" + +__global__ void testBlockSortKernel(uint32_t *keys, uint32_t n_keys) { + traccc::cuda::thread_id1 thread_id; + traccc::cuda::barrier barrier; + traccc::device::blockOddEvenSort(thread_id, barrier, keys, n_keys, + std::less()); +} + +TEST(CUDASort, BlockOddEvenSort) { + vecmem::cuda::managed_memory_resource mr; + + uint32_t n = 2803; + vecmem::unique_alloc_ptr arr = + vecmem::make_unique_alloc(mr, n); + + // As long as 13 and n_keys are coprime, this will generate a big, + // non-sorted array containing every element. + for (uint32_t i = 0; i < n; i++) { + arr[i] = (13 * 500 * i) % n; + } + + testBlockSortKernel<<<1, 1024u>>>(arr.get(), n); + + ASSERT_EQ(cudaPeekAtLastError(), cudaSuccess); + ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess); + + for (uint32_t i = 0; i < n; ++i) { + ASSERT_EQ(arr[i], i); + } +} diff --git a/tests/sycl/CMakeLists.txt b/tests/sycl/CMakeLists.txt index 931310d20c..63dbf9ff43 100644 --- a/tests/sycl/CMakeLists.txt +++ b/tests/sycl/CMakeLists.txt @@ -22,6 +22,7 @@ traccc_add_test( test_cca.sycl test_sanity_contiguous_on.sycl test_sanity_ordered_on.sycl + test_sort.sycl LINK_LIBRARIES GTest::gtest_main diff --git a/tests/sycl/test_sort.sycl b/tests/sycl/test_sort.sycl new file mode 100644 index 0000000000..eb4a4ef724 --- /dev/null +++ b/tests/sycl/test_sort.sycl @@ -0,0 +1,51 @@ +/** + * traccc library, part of the ACTS project (R&D line) + * + * (c) 2024 CERN for the benefit of the ACTS project + * + * Mozilla Public License Version 2.0 + */ + +#include + +#include +#include +#include + +#include "../../sycl/src/utils/barrier.hpp" +#include "traccc/device/sort.hpp" +#include "traccc/sycl/utils/thread_id.hpp" + +TEST(SYCLSort, BlockOddEvenSort) { + vecmem::sycl::shared_memory_resource mr; + cl::sycl::queue queue; + + uint32_t n = 2803; + vecmem::unique_alloc_ptr arr = + vecmem::make_unique_alloc(mr, n); + + // As long as 13 and n_keys are coprime, this will generate a big, + // non-sorted array containing every element. + for (uint32_t i = 0; i < n; i++) { + arr[i] = (13 * 500 * i) % n; + } + + cl::sycl::nd_range test_range(cl::sycl::range<1>(128), + cl::sycl::range<1>(128)); + + queue + .submit([&, keys = arr.get()](cl::sycl::handler &h) { + h.parallel_for( + test_range, [=](cl::sycl::nd_item<1> item) { + traccc::sycl::thread_id1 thread_id(item); + traccc::sycl::barrier barrier(item); + traccc::device::blockOddEvenSort(thread_id, barrier, keys, + n, std::less()); + }); + }) + .wait_and_throw(); + + for (uint32_t i = 0; i < n; ++i) { + ASSERT_EQ(arr[i], i); + } +} From 3c9c6138b2458034d6c231eafcd0f33dac86be9a Mon Sep 17 00:00:00 2001 From: Stephen Nicholas Swatman Date: Fri, 2 Aug 2024 13:34:37 +0200 Subject: [PATCH 2/3] Move shared CCL variables into single struct The current CCL kernels have so many parameters that it's a real pain in the rear to maintain them and to make changes to them. This commit reduces the number of parameters a little bit by taking all statically-known shared memory data and unifying it into a single struct which can be passed around more easily. --- .../clusterization_algorithm.cpp | 12 ++++------- .../clusterization/device/ccl_kernel.hpp | 9 ++++++++- .../clusterization/device/impl/ccl_kernel.ipp | 14 ++++++------- .../clusterization_algorithm.cu | 8 +++----- .../clusterization_algorithm.sycl | 20 +++++++++---------- 5 files changed, 31 insertions(+), 32 deletions(-) diff --git a/device/alpaka/src/clusterization/clusterization_algorithm.cpp b/device/alpaka/src/clusterization/clusterization_algorithm.cpp index 6e5f116bc9..89617fa39b 100644 --- a/device/alpaka/src/clusterization/clusterization_algorithm.cpp +++ b/device/alpaka/src/clusterization/clusterization_algorithm.cpp @@ -38,11 +38,8 @@ struct CCLKernel { traccc::alpaka::thread_id1 thread_id(acc); - auto& partition_start = - ::alpaka::declareSharedVar(acc); - auto& partition_end = - ::alpaka::declareSharedVar(acc); - auto& outi = ::alpaka::declareSharedVar(acc); + auto& shared = ::alpaka::declareSharedVar< + device::details::ccl_kernel_static_smem_parcel, __COUNTER__>(acc); device::details::index_t* const shared_v = ::alpaka::getDynSharedMem(acc); @@ -56,9 +53,8 @@ struct CCLKernel { alpaka::barrier barry_r(&acc); - device::ccl_kernel(cfg, thread_id, cells_view, modules_view, - partition_start, partition_end, outi, f_view, - gf_view, f_backup_view, gf_backup_view, + device::ccl_kernel(cfg, thread_id, cells_view, modules_view, shared, + f_view, gf_view, f_backup_view, gf_backup_view, adjc_backup_view, adjv_backup_view, backup_mutex, barry_r, measurements_view, cell_links); } diff --git a/device/common/include/traccc/clusterization/device/ccl_kernel.hpp b/device/common/include/traccc/clusterization/device/ccl_kernel.hpp index 58d709ee42..f9756b01aa 100644 --- a/device/common/include/traccc/clusterization/device/ccl_kernel.hpp +++ b/device/common/include/traccc/clusterization/device/ccl_kernel.hpp @@ -26,6 +26,13 @@ #include namespace traccc::device { +namespace details { +struct ccl_kernel_static_smem_parcel { + std::size_t partition_start; + std::size_t partition_end; + uint32_t outi; +}; +} // namespace details /// Function which reads raw detector cells and turns them into measurements. /// @@ -59,7 +66,7 @@ TRACCC_DEVICE inline void ccl_kernel( const clustering_config cfg, const thread_id_t& thread_id, const cell_collection_types::const_view cells_view, const cell_module_collection_types::const_view modules_view, - std::size_t& partition_start, std::size_t& partition_end, std::size_t& outi, + details::ccl_kernel_static_smem_parcel& smem, vecmem::data::vector_view f_view, vecmem::data::vector_view gf_view, vecmem::data::vector_view f_backup_view, diff --git a/device/common/include/traccc/clusterization/device/impl/ccl_kernel.ipp b/device/common/include/traccc/clusterization/device/impl/ccl_kernel.ipp index 773261b344..b133d043ec 100644 --- a/device/common/include/traccc/clusterization/device/impl/ccl_kernel.ipp +++ b/device/common/include/traccc/clusterization/device/impl/ccl_kernel.ipp @@ -211,7 +211,7 @@ TRACCC_DEVICE inline void ccl_kernel( const clustering_config cfg, const thread_id_t& thread_id, const cell_collection_types::const_view cells_view, const cell_module_collection_types::const_view modules_view, - std::size_t& partition_start, std::size_t& partition_end, std::size_t& outi, + details::ccl_kernel_static_smem_parcel& smem, vecmem::data::vector_view f_view, vecmem::data::vector_view gf_view, vecmem::data::vector_view f_backup_view, @@ -252,7 +252,7 @@ TRACCC_DEVICE inline void ccl_kernel( assert(start < num_cells); std::size_t end = std::min(num_cells, start + cfg.target_partition_size()); - outi = 0; + smem.outi = 0; /* * Next, shift the starting point to a position further in the @@ -280,9 +280,9 @@ TRACCC_DEVICE inline void ccl_kernel( cells_device[end - 1].channel1 + 1) { ++end; } - partition_start = start; - partition_end = end; - assert(partition_start <= partition_end); + smem.partition_start = start; + smem.partition_end = end; + assert(smem.partition_start <= smem.partition_end); } barrier.blockBarrier(); @@ -303,7 +303,7 @@ TRACCC_DEVICE inline void ccl_kernel( // into a return. As such, we cannot use returns in this kernel. // Get partition for this thread group - const details::index_t size = partition_end - partition_start; + const details::index_t size = smem.partition_end - smem.partition_start; // If the size is zero, we can just retire the whole block. if (size == 0) { @@ -342,7 +342,7 @@ TRACCC_DEVICE inline void ccl_kernel( use_scratch = false; } - ccl_core(thread_id, partition_start, partition_end, + ccl_core(thread_id, smem.partition_start, smem.partition_end, use_scratch ? f_backup : f_primary, use_scratch ? gf_backup : gf_primary, cell_links, adjv, adjc, cells_device, modules_device, measurements_device, barrier); diff --git a/device/cuda/src/clusterization/clusterization_algorithm.cu b/device/cuda/src/clusterization/clusterization_algorithm.cu index 75304bd0a3..2c74a0730d 100644 --- a/device/cuda/src/clusterization/clusterization_algorithm.cu +++ b/device/cuda/src/clusterization/clusterization_algorithm.cu @@ -42,8 +42,7 @@ __global__ void ccl_kernel( vecmem::data::vector_view adjv_backup_view, unsigned int* backup_mutex_ptr) { - __shared__ std::size_t partition_start, partition_end; - __shared__ std::size_t outi; + __shared__ device::details::ccl_kernel_static_smem_parcel shared; extern __shared__ device::details::index_t shared_v[]; vecmem::device_atomic_ref backup_mutex(*backup_mutex_ptr); @@ -58,9 +57,8 @@ __global__ void ccl_kernel( traccc::cuda::barrier barry_r; const cuda::thread_id1 thread_id; - device::ccl_kernel(cfg, thread_id, cells_view, modules_view, - partition_start, partition_end, outi, f_view, gf_view, - f_backup_view, gf_backup_view, adjc_backup_view, + device::ccl_kernel(cfg, thread_id, cells_view, modules_view, shared, f_view, + gf_view, f_backup_view, gf_backup_view, adjc_backup_view, adjv_backup_view, backup_mutex, barry_r, measurements_view, cell_links); } diff --git a/device/sycl/src/clusterization/clusterization_algorithm.sycl b/device/sycl/src/clusterization/clusterization_algorithm.sycl index b1ef6f01d8..84c147a59e 100644 --- a/device/sycl/src/clusterization/clusterization_algorithm.sycl +++ b/device/sycl/src/clusterization/clusterization_algorithm.sycl @@ -117,14 +117,16 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()( details::get_queue(m_queue) .submit([&](::sycl::handler& h) { // Allocate shared memory for the kernel. - vecmem::sycl::local_accessor shared_uint(3, h); + vecmem::sycl::local_accessor< + device::details::ccl_kernel_static_smem_parcel> + shared(1, h); vecmem::sycl::local_accessor shared_idx( 2 * m_config.max_partition_size(), h); // Launch the kernel. h.parallel_for( cclKernelRange, - [shared_uint, shared_idx, cells_view, modules_view, + [shared, shared_idx, cells_view, modules_view, measurements_view, cell_links_view, f_backup_view = vecmem::get_data(m_f_backup), gf_backup_view = vecmem::get_data(m_gf_backup), @@ -139,9 +141,6 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()( vecmem::data::vector_view gf_view{ static_cast(cfg.max_partition_size()), &shared_idx[cfg.max_partition_size()]}; - std::size_t& partition_start = shared_uint[0]; - std::size_t& partition_end = shared_uint[1]; - std::size_t& outi = shared_uint[2]; // Mutex for scratch space vecmem::device_atomic_ref backup_mutex( @@ -152,12 +151,11 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()( const sycl::thread_id1 thread_id(item); // Run the algorithm for this thread. - device::ccl_kernel(cfg, thread_id, cells_view, modules_view, - partition_start, partition_end, outi, - f_view, gf_view, f_backup_view, - gf_backup_view, adjc_backup_view, - adjv_backup_view, backup_mutex, barry_r, - measurements_view, cell_links_view); + device::ccl_kernel( + cfg, thread_id, cells_view, modules_view, shared[0], + f_view, gf_view, f_backup_view, gf_backup_view, + adjc_backup_view, adjv_backup_view, backup_mutex, + barry_r, measurements_view, cell_links_view); }); }) .wait_and_throw(); From 9b6e5daf83257996e3032fa8d16ed0c173af60de Mon Sep 17 00:00:00 2001 From: Stephen Nicholas Swatman Date: Tue, 11 Jun 2024 18:22:38 +0200 Subject: [PATCH 3/3] Ensure that CCL output is contiguous on modules Right now we are using a sorting algorithm to ensure that the entire array of measurements is contiguous in global memory, but this isn't strictly necessary. This commit alters the CCL algorithm slightly to guarantee that the output is always contiguous. --- .../clusterization/device/impl/ccl_kernel.ipp | 151 +++++++++++++----- .../clusterization_algorithm.cu | 6 + .../clusterization_algorithm.sycl | 3 + 3 files changed, 122 insertions(+), 38 deletions(-) diff --git a/device/common/include/traccc/clusterization/device/impl/ccl_kernel.ipp b/device/common/include/traccc/clusterization/device/impl/ccl_kernel.ipp index b133d043ec..b5bb18aadb 100644 --- a/device/common/include/traccc/clusterization/device/impl/ccl_kernel.ipp +++ b/device/common/include/traccc/clusterization/device/impl/ccl_kernel.ipp @@ -11,17 +11,19 @@ #include "traccc/clusterization/clustering_config.hpp" #include "traccc/clusterization/device/aggregate_cluster.hpp" +#include "traccc/clusterization/device/ccl_kernel.hpp" #include "traccc/clusterization/device/ccl_kernel_definitions.hpp" #include "traccc/clusterization/device/reduce_problem_cell.hpp" #include "traccc/device/concepts/barrier.hpp" #include "traccc/device/concepts/thread_id.hpp" #include "traccc/device/mutex.hpp" +#include "traccc/device/sort.hpp" #include "traccc/device/unique_lock.hpp" #include "traccc/edm/cell.hpp" +#include "traccc/edm/measurement.hpp" #include "vecmem/memory/device_atomic_ref.hpp" namespace traccc::device { - /// Implementation of a FastSV algorithm with the following steps: /// 1) mix of stochastic and aggressive hooking /// 2) shortcutting @@ -136,15 +138,15 @@ TRACCC_DEVICE void fast_sv_1(const thread_id_t& thread_id, template TRACCC_DEVICE inline void ccl_core( - const thread_id_t& thread_id, std::size_t& partition_start, - std::size_t& partition_end, vecmem::device_vector f, + const thread_id_t& thread_id, details::ccl_kernel_static_smem_parcel& smem, + vecmem::device_vector f, vecmem::device_vector gf, vecmem::data::vector_view cell_links, details::index_t* adjv, unsigned char* adjc, const cell_collection_types::const_device cells_device, const cell_module_collection_types::const_device modules_device, measurement_collection_types::device measurements_device, barrier_t& barrier) { - const details::index_t size = partition_end - partition_start; + const details::index_t size = smem.partition_end - smem.partition_start; assert(size <= f.size()); assert(size <= gf.size()); @@ -160,8 +162,8 @@ TRACCC_DEVICE inline void ccl_core( const details::index_t cid = tst * thread_id.getBlockDimX() + thread_id.getLocalThreadIdX(); adjc[tst] = 0; - reduce_problem_cell(cells_device, cid, partition_start, partition_end, - adjc[tst], &adjv[8 * tst]); + reduce_problem_cell(cells_device, cid, smem.partition_start, + smem.partition_end, adjc[tst], &adjv[8 * tst]); } for (details::index_t tst = 0; tst < thread_cell_count; ++tst) { @@ -189,20 +191,96 @@ TRACCC_DEVICE inline void ccl_core( barrier.blockBarrier(); + /* + * We will now repurpose the `gf` shared vector to store the cells which + * are parents. For the time being, `outi` will be the size of `gf`. + */ + if (thread_id.getLocalThreadIdX() == 0) { + smem.outi = 0; + } + + barrier.blockBarrier(); + + /* + * We now collect the parents into the `gf` array. + */ for (details::index_t tst = 0; tst < thread_cell_count; ++tst) { const details::index_t cid = tst * thread_id.getBlockDimX() + thread_id.getLocalThreadIdX(); + if (f.at(cid) == cid) { - // Add a new measurement to the output buffer. Remembering its - // position inside of the container. - const measurement_collection_types::device::size_type meas_pos = - measurements_device.push_back({}); - // Set up the measurement under the appropriate index. - aggregate_cluster( - cells_device, modules_device, f, partition_start, partition_end, - cid, measurements_device.at(meas_pos), cell_links, meas_pos); + vecmem::device_atomic_ref + atom(smem.outi); + gf.at(atom.fetch_add(1)) = cid; + } + } + + barrier.blockBarrier(); + + uint32_t n_measurements = smem.outi; + + barrier.blockBarrier(); + + /* + * The lead threat resizes the measurement vector to contain the total + * number of measurements. The `outi` variable is now repurposed to contain + * the start of this block's region in the output array. + */ + if (thread_id.getLocalThreadIdX() == 0) { + smem.outi = measurements_device.bulk_append_implicit(n_measurements); + } + + barrier.blockBarrier(); + + /* + * Sort the measurements so that the modules are ordered, which guarantees + * that they will be contiguous. + * + * TODO: Can this be done more cleverly? Perhaps while collecting the + * elements into `gf`. + */ + blockOddEvenSort( + thread_id, barrier, gf.data(), n_measurements, + [&](const unsigned short lhs, const unsigned short rhs) { + return cells_device.at(smem.partition_start + f.at(lhs)) + .module_link < + cells_device.at(smem.partition_start + f.at(rhs)) + .module_link; + }); + + barrier.blockBarrier(); + + /* + * Aggregate the clusters and write them to global memory. + */ + for (uint32_t i = thread_id.getLocalThreadIdX(); i < n_measurements; + i += thread_id.getBlockDimX()) { + uint32_t meas_pos = smem.outi + i; + // Set up the measurement under the appropriate index. + aggregate_cluster(cells_device, modules_device, f, smem.partition_start, + smem.partition_end, gf.at(i), + measurements_device.at(meas_pos), cell_links, + meas_pos); + } +} + +TRACCC_DEVICE inline std::size_t ccl_partition_find_helper( + const cell_collection_types::const_device cells, std::size_t begin, + std::size_t target_end) { + bool found_good = false; + std::size_t last_good; + + for (std::size_t i = begin; i < target_end || !found_good; ++i) { + if (i == cells.size() || + cells.at(i - 1).module_link != cells.at(i).module_link) { + found_good = true; + last_good = i; } } + + assert(found_good); + return last_good; } template mutex(backup_mutex); unique_lock lock(mutex, std::defer_lock); - const std::size_t num_cells = cells_device.size(); - /* * First, we determine the exact range of cells that is to be examined * by this block of threads. We start from an initial range determined @@ -247,25 +323,31 @@ TRACCC_DEVICE inline void ccl_kernel( * amounts. */ if (thread_id.getLocalThreadIdX() == 0) { - std::size_t start = - thread_id.getBlockIdX() * cfg.target_partition_size(); - assert(start < num_cells); - std::size_t end = - std::min(num_cells, start + cfg.target_partition_size()); + const std::size_t num_cells = cells_device.size(); + smem.outi = 0; + std::size_t boundary_beg = + std::max(1lu, std::min(num_cells, thread_id.getBlockIdX() * + cfg.target_partition_size())); + std::size_t boundary_mid = + std::min(num_cells, (thread_id.getBlockIdX() + 1) * + cfg.target_partition_size()); + std::size_t boundary_end = + std::min(num_cells, (thread_id.getBlockIdX() + 2) * + cfg.target_partition_size()); + /* * Next, shift the starting point to a position further in the * array; the purpose of this is to ensure that we are not operating * on any cells that have been claimed by the previous block (if * any). */ - while (start != 0 && - cells_device[start - 1].module_link == - cells_device[start].module_link && - cells_device[start].channel1 <= - cells_device[start - 1].channel1 + 1) { - ++start; + if (thread_id.getBlockIdX() == 0) { + smem.partition_start = 0; + } else { + smem.partition_start = ccl_partition_find_helper( + cells_device, boundary_beg, boundary_mid); } /* @@ -273,15 +355,9 @@ TRACCC_DEVICE inline void ccl_kernel( * current block to ensure that we do not end our partition on a * cell that is not a possible boundary! */ - while (end < num_cells && - cells_device[end - 1].module_link == - cells_device[end].module_link && - cells_device[end].channel1 <= - cells_device[end - 1].channel1 + 1) { - ++end; - } - smem.partition_start = start; - smem.partition_end = end; + smem.partition_end = + ccl_partition_find_helper(cells_device, boundary_mid, boundary_end); + assert(smem.partition_start <= smem.partition_end); } @@ -342,8 +418,7 @@ TRACCC_DEVICE inline void ccl_kernel( use_scratch = false; } - ccl_core(thread_id, smem.partition_start, smem.partition_end, - use_scratch ? f_backup : f_primary, + ccl_core(thread_id, smem, use_scratch ? f_backup : f_primary, use_scratch ? gf_backup : gf_primary, cell_links, adjv, adjc, cells_device, modules_device, measurements_device, barrier); diff --git a/device/cuda/src/clusterization/clusterization_algorithm.cu b/device/cuda/src/clusterization/clusterization_algorithm.cu index 2c74a0730d..57479aa054 100644 --- a/device/cuda/src/clusterization/clusterization_algorithm.cu +++ b/device/cuda/src/clusterization/clusterization_algorithm.cu @@ -138,6 +138,12 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()( m_gf_backup, m_adjc_backup, m_adjv_backup, m_backup_mutex.get()); TRACCC_CUDA_ERROR_CHECK(cudaGetLastError()); +#ifndef NDEBUG + TRACCC_CUDA_ERROR_CHECK(cudaStreamSynchronize(stream)); + assert(is_contiguous_on(measurement_module_projection(), m_mr.main, m_copy, + m_stream, measurements)); +#endif + // Return the reconstructed measurements. return measurements; } diff --git a/device/sycl/src/clusterization/clusterization_algorithm.sycl b/device/sycl/src/clusterization/clusterization_algorithm.sycl index 84c147a59e..96d7e63feb 100644 --- a/device/sycl/src/clusterization/clusterization_algorithm.sycl +++ b/device/sycl/src/clusterization/clusterization_algorithm.sycl @@ -160,6 +160,9 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()( }) .wait_and_throw(); + assert(is_contiguous_on(measurement_module_projection(), m_mr.main, m_copy, + m_queue, measurements)); + // Return the reconstructed measurements. return measurements; }