From e32233f5517ce5f6839510ff5fa4cb517db93ab4 Mon Sep 17 00:00:00 2001
From: Stephen Nicholas Swatman <stephen.nicholas.swatman@cern.ch>
Date: Thu, 16 Mar 2023 13:52:46 +0100
Subject: [PATCH 1/3] Implement block-wide odd-even sort

Sorting small arrays is a relatively common problem in GPGPU
programming. Many useful algorithms exist, and some are provided by
libraries like CUB. An algorithm close to my heart is odd-even sort
because it is exceedingly simply, relatively efficient for small arrays
and, importantly, it uses O(1) space. This commit adds new
implementations of this sorting algorithm for block-wide odd-even sort
in a portable way.
---
 device/common/include/traccc/device/sort.hpp | 86 ++++++++++++++++++++
 tests/cuda/CMakeLists.txt                    |  1 +
 tests/cuda/test_sort.cu                      | 46 +++++++++++
 tests/sycl/CMakeLists.txt                    |  1 +
 tests/sycl/test_sort.sycl                    | 51 ++++++++++++
 5 files changed, 185 insertions(+)
 create mode 100644 device/common/include/traccc/device/sort.hpp
 create mode 100644 tests/cuda/test_sort.cu
 create mode 100644 tests/sycl/test_sort.sycl
diff --git a/device/common/include/traccc/device/sort.hpp b/device/common/include/traccc/device/sort.hpp
new file mode 100644
index 0000000000..ecfda6d112
--- /dev/null
+++ b/device/common/include/traccc/device/sort.hpp
@@ -0,0 +1,86 @@
+/**
+ * traccc library, part of the ACTS project (R&D line)
+ *
+ * (c) 2024 CERN for the benefit of the ACTS project
+ *
+ * Mozilla Public License Version 2.0
+ */
+
+#pragma once
+
+#include <cstdint>
+#include <vecmem/memory/device_atomic_ref.hpp>
+
+#include "traccc/definitions/qualifiers.hpp"
+#include "traccc/device/concepts/barrier.hpp"
+#include "traccc/device/concepts/thread_id.hpp"
+
+namespace traccc::device {
+/**
+ * @brief Swap two values of arbitrary type.
+ *
+ * @tparam T The type of values to swap.
+ *
+ * @param a The first object in the swap (will take the value of b).
+ * @param b The second object in the swap (will take the value of a).
+ */
+template <std::movable T>
+TRACCC_DEVICE void swap(T& a, T& b) {
+    T t = std::move(a);
+    a = std::move(b);
+    b = std::move(t);
+}
+
+/**
+ * @brief Perform a block-wide odd-even key sorting.
+ *
+ * This function performs a sorting operation across the entire block, assuming
+ * that all the threads in the block are currently active.
+ *
+ * @warning The behaviour of this function is ill-defined if any of the threads
+ * in the block have exited.
+ *
+ * @warning This method is efficient for sorting small arrays, preferably in
+ * shared memory, but given the O(n^2) worst-case performance this should not
+ * be used on larger arrays.
+ *
+ * @tparam T The thread identifier type.
+ * @tparam B The barrier type
+ * @tparam K The type of keys to sort.
+ * @tparam C The type of the comparison function.
+ *
+ * @param thread_id The thread identifier object.
+ * @param barrier The barrier to use for block synchronization.
+ * @param keys An array of keys to sort.
+ * @param num_keys The number of keys in the array to sort.
+ * @param comparison A comparison function.
+ */
+template <concepts::thread_id1 T, concepts::barrier B, std::movable K,
+          std::strict_weak_order<K, K> C>
+TRACCC_DEVICE void blockOddEvenSort(T& thread_id, B& barrier, K* keys,
+                                    uint32_t num_keys, C&& comparison) {
+    bool sorted;
+
+    do {
+        sorted = true;
+
+        for (uint32_t j = 2 * thread_id.getLocalThreadIdX() + 1;
+             j < num_keys - 1; j += 2 * thread_id.getBlockDimX()) {
+            if (comparison(keys[j + 1], keys[j])) {
+                swap(keys[j + 1], keys[j]);
+                sorted = false;
+            }
+        }
+
+        barrier.blockBarrier();
+
+        for (uint32_t j = 2 * thread_id.getLocalThreadIdX(); j < num_keys - 1;
+             j += 2 * thread_id.getBlockDimX()) {
+            if (comparison(keys[j + 1], keys[j])) {
+                swap(keys[j + 1], keys[j]);
+                sorted = false;
+            }
+        }
+    } while (barrier.blockOr(!sorted));
+}
+}  // namespace traccc::device
diff --git a/tests/cuda/CMakeLists.txt b/tests/cuda/CMakeLists.txt
index 6b53c7bfa4..458592d5a5 100644
--- a/tests/cuda/CMakeLists.txt
+++ b/tests/cuda/CMakeLists.txt
@@ -44,6 +44,7 @@ traccc_add_test(
     test_unique_lock.cu
     test_sanity_contiguous_on.cu
     test_sanity_ordered_on.cu
+    test_sort.cu
 
     LINK_LIBRARIES
     CUDA::cudart
diff --git a/tests/cuda/test_sort.cu b/tests/cuda/test_sort.cu
new file mode 100644
index 0000000000..dc657070a8
--- /dev/null
+++ b/tests/cuda/test_sort.cu
@@ -0,0 +1,46 @@
+/**
+ * traccc library, part of the ACTS project (R&D line)
+ *
+ * (c) 2024 CERN for the benefit of the ACTS project
+ *
+ * Mozilla Public License Version 2.0
+ */
+
+#include <gtest/gtest.h>
+
+#include <vecmem/memory/cuda/managed_memory_resource.hpp>
+#include <vecmem/memory/unique_ptr.hpp>
+
+#include "../../cuda/src/utils/barrier.hpp"
+#include "traccc/cuda/utils/thread_id.hpp"
+#include "traccc/device/sort.hpp"
+
+__global__ void testBlockSortKernel(uint32_t *keys, uint32_t n_keys) {
+    traccc::cuda::thread_id1 thread_id;
+    traccc::cuda::barrier barrier;
+    traccc::device::blockOddEvenSort(thread_id, barrier, keys, n_keys,
+                                     std::less<uint32_t>());
+}
+
+TEST(CUDASort, BlockOddEvenSort) {
+    vecmem::cuda::managed_memory_resource mr;
+
+    uint32_t n = 2803;
+    vecmem::unique_alloc_ptr<uint32_t[]> arr =
+        vecmem::make_unique_alloc<uint32_t[]>(mr, n);
+
+    // As long as 13 and n_keys are coprime, this will generate a big,
+    // non-sorted array containing every element.
+    for (uint32_t i = 0; i < n; i++) {
+        arr[i] = (13 * 500 * i) % n;
+    }
+
+    testBlockSortKernel<<<1, 1024u>>>(arr.get(), n);
+
+    ASSERT_EQ(cudaPeekAtLastError(), cudaSuccess);
+    ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess);
+
+    for (uint32_t i = 0; i < n; ++i) {
+        ASSERT_EQ(arr[i], i);
+    }
+}
diff --git a/tests/sycl/CMakeLists.txt b/tests/sycl/CMakeLists.txt
index 931310d20c..63dbf9ff43 100644
--- a/tests/sycl/CMakeLists.txt
+++ b/tests/sycl/CMakeLists.txt
@@ -22,6 +22,7 @@ traccc_add_test(
     test_cca.sycl
     test_sanity_contiguous_on.sycl
     test_sanity_ordered_on.sycl
+    test_sort.sycl
 
     LINK_LIBRARIES
     GTest::gtest_main
diff --git a/tests/sycl/test_sort.sycl b/tests/sycl/test_sort.sycl
new file mode 100644
index 0000000000..eb4a4ef724
--- /dev/null
+++ b/tests/sycl/test_sort.sycl
@@ -0,0 +1,51 @@
+/**
+ * traccc library, part of the ACTS project (R&D line)
+ *
+ * (c) 2024 CERN for the benefit of the ACTS project
+ *
+ * Mozilla Public License Version 2.0
+ */
+
+#include <gtest/gtest.h>
+
+#include <CL/sycl.hpp>
+#include <vecmem/memory/sycl/shared_memory_resource.hpp>
+#include <vecmem/memory/unique_ptr.hpp>
+
+#include "../../sycl/src/utils/barrier.hpp"
+#include "traccc/device/sort.hpp"
+#include "traccc/sycl/utils/thread_id.hpp"
+
+TEST(SYCLSort, BlockOddEvenSort) {
+    vecmem::sycl::shared_memory_resource mr;
+    cl::sycl::queue queue;
+
+    uint32_t n = 2803;
+    vecmem::unique_alloc_ptr<uint32_t[]> arr =
+        vecmem::make_unique_alloc<uint32_t[]>(mr, n);
+
+    // As long as 13 and n_keys are coprime, this will generate a big,
+    // non-sorted array containing every element.
+    for (uint32_t i = 0; i < n; i++) {
+        arr[i] = (13 * 500 * i) % n;
+    }
+
+    cl::sycl::nd_range test_range(cl::sycl::range<1>(128),
+                                  cl::sycl::range<1>(128));
+
+    queue
+        .submit([&, keys = arr.get()](cl::sycl::handler &h) {
+            h.parallel_for<class BlockOddEvenSortKernel>(
+                test_range, [=](cl::sycl::nd_item<1> item) {
+                    traccc::sycl::thread_id1 thread_id(item);
+                    traccc::sycl::barrier barrier(item);
+                    traccc::device::blockOddEvenSort(thread_id, barrier, keys,
+                                                     n, std::less<uint32_t>());
+                });
+        })
+        .wait_and_throw();
+
+    for (uint32_t i = 0; i < n; ++i) {
+        ASSERT_EQ(arr[i], i);
+    }
+}

From 3c9c6138b2458034d6c231eafcd0f33dac86be9a Mon Sep 17 00:00:00 2001
From: Stephen Nicholas Swatman <stephen.nicholas.swatman@cern.ch>
Date: Fri, 2 Aug 2024 13:34:37 +0200
Subject: [PATCH 2/3] Move shared CCL variables into single struct

The current CCL kernels have so many parameters that it's a real pain in
the rear to maintain them and to make changes to them. This commit
reduces the number of parameters a little bit by taking all
statically-known shared memory data and unifying it into a single struct
which can be passed around more easily.
---
 .../clusterization_algorithm.cpp              | 12 ++++-------
 .../clusterization/device/ccl_kernel.hpp      |  9 ++++++++-
 .../clusterization/device/impl/ccl_kernel.ipp | 14 ++++++-------
 .../clusterization_algorithm.cu               |  8 +++-----
 .../clusterization_algorithm.sycl             | 20 +++++++++----------
 5 files changed, 31 insertions(+), 32 deletions(-)

diff --git a/device/alpaka/src/clusterization/clusterization_algorithm.cpp b/device/alpaka/src/clusterization/clusterization_algorithm.cpp
index 6e5f116bc9..89617fa39b 100644
--- a/device/alpaka/src/clusterization/clusterization_algorithm.cpp
+++ b/device/alpaka/src/clusterization/clusterization_algorithm.cpp
@@ -38,11 +38,8 @@ struct CCLKernel {
 
         traccc::alpaka::thread_id1 thread_id(acc);
 
-        auto& partition_start =
-            ::alpaka::declareSharedVar<std::size_t, __COUNTER__>(acc);
-        auto& partition_end =
-            ::alpaka::declareSharedVar<std::size_t, __COUNTER__>(acc);
-        auto& outi = ::alpaka::declareSharedVar<std::size_t, __COUNTER__>(acc);
+        auto& shared = ::alpaka::declareSharedVar<
+            device::details::ccl_kernel_static_smem_parcel, __COUNTER__>(acc);
 
         device::details::index_t* const shared_v =
             ::alpaka::getDynSharedMem<device::details::index_t>(acc);
@@ -56,9 +53,8 @@ struct CCLKernel {
 
         alpaka::barrier<TAcc> barry_r(&acc);
 
-        device::ccl_kernel(cfg, thread_id, cells_view, modules_view,
-                           partition_start, partition_end, outi, f_view,
-                           gf_view, f_backup_view, gf_backup_view,
+        device::ccl_kernel(cfg, thread_id, cells_view, modules_view, shared,
+                           f_view, gf_view, f_backup_view, gf_backup_view,
                            adjc_backup_view, adjv_backup_view, backup_mutex,
                            barry_r, measurements_view, cell_links);
     }
diff --git a/device/common/include/traccc/clusterization/device/ccl_kernel.hpp b/device/common/include/traccc/clusterization/device/ccl_kernel.hpp
index 58d709ee42..f9756b01aa 100644
--- a/device/common/include/traccc/clusterization/device/ccl_kernel.hpp
+++ b/device/common/include/traccc/clusterization/device/ccl_kernel.hpp
@@ -26,6 +26,13 @@
 #include <cstddef>
 
 namespace traccc::device {
+namespace details {
+struct ccl_kernel_static_smem_parcel {
+    std::size_t partition_start;
+    std::size_t partition_end;
+    uint32_t outi;
+};
+}  // namespace details
 
 /// Function which reads raw detector cells and turns them into measurements.
 ///
@@ -59,7 +66,7 @@ TRACCC_DEVICE inline void ccl_kernel(
     const clustering_config cfg, const thread_id_t& thread_id,
     const cell_collection_types::const_view cells_view,
     const cell_module_collection_types::const_view modules_view,
-    std::size_t& partition_start, std::size_t& partition_end, std::size_t& outi,
+    details::ccl_kernel_static_smem_parcel& smem,
     vecmem::data::vector_view<details::index_t> f_view,
     vecmem::data::vector_view<details::index_t> gf_view,
     vecmem::data::vector_view<details::index_t> f_backup_view,
diff --git a/device/common/include/traccc/clusterization/device/impl/ccl_kernel.ipp b/device/common/include/traccc/clusterization/device/impl/ccl_kernel.ipp
index 773261b344..b133d043ec 100644
--- a/device/common/include/traccc/clusterization/device/impl/ccl_kernel.ipp
+++ b/device/common/include/traccc/clusterization/device/impl/ccl_kernel.ipp
@@ -211,7 +211,7 @@ TRACCC_DEVICE inline void ccl_kernel(
     const clustering_config cfg, const thread_id_t& thread_id,
     const cell_collection_types::const_view cells_view,
     const cell_module_collection_types::const_view modules_view,
-    std::size_t& partition_start, std::size_t& partition_end, std::size_t& outi,
+    details::ccl_kernel_static_smem_parcel& smem,
     vecmem::data::vector_view<details::index_t> f_view,
     vecmem::data::vector_view<details::index_t> gf_view,
     vecmem::data::vector_view<details::index_t> f_backup_view,
@@ -252,7 +252,7 @@ TRACCC_DEVICE inline void ccl_kernel(
         assert(start < num_cells);
         std::size_t end =
             std::min(num_cells, start + cfg.target_partition_size());
-        outi = 0;
+        smem.outi = 0;
 
         /*
          * Next, shift the starting point to a position further in the
@@ -280,9 +280,9 @@ TRACCC_DEVICE inline void ccl_kernel(
                    cells_device[end - 1].channel1 + 1) {
             ++end;
         }
-        partition_start = start;
-        partition_end = end;
-        assert(partition_start <= partition_end);
+        smem.partition_start = start;
+        smem.partition_end = end;
+        assert(smem.partition_start <= smem.partition_end);
     }
 
     barrier.blockBarrier();
@@ -303,7 +303,7 @@ TRACCC_DEVICE inline void ccl_kernel(
     // into a return. As such, we cannot use returns in this kernel.
 
     // Get partition for this thread group
-    const details::index_t size = partition_end - partition_start;
+    const details::index_t size = smem.partition_end - smem.partition_start;
 
     // If the size is zero, we can just retire the whole block.
     if (size == 0) {
@@ -342,7 +342,7 @@ TRACCC_DEVICE inline void ccl_kernel(
         use_scratch = false;
     }
 
-    ccl_core(thread_id, partition_start, partition_end,
+    ccl_core(thread_id, smem.partition_start, smem.partition_end,
              use_scratch ? f_backup : f_primary,
              use_scratch ? gf_backup : gf_primary, cell_links, adjv, adjc,
              cells_device, modules_device, measurements_device, barrier);
diff --git a/device/cuda/src/clusterization/clusterization_algorithm.cu b/device/cuda/src/clusterization/clusterization_algorithm.cu
index 75304bd0a3..2c74a0730d 100644
--- a/device/cuda/src/clusterization/clusterization_algorithm.cu
+++ b/device/cuda/src/clusterization/clusterization_algorithm.cu
@@ -42,8 +42,7 @@ __global__ void ccl_kernel(
     vecmem::data::vector_view<device::details::index_t> adjv_backup_view,
     unsigned int* backup_mutex_ptr) {
 
-    __shared__ std::size_t partition_start, partition_end;
-    __shared__ std::size_t outi;
+    __shared__ device::details::ccl_kernel_static_smem_parcel shared;
     extern __shared__ device::details::index_t shared_v[];
     vecmem::device_atomic_ref<unsigned int> backup_mutex(*backup_mutex_ptr);
 
@@ -58,9 +57,8 @@ __global__ void ccl_kernel(
     traccc::cuda::barrier barry_r;
     const cuda::thread_id1 thread_id;
 
-    device::ccl_kernel(cfg, thread_id, cells_view, modules_view,
-                       partition_start, partition_end, outi, f_view, gf_view,
-                       f_backup_view, gf_backup_view, adjc_backup_view,
+    device::ccl_kernel(cfg, thread_id, cells_view, modules_view, shared, f_view,
+                       gf_view, f_backup_view, gf_backup_view, adjc_backup_view,
                        adjv_backup_view, backup_mutex, barry_r,
                        measurements_view, cell_links);
 }
diff --git a/device/sycl/src/clusterization/clusterization_algorithm.sycl b/device/sycl/src/clusterization/clusterization_algorithm.sycl
index b1ef6f01d8..84c147a59e 100644
--- a/device/sycl/src/clusterization/clusterization_algorithm.sycl
+++ b/device/sycl/src/clusterization/clusterization_algorithm.sycl
@@ -117,14 +117,16 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()(
     details::get_queue(m_queue)
         .submit([&](::sycl::handler& h) {
             // Allocate shared memory for the kernel.
-            vecmem::sycl::local_accessor<std::size_t> shared_uint(3, h);
+            vecmem::sycl::local_accessor<
+                device::details::ccl_kernel_static_smem_parcel>
+                shared(1, h);
             vecmem::sycl::local_accessor<device::details::index_t> shared_idx(
                 2 * m_config.max_partition_size(), h);
 
             // Launch the kernel.
             h.parallel_for<kernels::ccl_kernel>(
                 cclKernelRange,
-                [shared_uint, shared_idx, cells_view, modules_view,
+                [shared, shared_idx, cells_view, modules_view,
                  measurements_view, cell_links_view,
                  f_backup_view = vecmem::get_data(m_f_backup),
                  gf_backup_view = vecmem::get_data(m_gf_backup),
@@ -139,9 +141,6 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()(
                     vecmem::data::vector_view<device::details::index_t> gf_view{
                         static_cast<vector_size_t>(cfg.max_partition_size()),
                         &shared_idx[cfg.max_partition_size()]};
-                    std::size_t& partition_start = shared_uint[0];
-                    std::size_t& partition_end = shared_uint[1];
-                    std::size_t& outi = shared_uint[2];
 
                     // Mutex for scratch space
                     vecmem::device_atomic_ref<unsigned int> backup_mutex(
@@ -152,12 +151,11 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()(
                     const sycl::thread_id1 thread_id(item);
 
                     // Run the algorithm for this thread.
-                    device::ccl_kernel(cfg, thread_id, cells_view, modules_view,
-                                       partition_start, partition_end, outi,
-                                       f_view, gf_view, f_backup_view,
-                                       gf_backup_view, adjc_backup_view,
-                                       adjv_backup_view, backup_mutex, barry_r,
-                                       measurements_view, cell_links_view);
+                    device::ccl_kernel(
+                        cfg, thread_id, cells_view, modules_view, shared[0],
+                        f_view, gf_view, f_backup_view, gf_backup_view,
+                        adjc_backup_view, adjv_backup_view, backup_mutex,
+                        barry_r, measurements_view, cell_links_view);
                 });
         })
         .wait_and_throw();

From 9b6e5daf83257996e3032fa8d16ed0c173af60de Mon Sep 17 00:00:00 2001
From: Stephen Nicholas Swatman <stephen.nicholas.swatman@cern.ch>
Date: Tue, 11 Jun 2024 18:22:38 +0200
Subject: [PATCH 3/3] Ensure that CCL output is contiguous on modules

Right now we are using a sorting algorithm to ensure that the entire
array of measurements is contiguous in global memory, but this isn't
strictly necessary. This commit alters the CCL algorithm slightly to
guarantee that the output is always contiguous.
---
 .../clusterization/device/impl/ccl_kernel.ipp | 151 +++++++++++++-----
 .../clusterization_algorithm.cu               |   6 +
 .../clusterization_algorithm.sycl             |   3 +
 3 files changed, 122 insertions(+), 38 deletions(-)

diff --git a/device/common/include/traccc/clusterization/device/impl/ccl_kernel.ipp b/device/common/include/traccc/clusterization/device/impl/ccl_kernel.ipp
index b133d043ec..b5bb18aadb 100644
--- a/device/common/include/traccc/clusterization/device/impl/ccl_kernel.ipp
+++ b/device/common/include/traccc/clusterization/device/impl/ccl_kernel.ipp
@@ -11,17 +11,19 @@
 
 #include "traccc/clusterization/clustering_config.hpp"
 #include "traccc/clusterization/device/aggregate_cluster.hpp"
+#include "traccc/clusterization/device/ccl_kernel.hpp"
 #include "traccc/clusterization/device/ccl_kernel_definitions.hpp"
 #include "traccc/clusterization/device/reduce_problem_cell.hpp"
 #include "traccc/device/concepts/barrier.hpp"
 #include "traccc/device/concepts/thread_id.hpp"
 #include "traccc/device/mutex.hpp"
+#include "traccc/device/sort.hpp"
 #include "traccc/device/unique_lock.hpp"
 #include "traccc/edm/cell.hpp"
+#include "traccc/edm/measurement.hpp"
 #include "vecmem/memory/device_atomic_ref.hpp"
 
 namespace traccc::device {
-
 /// Implementation of a FastSV algorithm with the following steps:
 ///   1) mix of stochastic and aggressive hooking
 ///   2) shortcutting
@@ -136,15 +138,15 @@ TRACCC_DEVICE void fast_sv_1(const thread_id_t& thread_id,
 template <device::concepts::barrier barrier_t,
           device::concepts::thread_id1 thread_id_t>
 TRACCC_DEVICE inline void ccl_core(
-    const thread_id_t& thread_id, std::size_t& partition_start,
-    std::size_t& partition_end, vecmem::device_vector<details::index_t> f,
+    const thread_id_t& thread_id, details::ccl_kernel_static_smem_parcel& smem,
+    vecmem::device_vector<details::index_t> f,
     vecmem::device_vector<details::index_t> gf,
     vecmem::data::vector_view<unsigned int> cell_links, details::index_t* adjv,
     unsigned char* adjc, const cell_collection_types::const_device cells_device,
     const cell_module_collection_types::const_device modules_device,
     measurement_collection_types::device measurements_device,
     barrier_t& barrier) {
-    const details::index_t size = partition_end - partition_start;
+    const details::index_t size = smem.partition_end - smem.partition_start;
 
     assert(size <= f.size());
     assert(size <= gf.size());
@@ -160,8 +162,8 @@ TRACCC_DEVICE inline void ccl_core(
         const details::index_t cid =
             tst * thread_id.getBlockDimX() + thread_id.getLocalThreadIdX();
         adjc[tst] = 0;
-        reduce_problem_cell(cells_device, cid, partition_start, partition_end,
-                            adjc[tst], &adjv[8 * tst]);
+        reduce_problem_cell(cells_device, cid, smem.partition_start,
+                            smem.partition_end, adjc[tst], &adjv[8 * tst]);
     }
 
     for (details::index_t tst = 0; tst < thread_cell_count; ++tst) {
@@ -189,20 +191,96 @@ TRACCC_DEVICE inline void ccl_core(
 
     barrier.blockBarrier();
 
+    /*
+     * We will now repurpose the `gf` shared vector to store the cells which
+     * are parents. For the time being, `outi` will be the size of `gf`.
+     */
+    if (thread_id.getLocalThreadIdX() == 0) {
+        smem.outi = 0;
+    }
+
+    barrier.blockBarrier();
+
+    /*
+     * We now collect the parents into the `gf` array.
+     */
     for (details::index_t tst = 0; tst < thread_cell_count; ++tst) {
         const details::index_t cid =
             tst * thread_id.getBlockDimX() + thread_id.getLocalThreadIdX();
+
         if (f.at(cid) == cid) {
-            // Add a new measurement to the output buffer. Remembering its
-            // position inside of the container.
-            const measurement_collection_types::device::size_type meas_pos =
-                measurements_device.push_back({});
-            // Set up the measurement under the appropriate index.
-            aggregate_cluster(
-                cells_device, modules_device, f, partition_start, partition_end,
-                cid, measurements_device.at(meas_pos), cell_links, meas_pos);
+            vecmem::device_atomic_ref<unsigned int,
+                                      vecmem::device_address_space::local>
+                atom(smem.outi);
+            gf.at(atom.fetch_add(1)) = cid;
+        }
+    }
+
+    barrier.blockBarrier();
+
+    uint32_t n_measurements = smem.outi;
+
+    barrier.blockBarrier();
+
+    /*
+     * The lead threat resizes the measurement vector to contain the total
+     * number of measurements. The `outi` variable is now repurposed to contain
+     * the start of this block's region in the output array.
+     */
+    if (thread_id.getLocalThreadIdX() == 0) {
+        smem.outi = measurements_device.bulk_append_implicit(n_measurements);
+    }
+
+    barrier.blockBarrier();
+
+    /*
+     * Sort the measurements so that the modules are ordered, which guarantees
+     * that they will be contiguous.
+     *
+     * TODO: Can this be done more cleverly? Perhaps while collecting the
+     * elements into `gf`.
+     */
+    blockOddEvenSort(
+        thread_id, barrier, gf.data(), n_measurements,
+        [&](const unsigned short lhs, const unsigned short rhs) {
+            return cells_device.at(smem.partition_start + f.at(lhs))
+                       .module_link <
+                   cells_device.at(smem.partition_start + f.at(rhs))
+                       .module_link;
+        });
+
+    barrier.blockBarrier();
+
+    /*
+     * Aggregate the clusters and write them to global memory.
+     */
+    for (uint32_t i = thread_id.getLocalThreadIdX(); i < n_measurements;
+         i += thread_id.getBlockDimX()) {
+        uint32_t meas_pos = smem.outi + i;
+        // Set up the measurement under the appropriate index.
+        aggregate_cluster(cells_device, modules_device, f, smem.partition_start,
+                          smem.partition_end, gf.at(i),
+                          measurements_device.at(meas_pos), cell_links,
+                          meas_pos);
+    }
+}
+
+TRACCC_DEVICE inline std::size_t ccl_partition_find_helper(
+    const cell_collection_types::const_device cells, std::size_t begin,
+    std::size_t target_end) {
+    bool found_good = false;
+    std::size_t last_good;
+
+    for (std::size_t i = begin; i < target_end || !found_good; ++i) {
+        if (i == cells.size() ||
+            cells.at(i - 1).module_link != cells.at(i).module_link) {
+            found_good = true;
+            last_good = i;
         }
     }
+
+    assert(found_good);
+    return last_good;
 }
 
 template <device::concepts::barrier barrier_t,
@@ -236,8 +314,6 @@ TRACCC_DEVICE inline void ccl_kernel(
     mutex<uint32_t> mutex(backup_mutex);
     unique_lock lock(mutex, std::defer_lock);
 
-    const std::size_t num_cells = cells_device.size();
-
     /*
      * First, we determine the exact range of cells that is to be examined
      * by this block of threads. We start from an initial range determined
@@ -247,25 +323,31 @@ TRACCC_DEVICE inline void ccl_kernel(
      * amounts.
      */
     if (thread_id.getLocalThreadIdX() == 0) {
-        std::size_t start =
-            thread_id.getBlockIdX() * cfg.target_partition_size();
-        assert(start < num_cells);
-        std::size_t end =
-            std::min(num_cells, start + cfg.target_partition_size());
+        const std::size_t num_cells = cells_device.size();
+
         smem.outi = 0;
 
+        std::size_t boundary_beg =
+            std::max(1lu, std::min(num_cells, thread_id.getBlockIdX() *
+                                                  cfg.target_partition_size()));
+        std::size_t boundary_mid =
+            std::min(num_cells, (thread_id.getBlockIdX() + 1) *
+                                    cfg.target_partition_size());
+        std::size_t boundary_end =
+            std::min(num_cells, (thread_id.getBlockIdX() + 2) *
+                                    cfg.target_partition_size());
+
         /*
          * Next, shift the starting point to a position further in the
          * array; the purpose of this is to ensure that we are not operating
          * on any cells that have been claimed by the previous block (if
          * any).
          */
-        while (start != 0 &&
-               cells_device[start - 1].module_link ==
-                   cells_device[start].module_link &&
-               cells_device[start].channel1 <=
-                   cells_device[start - 1].channel1 + 1) {
-            ++start;
+        if (thread_id.getBlockIdX() == 0) {
+            smem.partition_start = 0;
+        } else {
+            smem.partition_start = ccl_partition_find_helper(
+                cells_device, boundary_beg, boundary_mid);
         }
 
         /*
@@ -273,15 +355,9 @@ TRACCC_DEVICE inline void ccl_kernel(
          * current block to ensure that we do not end our partition on a
          * cell that is not a possible boundary!
          */
-        while (end < num_cells &&
-               cells_device[end - 1].module_link ==
-                   cells_device[end].module_link &&
-               cells_device[end].channel1 <=
-                   cells_device[end - 1].channel1 + 1) {
-            ++end;
-        }
-        smem.partition_start = start;
-        smem.partition_end = end;
+        smem.partition_end =
+            ccl_partition_find_helper(cells_device, boundary_mid, boundary_end);
+
         assert(smem.partition_start <= smem.partition_end);
     }
 
@@ -342,8 +418,7 @@ TRACCC_DEVICE inline void ccl_kernel(
         use_scratch = false;
     }
 
-    ccl_core(thread_id, smem.partition_start, smem.partition_end,
-             use_scratch ? f_backup : f_primary,
+    ccl_core(thread_id, smem, use_scratch ? f_backup : f_primary,
              use_scratch ? gf_backup : gf_primary, cell_links, adjv, adjc,
              cells_device, modules_device, measurements_device, barrier);
 
diff --git a/device/cuda/src/clusterization/clusterization_algorithm.cu b/device/cuda/src/clusterization/clusterization_algorithm.cu
index 2c74a0730d..57479aa054 100644
--- a/device/cuda/src/clusterization/clusterization_algorithm.cu
+++ b/device/cuda/src/clusterization/clusterization_algorithm.cu
@@ -138,6 +138,12 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()(
         m_gf_backup, m_adjc_backup, m_adjv_backup, m_backup_mutex.get());
     TRACCC_CUDA_ERROR_CHECK(cudaGetLastError());
 
+#ifndef NDEBUG
+    TRACCC_CUDA_ERROR_CHECK(cudaStreamSynchronize(stream));
+    assert(is_contiguous_on(measurement_module_projection(), m_mr.main, m_copy,
+                            m_stream, measurements));
+#endif
+
     // Return the reconstructed measurements.
     return measurements;
 }
diff --git a/device/sycl/src/clusterization/clusterization_algorithm.sycl b/device/sycl/src/clusterization/clusterization_algorithm.sycl
index 84c147a59e..96d7e63feb 100644
--- a/device/sycl/src/clusterization/clusterization_algorithm.sycl
+++ b/device/sycl/src/clusterization/clusterization_algorithm.sycl
@@ -160,6 +160,9 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()(
         })
         .wait_and_throw();
 
+    assert(is_contiguous_on(measurement_module_projection(), m_mr.main, m_copy,
+                            m_queue, measurements));
+
     // Return the reconstructed measurements.
     return measurements;
 }