diff --git a/device/alpaka/src/clusterization/clusterization_algorithm.cpp b/device/alpaka/src/clusterization/clusterization_algorithm.cpp
index 6e5f116bc9..89617fa39b 100644
--- a/device/alpaka/src/clusterization/clusterization_algorithm.cpp
+++ b/device/alpaka/src/clusterization/clusterization_algorithm.cpp
@@ -38,11 +38,8 @@ struct CCLKernel {
 
         traccc::alpaka::thread_id1 thread_id(acc);
 
-        auto& partition_start =
-            ::alpaka::declareSharedVar<std::size_t, __COUNTER__>(acc);
-        auto& partition_end =
-            ::alpaka::declareSharedVar<std::size_t, __COUNTER__>(acc);
-        auto& outi = ::alpaka::declareSharedVar<std::size_t, __COUNTER__>(acc);
+        auto& shared = ::alpaka::declareSharedVar<
+            device::details::ccl_kernel_static_smem_parcel, __COUNTER__>(acc);
 
         device::details::index_t* const shared_v =
             ::alpaka::getDynSharedMem<device::details::index_t>(acc);
@@ -56,9 +53,8 @@ struct CCLKernel {
 
         alpaka::barrier<TAcc> barry_r(&acc);
 
-        device::ccl_kernel(cfg, thread_id, cells_view, modules_view,
-                           partition_start, partition_end, outi, f_view,
-                           gf_view, f_backup_view, gf_backup_view,
+        device::ccl_kernel(cfg, thread_id, cells_view, modules_view, shared,
+                           f_view, gf_view, f_backup_view, gf_backup_view,
                            adjc_backup_view, adjv_backup_view, backup_mutex,
                            barry_r, measurements_view, cell_links);
     }
diff --git a/device/common/include/traccc/clusterization/device/ccl_kernel.hpp b/device/common/include/traccc/clusterization/device/ccl_kernel.hpp
index 58d709ee42..f9756b01aa 100644
--- a/device/common/include/traccc/clusterization/device/ccl_kernel.hpp
+++ b/device/common/include/traccc/clusterization/device/ccl_kernel.hpp
@@ -26,6 +26,13 @@
 #include <cstddef>
 
 namespace traccc::device {
+namespace details {
+struct ccl_kernel_static_smem_parcel {
+    std::size_t partition_start;
+    std::size_t partition_end;
+    uint32_t outi;
+};
+}  // namespace details
 
 /// Function which reads raw detector cells and turns them into measurements.
 ///
@@ -59,7 +66,7 @@ TRACCC_DEVICE inline void ccl_kernel(
     const clustering_config cfg, const thread_id_t& thread_id,
     const cell_collection_types::const_view cells_view,
     const cell_module_collection_types::const_view modules_view,
-    std::size_t& partition_start, std::size_t& partition_end, std::size_t& outi,
+    details::ccl_kernel_static_smem_parcel& smem,
     vecmem::data::vector_view<details::index_t> f_view,
     vecmem::data::vector_view<details::index_t> gf_view,
     vecmem::data::vector_view<details::index_t> f_backup_view,
diff --git a/device/common/include/traccc/clusterization/device/impl/ccl_kernel.ipp b/device/common/include/traccc/clusterization/device/impl/ccl_kernel.ipp
index 773261b344..b5bb18aadb 100644
--- a/device/common/include/traccc/clusterization/device/impl/ccl_kernel.ipp
+++ b/device/common/include/traccc/clusterization/device/impl/ccl_kernel.ipp
@@ -11,17 +11,19 @@
 
 #include "traccc/clusterization/clustering_config.hpp"
 #include "traccc/clusterization/device/aggregate_cluster.hpp"
+#include "traccc/clusterization/device/ccl_kernel.hpp"
 #include "traccc/clusterization/device/ccl_kernel_definitions.hpp"
 #include "traccc/clusterization/device/reduce_problem_cell.hpp"
 #include "traccc/device/concepts/barrier.hpp"
 #include "traccc/device/concepts/thread_id.hpp"
 #include "traccc/device/mutex.hpp"
+#include "traccc/device/sort.hpp"
 #include "traccc/device/unique_lock.hpp"
 #include "traccc/edm/cell.hpp"
+#include "traccc/edm/measurement.hpp"
 #include "vecmem/memory/device_atomic_ref.hpp"
 
 namespace traccc::device {
-
 /// Implementation of a FastSV algorithm with the following steps:
 ///   1) mix of stochastic and aggressive hooking
 ///   2) shortcutting
@@ -136,15 +138,15 @@ TRACCC_DEVICE void fast_sv_1(const thread_id_t& thread_id,
 template <device::concepts::barrier barrier_t,
           device::concepts::thread_id1 thread_id_t>
 TRACCC_DEVICE inline void ccl_core(
-    const thread_id_t& thread_id, std::size_t& partition_start,
-    std::size_t& partition_end, vecmem::device_vector<details::index_t> f,
+    const thread_id_t& thread_id, details::ccl_kernel_static_smem_parcel& smem,
+    vecmem::device_vector<details::index_t> f,
     vecmem::device_vector<details::index_t> gf,
     vecmem::data::vector_view<unsigned int> cell_links, details::index_t* adjv,
     unsigned char* adjc, const cell_collection_types::const_device cells_device,
     const cell_module_collection_types::const_device modules_device,
     measurement_collection_types::device measurements_device,
     barrier_t& barrier) {
-    const details::index_t size = partition_end - partition_start;
+    const details::index_t size = smem.partition_end - smem.partition_start;
 
     assert(size <= f.size());
     assert(size <= gf.size());
@@ -160,8 +162,8 @@ TRACCC_DEVICE inline void ccl_core(
         const details::index_t cid =
             tst * thread_id.getBlockDimX() + thread_id.getLocalThreadIdX();
         adjc[tst] = 0;
-        reduce_problem_cell(cells_device, cid, partition_start, partition_end,
-                            adjc[tst], &adjv[8 * tst]);
+        reduce_problem_cell(cells_device, cid, smem.partition_start,
+                            smem.partition_end, adjc[tst], &adjv[8 * tst]);
     }
 
     for (details::index_t tst = 0; tst < thread_cell_count; ++tst) {
@@ -189,20 +191,96 @@ TRACCC_DEVICE inline void ccl_core(
 
     barrier.blockBarrier();
 
+    /*
+     * We will now repurpose the `gf` shared vector to store the cells which
+     * are parents. For the time being, `outi` will be the size of `gf`.
+     */
+    if (thread_id.getLocalThreadIdX() == 0) {
+        smem.outi = 0;
+    }
+
+    barrier.blockBarrier();
+
+    /*
+     * We now collect the parents into the `gf` array.
+     */
     for (details::index_t tst = 0; tst < thread_cell_count; ++tst) {
         const details::index_t cid =
             tst * thread_id.getBlockDimX() + thread_id.getLocalThreadIdX();
+
         if (f.at(cid) == cid) {
-            // Add a new measurement to the output buffer. Remembering its
-            // position inside of the container.
-            const measurement_collection_types::device::size_type meas_pos =
-                measurements_device.push_back({});
-            // Set up the measurement under the appropriate index.
-            aggregate_cluster(
-                cells_device, modules_device, f, partition_start, partition_end,
-                cid, measurements_device.at(meas_pos), cell_links, meas_pos);
+            vecmem::device_atomic_ref<unsigned int,
+                                      vecmem::device_address_space::local>
+                atom(smem.outi);
+            gf.at(atom.fetch_add(1)) = cid;
+        }
+    }
+
+    barrier.blockBarrier();
+
+    uint32_t n_measurements = smem.outi;
+
+    barrier.blockBarrier();
+
+    /*
+     * The lead threat resizes the measurement vector to contain the total
+     * number of measurements. The `outi` variable is now repurposed to contain
+     * the start of this block's region in the output array.
+     */
+    if (thread_id.getLocalThreadIdX() == 0) {
+        smem.outi = measurements_device.bulk_append_implicit(n_measurements);
+    }
+
+    barrier.blockBarrier();
+
+    /*
+     * Sort the measurements so that the modules are ordered, which guarantees
+     * that they will be contiguous.
+     *
+     * TODO: Can this be done more cleverly? Perhaps while collecting the
+     * elements into `gf`.
+     */
+    blockOddEvenSort(
+        thread_id, barrier, gf.data(), n_measurements,
+        [&](const unsigned short lhs, const unsigned short rhs) {
+            return cells_device.at(smem.partition_start + f.at(lhs))
+                       .module_link <
+                   cells_device.at(smem.partition_start + f.at(rhs))
+                       .module_link;
+        });
+
+    barrier.blockBarrier();
+
+    /*
+     * Aggregate the clusters and write them to global memory.
+     */
+    for (uint32_t i = thread_id.getLocalThreadIdX(); i < n_measurements;
+         i += thread_id.getBlockDimX()) {
+        uint32_t meas_pos = smem.outi + i;
+        // Set up the measurement under the appropriate index.
+        aggregate_cluster(cells_device, modules_device, f, smem.partition_start,
+                          smem.partition_end, gf.at(i),
+                          measurements_device.at(meas_pos), cell_links,
+                          meas_pos);
+    }
+}
+
+TRACCC_DEVICE inline std::size_t ccl_partition_find_helper(
+    const cell_collection_types::const_device cells, std::size_t begin,
+    std::size_t target_end) {
+    bool found_good = false;
+    std::size_t last_good;
+
+    for (std::size_t i = begin; i < target_end || !found_good; ++i) {
+        if (i == cells.size() ||
+            cells.at(i - 1).module_link != cells.at(i).module_link) {
+            found_good = true;
+            last_good = i;
         }
     }
+
+    assert(found_good);
+    return last_good;
 }
 
 template <device::concepts::barrier barrier_t,
@@ -211,7 +289,7 @@ TRACCC_DEVICE inline void ccl_kernel(
     const clustering_config cfg, const thread_id_t& thread_id,
     const cell_collection_types::const_view cells_view,
     const cell_module_collection_types::const_view modules_view,
-    std::size_t& partition_start, std::size_t& partition_end, std::size_t& outi,
+    details::ccl_kernel_static_smem_parcel& smem,
     vecmem::data::vector_view<details::index_t> f_view,
     vecmem::data::vector_view<details::index_t> gf_view,
     vecmem::data::vector_view<details::index_t> f_backup_view,
@@ -236,8 +314,6 @@ TRACCC_DEVICE inline void ccl_kernel(
     mutex<uint32_t> mutex(backup_mutex);
     unique_lock lock(mutex, std::defer_lock);
 
-    const std::size_t num_cells = cells_device.size();
-
     /*
      * First, we determine the exact range of cells that is to be examined
      * by this block of threads. We start from an initial range determined
@@ -247,12 +323,19 @@ TRACCC_DEVICE inline void ccl_kernel(
      * amounts.
      */
     if (thread_id.getLocalThreadIdX() == 0) {
-        std::size_t start =
-            thread_id.getBlockIdX() * cfg.target_partition_size();
-        assert(start < num_cells);
-        std::size_t end =
-            std::min(num_cells, start + cfg.target_partition_size());
-        outi = 0;
+        const std::size_t num_cells = cells_device.size();
+
+        smem.outi = 0;
+
+        std::size_t boundary_beg =
+            std::max(1lu, std::min(num_cells, thread_id.getBlockIdX() *
+                                                  cfg.target_partition_size()));
+        std::size_t boundary_mid =
+            std::min(num_cells, (thread_id.getBlockIdX() + 1) *
+                                    cfg.target_partition_size());
+        std::size_t boundary_end =
+            std::min(num_cells, (thread_id.getBlockIdX() + 2) *
+                                    cfg.target_partition_size());
 
         /*
          * Next, shift the starting point to a position further in the
@@ -260,12 +343,11 @@ TRACCC_DEVICE inline void ccl_kernel(
          * on any cells that have been claimed by the previous block (if
          * any).
          */
-        while (start != 0 &&
-               cells_device[start - 1].module_link ==
-                   cells_device[start].module_link &&
-               cells_device[start].channel1 <=
-                   cells_device[start - 1].channel1 + 1) {
-            ++start;
+        if (thread_id.getBlockIdX() == 0) {
+            smem.partition_start = 0;
+        } else {
+            smem.partition_start = ccl_partition_find_helper(
+                cells_device, boundary_beg, boundary_mid);
         }
 
         /*
@@ -273,16 +355,10 @@ TRACCC_DEVICE inline void ccl_kernel(
          * current block to ensure that we do not end our partition on a
          * cell that is not a possible boundary!
          */
-        while (end < num_cells &&
-               cells_device[end - 1].module_link ==
-                   cells_device[end].module_link &&
-               cells_device[end].channel1 <=
-                   cells_device[end - 1].channel1 + 1) {
-            ++end;
-        }
-        partition_start = start;
-        partition_end = end;
-        assert(partition_start <= partition_end);
+        smem.partition_end =
+            ccl_partition_find_helper(cells_device, boundary_mid, boundary_end);
+
+        assert(smem.partition_start <= smem.partition_end);
     }
 
     barrier.blockBarrier();
@@ -303,7 +379,7 @@ TRACCC_DEVICE inline void ccl_kernel(
     // into a return. As such, we cannot use returns in this kernel.
 
     // Get partition for this thread group
-    const details::index_t size = partition_end - partition_start;
+    const details::index_t size = smem.partition_end - smem.partition_start;
 
     // If the size is zero, we can just retire the whole block.
     if (size == 0) {
@@ -342,8 +418,7 @@ TRACCC_DEVICE inline void ccl_kernel(
         use_scratch = false;
     }
 
-    ccl_core(thread_id, partition_start, partition_end,
-             use_scratch ? f_backup : f_primary,
+    ccl_core(thread_id, smem, use_scratch ? f_backup : f_primary,
              use_scratch ? gf_backup : gf_primary, cell_links, adjv, adjc,
              cells_device, modules_device, measurements_device, barrier);
 
diff --git a/device/common/include/traccc/device/sort.hpp b/device/common/include/traccc/device/sort.hpp
new file mode 100644
index 0000000000..ecfda6d112
--- /dev/null
+++ b/device/common/include/traccc/device/sort.hpp
@@ -0,0 +1,86 @@
+/**
+ * traccc library, part of the ACTS project (R&D line)
+ *
+ * (c) 2024 CERN for the benefit of the ACTS project
+ *
+ * Mozilla Public License Version 2.0
+ */
+
+#pragma once
+
+#include <cstdint>
+#include <vecmem/memory/device_atomic_ref.hpp>
+
+#include "traccc/definitions/qualifiers.hpp"
+#include "traccc/device/concepts/barrier.hpp"
+#include "traccc/device/concepts/thread_id.hpp"
+
+namespace traccc::device {
+/**
+ * @brief Swap two values of arbitrary type.
+ *
+ * @tparam T The type of values to swap.
+ *
+ * @param a The first object in the swap (will take the value of b).
+ * @param b The second object in the swap (will take the value of a).
+ */
+template <std::movable T>
+TRACCC_DEVICE void swap(T& a, T& b) {
+    T t = std::move(a);
+    a = std::move(b);
+    b = std::move(t);
+}
+
+/**
+ * @brief Perform a block-wide odd-even key sorting.
+ *
+ * This function performs a sorting operation across the entire block, assuming
+ * that all the threads in the block are currently active.
+ *
+ * @warning The behaviour of this function is ill-defined if any of the threads
+ * in the block have exited.
+ *
+ * @warning This method is efficient for sorting small arrays, preferably in
+ * shared memory, but given the O(n^2) worst-case performance this should not
+ * be used on larger arrays.
+ *
+ * @tparam T The thread identifier type.
+ * @tparam B The barrier type
+ * @tparam K The type of keys to sort.
+ * @tparam C The type of the comparison function.
+ *
+ * @param thread_id The thread identifier object.
+ * @param barrier The barrier to use for block synchronization.
+ * @param keys An array of keys to sort.
+ * @param num_keys The number of keys in the array to sort.
+ * @param comparison A comparison function.
+ */
+template <concepts::thread_id1 T, concepts::barrier B, std::movable K,
+          std::strict_weak_order<K, K> C>
+TRACCC_DEVICE void blockOddEvenSort(T& thread_id, B& barrier, K* keys,
+                                    uint32_t num_keys, C&& comparison) {
+    bool sorted;
+
+    do {
+        sorted = true;
+
+        for (uint32_t j = 2 * thread_id.getLocalThreadIdX() + 1;
+             j < num_keys - 1; j += 2 * thread_id.getBlockDimX()) {
+            if (comparison(keys[j + 1], keys[j])) {
+                swap(keys[j + 1], keys[j]);
+                sorted = false;
+            }
+        }
+
+        barrier.blockBarrier();
+
+        for (uint32_t j = 2 * thread_id.getLocalThreadIdX(); j < num_keys - 1;
+             j += 2 * thread_id.getBlockDimX()) {
+            if (comparison(keys[j + 1], keys[j])) {
+                swap(keys[j + 1], keys[j]);
+                sorted = false;
+            }
+        }
+    } while (barrier.blockOr(!sorted));
+}
+}  // namespace traccc::device
diff --git a/device/cuda/src/clusterization/clusterization_algorithm.cu b/device/cuda/src/clusterization/clusterization_algorithm.cu
index 75304bd0a3..57479aa054 100644
--- a/device/cuda/src/clusterization/clusterization_algorithm.cu
+++ b/device/cuda/src/clusterization/clusterization_algorithm.cu
@@ -42,8 +42,7 @@ __global__ void ccl_kernel(
     vecmem::data::vector_view<device::details::index_t> adjv_backup_view,
     unsigned int* backup_mutex_ptr) {
 
-    __shared__ std::size_t partition_start, partition_end;
-    __shared__ std::size_t outi;
+    __shared__ device::details::ccl_kernel_static_smem_parcel shared;
     extern __shared__ device::details::index_t shared_v[];
     vecmem::device_atomic_ref<unsigned int> backup_mutex(*backup_mutex_ptr);
 
@@ -58,9 +57,8 @@ __global__ void ccl_kernel(
     traccc::cuda::barrier barry_r;
     const cuda::thread_id1 thread_id;
 
-    device::ccl_kernel(cfg, thread_id, cells_view, modules_view,
-                       partition_start, partition_end, outi, f_view, gf_view,
-                       f_backup_view, gf_backup_view, adjc_backup_view,
+    device::ccl_kernel(cfg, thread_id, cells_view, modules_view, shared, f_view,
+                       gf_view, f_backup_view, gf_backup_view, adjc_backup_view,
                        adjv_backup_view, backup_mutex, barry_r,
                        measurements_view, cell_links);
 }
@@ -140,6 +138,12 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()(
         m_gf_backup, m_adjc_backup, m_adjv_backup, m_backup_mutex.get());
     TRACCC_CUDA_ERROR_CHECK(cudaGetLastError());
 
+#ifndef NDEBUG
+    TRACCC_CUDA_ERROR_CHECK(cudaStreamSynchronize(stream));
+    assert(is_contiguous_on(measurement_module_projection(), m_mr.main, m_copy,
+                            m_stream, measurements));
+#endif
+
     // Return the reconstructed measurements.
     return measurements;
 }
diff --git a/device/sycl/src/clusterization/clusterization_algorithm.sycl b/device/sycl/src/clusterization/clusterization_algorithm.sycl
index b1ef6f01d8..96d7e63feb 100644
--- a/device/sycl/src/clusterization/clusterization_algorithm.sycl
+++ b/device/sycl/src/clusterization/clusterization_algorithm.sycl
@@ -117,14 +117,16 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()(
     details::get_queue(m_queue)
         .submit([&](::sycl::handler& h) {
             // Allocate shared memory for the kernel.
-            vecmem::sycl::local_accessor<std::size_t> shared_uint(3, h);
+            vecmem::sycl::local_accessor<
+                device::details::ccl_kernel_static_smem_parcel>
+                shared(1, h);
             vecmem::sycl::local_accessor<device::details::index_t> shared_idx(
                 2 * m_config.max_partition_size(), h);
 
             // Launch the kernel.
             h.parallel_for<kernels::ccl_kernel>(
                 cclKernelRange,
-                [shared_uint, shared_idx, cells_view, modules_view,
+                [shared, shared_idx, cells_view, modules_view,
                  measurements_view, cell_links_view,
                  f_backup_view = vecmem::get_data(m_f_backup),
                  gf_backup_view = vecmem::get_data(m_gf_backup),
@@ -139,9 +141,6 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()(
                     vecmem::data::vector_view<device::details::index_t> gf_view{
                         static_cast<vector_size_t>(cfg.max_partition_size()),
                         &shared_idx[cfg.max_partition_size()]};
-                    std::size_t& partition_start = shared_uint[0];
-                    std::size_t& partition_end = shared_uint[1];
-                    std::size_t& outi = shared_uint[2];
 
                     // Mutex for scratch space
                     vecmem::device_atomic_ref<unsigned int> backup_mutex(
@@ -152,16 +151,18 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()(
                     const sycl::thread_id1 thread_id(item);
 
                     // Run the algorithm for this thread.
-                    device::ccl_kernel(cfg, thread_id, cells_view, modules_view,
-                                       partition_start, partition_end, outi,
-                                       f_view, gf_view, f_backup_view,
-                                       gf_backup_view, adjc_backup_view,
-                                       adjv_backup_view, backup_mutex, barry_r,
-                                       measurements_view, cell_links_view);
+                    device::ccl_kernel(
+                        cfg, thread_id, cells_view, modules_view, shared[0],
+                        f_view, gf_view, f_backup_view, gf_backup_view,
+                        adjc_backup_view, adjv_backup_view, backup_mutex,
+                        barry_r, measurements_view, cell_links_view);
                 });
         })
         .wait_and_throw();
 
+    assert(is_contiguous_on(measurement_module_projection(), m_mr.main, m_copy,
+                            m_queue, measurements));
+
     // Return the reconstructed measurements.
     return measurements;
 }
diff --git a/tests/cuda/CMakeLists.txt b/tests/cuda/CMakeLists.txt
index 6b53c7bfa4..458592d5a5 100644
--- a/tests/cuda/CMakeLists.txt
+++ b/tests/cuda/CMakeLists.txt
@@ -44,6 +44,7 @@ traccc_add_test(
     test_unique_lock.cu
     test_sanity_contiguous_on.cu
     test_sanity_ordered_on.cu
+    test_sort.cu
 
     LINK_LIBRARIES
     CUDA::cudart
diff --git a/tests/cuda/test_sort.cu b/tests/cuda/test_sort.cu
new file mode 100644
index 0000000000..dc657070a8
--- /dev/null
+++ b/tests/cuda/test_sort.cu
@@ -0,0 +1,46 @@
+/**
+ * traccc library, part of the ACTS project (R&D line)
+ *
+ * (c) 2024 CERN for the benefit of the ACTS project
+ *
+ * Mozilla Public License Version 2.0
+ */
+
+#include <gtest/gtest.h>
+
+#include <vecmem/memory/cuda/managed_memory_resource.hpp>
+#include <vecmem/memory/unique_ptr.hpp>
+
+#include "../../cuda/src/utils/barrier.hpp"
+#include "traccc/cuda/utils/thread_id.hpp"
+#include "traccc/device/sort.hpp"
+
+__global__ void testBlockSortKernel(uint32_t *keys, uint32_t n_keys) {
+    traccc::cuda::thread_id1 thread_id;
+    traccc::cuda::barrier barrier;
+    traccc::device::blockOddEvenSort(thread_id, barrier, keys, n_keys,
+                                     std::less<uint32_t>());
+}
+
+TEST(CUDASort, BlockOddEvenSort) {
+    vecmem::cuda::managed_memory_resource mr;
+
+    uint32_t n = 2803;
+    vecmem::unique_alloc_ptr<uint32_t[]> arr =
+        vecmem::make_unique_alloc<uint32_t[]>(mr, n);
+
+    // As long as 13 and n_keys are coprime, this will generate a big,
+    // non-sorted array containing every element.
+    for (uint32_t i = 0; i < n; i++) {
+        arr[i] = (13 * 500 * i) % n;
+    }
+
+    testBlockSortKernel<<<1, 1024u>>>(arr.get(), n);
+
+    ASSERT_EQ(cudaPeekAtLastError(), cudaSuccess);
+    ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess);
+
+    for (uint32_t i = 0; i < n; ++i) {
+        ASSERT_EQ(arr[i], i);
+    }
+}
diff --git a/tests/sycl/CMakeLists.txt b/tests/sycl/CMakeLists.txt
index 931310d20c..63dbf9ff43 100644
--- a/tests/sycl/CMakeLists.txt
+++ b/tests/sycl/CMakeLists.txt
@@ -22,6 +22,7 @@ traccc_add_test(
     test_cca.sycl
     test_sanity_contiguous_on.sycl
     test_sanity_ordered_on.sycl
+    test_sort.sycl
 
     LINK_LIBRARIES
     GTest::gtest_main
diff --git a/tests/sycl/test_sort.sycl b/tests/sycl/test_sort.sycl
new file mode 100644
index 0000000000..eb4a4ef724
--- /dev/null
+++ b/tests/sycl/test_sort.sycl
@@ -0,0 +1,51 @@
+/**
+ * traccc library, part of the ACTS project (R&D line)
+ *
+ * (c) 2024 CERN for the benefit of the ACTS project
+ *
+ * Mozilla Public License Version 2.0
+ */
+
+#include <gtest/gtest.h>
+
+#include <CL/sycl.hpp>
+#include <vecmem/memory/sycl/shared_memory_resource.hpp>
+#include <vecmem/memory/unique_ptr.hpp>
+
+#include "../../sycl/src/utils/barrier.hpp"
+#include "traccc/device/sort.hpp"
+#include "traccc/sycl/utils/thread_id.hpp"
+
+TEST(SYCLSort, BlockOddEvenSort) {
+    vecmem::sycl::shared_memory_resource mr;
+    cl::sycl::queue queue;
+
+    uint32_t n = 2803;
+    vecmem::unique_alloc_ptr<uint32_t[]> arr =
+        vecmem::make_unique_alloc<uint32_t[]>(mr, n);
+
+    // As long as 13 and n_keys are coprime, this will generate a big,
+    // non-sorted array containing every element.
+    for (uint32_t i = 0; i < n; i++) {
+        arr[i] = (13 * 500 * i) % n;
+    }
+
+    cl::sycl::nd_range test_range(cl::sycl::range<1>(128),
+                                  cl::sycl::range<1>(128));
+
+    queue
+        .submit([&, keys = arr.get()](cl::sycl::handler &h) {
+            h.parallel_for<class BlockOddEvenSortKernel>(
+                test_range, [=](cl::sycl::nd_item<1> item) {
+                    traccc::sycl::thread_id1 thread_id(item);
+                    traccc::sycl::barrier barrier(item);
+                    traccc::device::blockOddEvenSort(thread_id, barrier, keys,
+                                                     n, std::less<uint32_t>());
+                });
+        })
+        .wait_and_throw();
+
+    for (uint32_t i = 0; i < n; ++i) {
+        ASSERT_EQ(arr[i], i);
+    }
+}