acts-project
diff --git a/‎core/include/traccc/clusterization/clustering_config.hpp
+24 b/‎core/include/traccc/clusterization/clustering_config.hpp
+24
diff --git a/‎core/include/traccc/clusterization/clusterization_algorithm.hpp
+3 b/‎core/include/traccc/clusterization/clusterization_algorithm.hpp
+3
diff --git a/‎device/common/include/traccc/clusterization/device/ccl_kernel.hpp
+29-17 b/‎device/common/include/traccc/clusterization/device/ccl_kernel.hpp
+29-17
diff --git a/‎device/common/include/traccc/clusterization/device/ccl_kernel_definitions.hpp
+15 b/‎device/common/include/traccc/clusterization/device/ccl_kernel_definitions.hpp
+15
diff --git a/‎device/common/include/traccc/clusterization/device/impl/ccl_kernel.ipp
+73-32 b/‎device/common/include/traccc/clusterization/device/impl/ccl_kernel.ipp
+73-32
diff --git a/‎device/common/include/traccc/clusterization/device/impl/reduce_problem_cell.ipp
+1-1 b/‎device/common/include/traccc/clusterization/device/impl/reduce_problem_cell.ipp
+1-1
diff --git a/‎device/common/include/traccc/clusterization/device/reduce_problem_cell.hpp
+1-1 b/‎device/common/include/traccc/clusterization/device/reduce_problem_cell.hpp
+1-1
@@ -0,0 +1,24 @@
+/**
+ * traccc library, part of the ACTS project (R&D line)
+ *
+ * (c) 2024 CERN for the benefit of the ACTS project
+ *
+ * Mozilla Public License Version 2.0
+ */
+
+#pragma once
+
+namespace traccc {
+struct clustering_config {  // Sizing parameters for clusterization; every value has a default.
+    clustering_config(unsigned int _target_cells_per_partition = 2048,
+                      unsigned int _target_cells_per_thread = 8,
+                      unsigned int _backup_partition_size = 256 * 4096)
+        : target_cells_per_partition(_target_cells_per_partition),
+          target_cells_per_thread(_target_cells_per_thread),
+          backup_partition_size(_backup_partition_size) {}
+
+    unsigned int target_cells_per_partition;  ///< Target number of cells per partition (default 2048).
+    unsigned int target_cells_per_thread;  ///< Target number of cells per thread (default 8).
+    unsigned int backup_partition_size;  ///< Default 256 * 4096. NOTE(review): presumably the capacity of the backup buffers - confirm.
+};
+}  // namespace traccc
@@ -8,6 +8,7 @@
 #pragma once
 
 // Library include(s).
+#include "traccc/clusterization/clustering_config.hpp"
 #include "traccc/clusterization/measurement_creation_algorithm.hpp"
 #include "traccc/clusterization/sparse_ccl_algorithm.hpp"
 #include "traccc/edm/cell.hpp"
@@ -33,6 +34,8 @@ class clusterization_algorithm
           const cell_module_collection_types::const_view&)> {
 
     public:
+    using config_type = clustering_config;
+
     /// Clusterization algorithm constructor
     ///
     /// @param mr The memory resource to use for the result objects
 
@@ -8,6 +8,7 @@
 #pragma once
 
 // Project include(s).
+#include "traccc/clusterization/device/ccl_kernel_definitions.hpp"
 #include "traccc/definitions/qualifiers.hpp"
 #include "traccc/edm/cell.hpp"
 #include "traccc/edm/measurement.hpp"
@@ -23,33 +24,29 @@
 namespace traccc::device {
 
 namespace details {
-
-/// These indices in clusterization will only range from 0 to
-/// max_cells_per_partition, so we only need a short
-using index_t = unsigned short;
-
-static constexpr int TARGET_CELLS_PER_THREAD = 8;
-static constexpr int MAX_CELLS_PER_THREAD = 32;
+static constexpr int MAX_CELLS_PER_THREAD = 16;
 
 /// Helper struct for calculating some of the input parameters of @c ccl_kernel
 struct ccl_kernel_helper {
-
     /// Constructor setting the helper parameters
     ///
     /// @param[in] target_cells_per_partition Target average number of cells per
-    ///                                       thread block
+    ///     thread block
+    /// @param[in] target_cells_per_thread Target average number of cells per
+    ///     thread
     /// @param[in] n_cells Total number of cells
-    ///
     ccl_kernel_helper(index_t target_cells_per_partition,
-                      unsigned int n_cells) {
-
+                      index_t target_cells_per_thread, unsigned int n_cells) {
+        // Shared memory size: roughly MAX_CELLS_PER_THREAD cells per thread
         max_cells_per_partition =
             (target_cells_per_partition * MAX_CELLS_PER_THREAD +
-             TARGET_CELLS_PER_THREAD - 1) /
-            TARGET_CELLS_PER_THREAD;
+             target_cells_per_thread - 1) /
+            target_cells_per_thread;
+        // Block size: target cells per partition over per thread, rounded up
         threads_per_partition =
-            (target_cells_per_partition + TARGET_CELLS_PER_THREAD - 1) /
-            TARGET_CELLS_PER_THREAD;
+            (target_cells_per_partition + target_cells_per_thread - 1) /
+            target_cells_per_thread;
+        // Grid size: partitions needed to cover all n_cells, rounded up
         num_partitions = (n_cells + target_cells_per_partition - 1) /
                          target_cells_per_partition;
     }
@@ -81,6 +78,16 @@ struct ccl_kernel_helper {
 /// @param f_view  array of "parent" indices for all cells in this partition
 /// @param gf_view array of "grandparent" indices for all cells in this
 ///                partition
+/// @param f_backup_view global memory alternative to `f_view` for cases in
+///     which that array is not large enough
+/// @param gf_backup_view global memory alternative to `gf_view` for cases in
+///     which that array is not large enough
+/// @param adjc_backup_view global memory alternative to the adjacent cell
+///     count vector
+/// @param adjv_backup_view global memory alternative to the cell adjacency
+///     matrix fragment storage
+/// @param backup_mutex mutex lock to mediate control over the backup global
+///     memory data structures.
 /// @param barrier  A generic object for block-wide synchronisation
 /// @param[out] measurements_view collection of measurements
 /// @param[out] cell_links    collection of links to measurements each cell is
@@ -94,7 +101,12 @@ TRACCC_DEVICE inline void ccl_kernel(
     const details::index_t target_cells_per_partition,
     unsigned int& partition_start, unsigned int& partition_end,
     unsigned int& outi, vecmem::data::vector_view<details::index_t> f_view,
-    vecmem::data::vector_view<details::index_t> gf_view, barrier_t& barrier,
+    vecmem::data::vector_view<details::index_t> gf_view,
+    vecmem::data::vector_view<details::index_t> f_backup_view,
+    vecmem::data::vector_view<details::index_t> gf_backup_view,
+    vecmem::data::vector_view<unsigned char> adjc_backup_view,
+    vecmem::data::vector_view<details::index_t> adjv_backup_view,
+    vecmem::device_atomic_ref<uint32_t> backup_mutex, barrier_t& barrier,
     measurement_collection_types::view measurements_view,
     vecmem::data::vector_view<unsigned int> cell_links);
 
 
@@ -0,0 +1,15 @@
+/**
+ * traccc library, part of the ACTS project (R&D line)
+ *
+ * (c) 2024 CERN for the benefit of the ACTS project
+ *
+ * Mozilla Public License Version 2.0
+ */
+
+#pragma once
+
+namespace traccc::device::details {
+/// Index type for cells within a partition. These indices only range from
+/// 0 to max_cells_per_partition, so an unsigned short is sufficient.
+using index_t = unsigned short;
+}  // namespace traccc::device::details
@@ -9,6 +9,7 @@
 
 #include "traccc/clusterization/device/aggregate_cluster.hpp"
 #include "traccc/clusterization/device/reduce_problem_cell.hpp"
+#include "vecmem/memory/device_atomic_ref.hpp"
 
 namespace traccc::device {
 
@@ -33,13 +34,13 @@ namespace traccc::device {
 /// @param[in] barrier  A generic object for block-wide synchronisation
 ///
 template <typename barrier_t>
-TRACCC_DEVICE void fast_sv_1(
-    vecmem::device_vector<details::index_t>& f,
-    vecmem::device_vector<details::index_t>& gf,
-    unsigned char adjc[details::MAX_CELLS_PER_THREAD],
-    details::index_t adjv[details::MAX_CELLS_PER_THREAD][8],
-    const details::index_t tid, const details::index_t blckDim,
-    barrier_t& barrier) {
+TRACCC_DEVICE void fast_sv_1(vecmem::device_vector<details::index_t>& f,
+                             vecmem::device_vector<details::index_t>& gf,
+                             unsigned char* adjc, details::index_t* adjv,
+                             details::index_t thread_cell_count,
+                             const details::index_t tid,
+                             const details::index_t blckDim,
+                             barrier_t& barrier) {
     /*
      * The algorithm finishes if an iteration leaves the arrays unchanged.
      * This varible will be set if a change is made, and dictates if another
@@ -61,13 +62,12 @@ TRACCC_DEVICE void fast_sv_1(
          * cluster ID if it is lower than ours, essentially merging the two
          * together.
          */
-        for (details::index_t tst = 0; tst < details::MAX_CELLS_PER_THREAD;
-             ++tst) {
+        for (details::index_t tst = 0; tst < thread_cell_count; ++tst) {
             const details::index_t cid = tst * blckDim + tid;
 
             __builtin_assume(adjc[tst] <= 8);
             for (unsigned char k = 0; k < adjc[tst]; ++k) {
-                details::index_t q = gf.at(adjv[tst][k]);
+                details::index_t q = gf.at(adjv[8 * tst + k]);
 
                 if (gf.at(cid) > q) {
                     f.at(f.at(cid)) = q;
@@ -82,9 +82,7 @@ TRACCC_DEVICE void fast_sv_1(
          */
         barrier.blockBarrier();
 
-#pragma unroll
-        for (details::index_t tst = 0; tst < details::MAX_CELLS_PER_THREAD;
-             ++tst) {
+        for (details::index_t tst = 0; tst < thread_cell_count; ++tst) {
             const details::index_t cid = tst * blckDim + tid;
             /*
              * The second stage is shortcutting, which is an optimisation that
@@ -101,9 +99,7 @@ TRACCC_DEVICE void fast_sv_1(
          */
         barrier.blockBarrier();
 
-#pragma unroll
-        for (details::index_t tst = 0; tst < details::MAX_CELLS_PER_THREAD;
-             ++tst) {
+        for (details::index_t tst = 0; tst < thread_cell_count; ++tst) {
             const details::index_t cid = tst * blckDim + tid;
             /*
              * Update the array for the next generation, keeping track of any
@@ -135,17 +131,24 @@ TRACCC_DEVICE inline void ccl_kernel(
     const details::index_t target_cells_per_partition,
     unsigned int& partition_start, unsigned int& partition_end,
     unsigned int& outi, vecmem::data::vector_view<details::index_t> f_view,
-    vecmem::data::vector_view<details::index_t> gf_view, barrier_t& barrier,
+    vecmem::data::vector_view<details::index_t> gf_view,
+    vecmem::data::vector_view<details::index_t> f_backup_view,
+    vecmem::data::vector_view<details::index_t> gf_backup_view,
+    vecmem::data::vector_view<unsigned char> adjc_backup_view,
+    vecmem::data::vector_view<details::index_t> adjv_backup_view,
+    vecmem::device_atomic_ref<uint32_t> backup_mutex, barrier_t& barrier,
     measurement_collection_types::view measurements_view,
     vecmem::data::vector_view<unsigned int> cell_links) {
-
     // Construct device containers around the views.
     const cell_collection_types::const_device cells_device(cells_view);
     const cell_module_collection_types::const_device modules_device(
         modules_view);
     measurement_collection_types::device measurements_device(measurements_view);
     vecmem::device_vector<details::index_t> f(f_view);
     vecmem::device_vector<details::index_t> gf(gf_view);
+    vecmem::device_vector<unsigned char> adjc_backup(adjc_backup_view);
+    vecmem::device_vector<details::index_t> adjv_backup(adjv_backup_view);
+    bool using_backup_memory = false;
 
     const cell_collection_types::const_device::size_type num_cells =
         cells_device.size();
@@ -199,41 +202,71 @@ TRACCC_DEVICE inline void ccl_kernel(
     barrier.blockBarrier();
 
     // Vector of indices of the adjacent cells
-    details::index_t adjv[details::MAX_CELLS_PER_THREAD][8];
+    details::index_t _adjv[details::MAX_CELLS_PER_THREAD * 8];
+    details::index_t* adjv = _adjv;
+
     /*
      * The number of adjacent cells for each cell must start at zero, to
      * avoid uninitialized memory. adjv does not need to be zeroed, as
      * we will only access those values if adjc indicates that the value
      * is set.
      */
     // Number of adjacent cells
-    unsigned char adjc[details::MAX_CELLS_PER_THREAD];
+    unsigned char _adjc[details::MAX_CELLS_PER_THREAD];
+    unsigned char* adjc = _adjc;
 
     // It seems that sycl runs into undefined behaviour when calling
     // group synchronisation functions when some threads have already run
     // into a return. As such, we cannot use returns in this kernel.
 
     // Get partition for this thread group
     const details::index_t size = partition_end - partition_start;
-    assert(size <= max_cells_per_partition);
 
-#pragma unroll
-    for (details::index_t tst = 0; tst < details::MAX_CELLS_PER_THREAD; ++tst) {
+    // If our partition is too large for the regular f/gf arrays, we need to
+    // handle this specific edge case by switching to the backup arrays in
+    // global memory. Access to those arrays is mediated by `backup_mutex`:
+    // the first thread of the block spins until it atomically swaps the
+    // mutex value from 0 (free) to 1 (held). This can be done more
+    // efficiently, but this should be a very rare edge case.
+    if (size > max_cells_per_partition) {
+        if (threadId == 0) {
+            uint32_t expected = 0;
+            // compare_exchange_strong reports success by returning true, so
+            // keep spinning while it fails. A failed attempt overwrites
+            // `expected` with the value it observed; reset it so that the
+            // next attempt again only succeeds on a free mutex.
+            while (!backup_mutex.compare_exchange_strong(expected, 1u)) {
+                expected = 0;
+            }
+        }
+
+        // Hold the rest of the block back until thread 0 owns the mutex.
+        barrier.blockBarrier();
+
+        // NOTE(review): every thread receives the same adjc/adjv base
+        // pointer below but indexes it with its own per-thread counter;
+        // confirm that the backup buffers give threads disjoint slots.
+        f = f_backup_view;
+        gf = gf_backup_view;
+        adjc = adjc_backup.data();
+        adjv = adjv_backup.data();
+        using_backup_memory = true;
+    }
+
+    assert(size <= f.size());
+    assert(size <= gf.size());
+
+    details::index_t thread_cell_count = 0;
+    for (details::index_t cid;
+         (cid = thread_cell_count * blckDim + threadId) < size;
+         ++thread_cell_count) {
+    }
+
+    for (details::index_t tst = 0; tst < thread_cell_count; ++tst) {
         adjc[tst] = 0;
     }
 
-    for (details::index_t tst = 0, cid; (cid = tst * blckDim + threadId) < size;
-         ++tst) {
+    for (details::index_t tst = 0; tst < thread_cell_count; ++tst) {
         /*
          * Look for adjacent cells to the current one.
          */
-        assert(tst < details::MAX_CELLS_PER_THREAD);
+        const details::index_t cid = tst * blckDim + threadId;
         reduce_problem_cell(cells_device, cid, partition_start, partition_end,
-                            adjc[tst], adjv[tst]);
+                            adjc[tst], &adjv[8 * tst]);
     }
 
-#pragma unroll
-    for (details::index_t tst = 0; tst < details::MAX_CELLS_PER_THREAD; ++tst) {
+    for (details::index_t tst = 0; tst < thread_cell_count; ++tst) {
         const details::index_t cid = tst * blckDim + threadId;
         /*
          * At the start, the values of f and gf should be equal to the
@@ -253,12 +286,13 @@ TRACCC_DEVICE inline void ccl_kernel(
      * Run FastSV algorithm, which will update the father index to that of
      * the cell belonging to the same cluster with the lowest index.
      */
-    fast_sv_1(f, gf, adjc, adjv, threadId, blckDim, barrier);
+    fast_sv_1(f, gf, &adjc[0], &adjv[0], thread_cell_count, threadId, blckDim,
+              barrier);
 
     barrier.blockBarrier();
 
-    for (details::index_t tst = 0, cid; (cid = tst * blckDim + threadId) < size;
-         ++tst) {
+    for (details::index_t tst = 0; tst < thread_cell_count; ++tst) {
+        const details::index_t cid = tst * blckDim + threadId;
         if (f.at(cid) == cid) {
             // Add a new measurement to the output buffer. Remembering its
             // position inside of the container.
@@ -271,6 +305,13 @@ TRACCC_DEVICE inline void ccl_kernel(
                               meas_pos);
         }
     }
+
+    // Recall that we might be holding a mutex on some global memory. If we
+    // are, make sure to release it here so that any future kernels trying to
+    // use that memory don't get stuck in a loop forever. The mutex is held
+    // on behalf of the whole block, so first wait until every thread has
+    // finished with the backup arrays; otherwise thread 0 could hand them
+    // to another block while its neighbours are still reading them. The
+    // acquisition path already takes a block barrier under this same
+    // condition, so it is expected to be uniform across the block.
+    if (using_backup_memory) {
+        barrier.blockBarrier();
+
+        if (threadId == 0) {
+            backup_mutex.store(0);
+        }
+    }
 }
 
 }  // namespace traccc::device
@@ -19,7 +19,7 @@ TRACCC_HOST_DEVICE
 inline void reduce_problem_cell(
     const cell_collection_types::const_device& cells, const unsigned short cid,
     const unsigned int start, const unsigned int end, unsigned char& adjc,
-    unsigned short adjv[8]) {
+    unsigned short* adjv) {
 
     // Some sanity check(s).
     assert(start <= end);
 
@@ -28,7 +28,7 @@ TRACCC_HOST_DEVICE
 inline void reduce_problem_cell(
     const cell_collection_types::const_device& cells, unsigned short cid,
     unsigned int start, unsigned int end, unsigned char& adjc,
-    unsigned short adjv[8]);
+    unsigned short* adjv);
 
 }  // namespace traccc::device