acts-project · stephenswat · Jul 31, 2024 · May 27, 2024
diff --git a/core/include/traccc/clusterization/clustering_config.hpp b/core/include/traccc/clusterization/clustering_config.hpp
@@ -0,0 +1,78 @@
+/**
+ * traccc library, part of the ACTS project (R&D line)
+ *
+ * (c) 2024 CERN for the benefit of the ACTS project
+ *
+ * Mozilla Public License Version 2.0
+ */
+
+#pragma once
+
+#include <cstdint>
+
+#include "traccc/definitions/qualifiers.hpp"
+
+namespace traccc {
+/**
+ * @brief Configuration type for massively parallel clustering algorithms.
+ */
+struct clustering_config {
+    /**
+     * @brief The desired number of threads per partition.
+     *
+     * This directly correlates to the block size on most algorithms, so don't
+     * set this too low (which will reduce occupancy due to available thread
+     * slots) or too high (which may not be supported on a device).
+     */
+    unsigned int threads_per_partition;
+
+    /**
+     * @brief The maximum number of cells per thread.
+     *
+     * This sets the maximum thread coarsening factor for the CCA algorithm.
+     * Raising it increases shared memory usage and may decrease occupancy; if
+     * too low, scratch space must be used, which may slow the algorithm down.
+     */
+    unsigned int max_cells_per_thread;
+
+    /**
+     * @brief The desired number of cells per thread.
+     *
+     * This sets the desired thread coarsening factor for the CCA algorithm.
+     * Decreasing this may decrease occupancy. Increasing this increases the
+     * probability that scratch space will need to be used.
+     */
+    unsigned int target_cells_per_thread;
+
+    /**
+     * @brief The upscaling factor for the scratch space.
+     *
+     * The scratch space is sized to this many times the maximum partition
+     * size, i.e. `threads_per_partition` times `max_cells_per_thread` cells.
+     */
+    unsigned int backup_size_multiplier;
+
+    /**
+     * @brief The maximum number of cells per partition.
+     */
+    TRACCC_HOST_DEVICE constexpr std::size_t max_partition_size() const {
+        // Widen first; the product could otherwise wrap in `unsigned int`.
+        return std::size_t{threads_per_partition} * max_cells_per_thread;
+    }
+
+    /**
+     * @brief The target number of cells per partition.
+     */
+    TRACCC_HOST_DEVICE constexpr std::size_t target_partition_size() const {
+        // Widen first; the product could otherwise wrap in `unsigned int`.
+        return std::size_t{threads_per_partition} * target_cells_per_thread;
+    }
+
+    /**
+     * @brief The total size of the scratch space, in number of cells.
+     */
+    TRACCC_HOST_DEVICE constexpr std::size_t backup_size() const {
+        return max_partition_size() * backup_size_multiplier;
+    }
+};
+}  // namespace traccc
diff --git a/core/include/traccc/clusterization/clusterization_algorithm.hpp b/core/include/traccc/clusterization/clusterization_algorithm.hpp
@@ -19,6 +19,7 @@
 
 // System include(s).
 #include <functional>
+#include <variant>
 
 namespace traccc::host {
 
@@ -33,6 +34,8 @@ class clusterization_algorithm
           const cell_module_collection_types::const_view&)> {
 
     public:
+    using config_type = std::monostate;
+
     /// Clusterization algorithm constructor
     ///
     /// @param mr The memory resource to use for the result objects

diff --git a/device/alpaka/CMakeLists.txt b/device/alpaka/CMakeLists.txt
@@ -8,8 +8,8 @@
 include( traccc-alpaka-functions )
 include( traccc-compiler-options-cpp )
 
-set(PUBLIC_LIBRARIES traccc::core detray::core detray::utils vecmem::core covfie::core)
-set(PRIVATE_LIBRARIES traccc::device_common alpaka::alpaka traccc::Thrust)
+set(PUBLIC_LIBRARIES traccc::core traccc::device_common detray::core detray::utils vecmem::core covfie::core)
+set(PRIVATE_LIBRARIES alpaka::alpaka traccc::Thrust)
 
 if(alpaka_ACC_GPU_CUDA_ENABLE)
   enable_language(CUDA)

diff --git a/device/alpaka/include/traccc/alpaka/clusterization/clusterization_algorithm.hpp b/device/alpaka/include/traccc/alpaka/clusterization/clusterization_algorithm.hpp
@@ -8,6 +8,8 @@
 #pragma once
 
 // Project include(s).
+#include "traccc/clusterization/clustering_config.hpp"
+#include "traccc/clusterization/device/ccl_kernel_definitions.hpp"
 #include "traccc/edm/cell.hpp"
 #include "traccc/edm/measurement.hpp"
 #include "traccc/edm/spacepoint.hpp"
@@ -17,6 +19,9 @@
 // VecMem include(s).
 #include <vecmem/utils/copy.hpp>
 
+// System include(s).
+#include <mutex>
+
 namespace traccc::alpaka {
 
 /// Algorithm performing hit clusterization
@@ -33,6 +38,9 @@ class clusterization_algorithm
           const cell_module_collection_types::const_view&)> {
 
     public:
+    /// Configuration type
+    using config_type = clustering_config;
+
     /// Constructor for clusterization algorithm
     ///
     /// @param mr The memory resource(s) to use in the algorithm
@@ -42,8 +50,7 @@ class clusterization_algorithm
     /// partition
     ///
     clusterization_algorithm(const traccc::memory_resource& mr,
-                             vecmem::copy& copy,
-                             const unsigned short target_cells_per_partition);
+                             vecmem::copy& copy, const config_type& config);
 
     /// Callable operator for clusterization algorithm
     ///
@@ -57,12 +64,18 @@ class clusterization_algorithm
 
     private:
     /// The average number of cells in each partition
-    using config_type = unsigned short;
-    config_type m_target_cells_per_partition;
+    config_type m_config;
     /// The memory resource(s) to use
     traccc::memory_resource m_mr;
     /// The copy object to use
     std::reference_wrapper<vecmem::copy> m_copy;
+    /// Memory reserved for edge cases
+    vecmem::data::vector_buffer<device::details::index_t> m_f_backup,
+        m_gf_backup;
+    vecmem::data::vector_buffer<unsigned char> m_adjc_backup;
+    vecmem::data::vector_buffer<device::details::index_t> m_adjv_backup;
+    vecmem::unique_alloc_ptr<unsigned int> m_backup_mutex;
+    mutable std::once_flag m_setup_once;
 };
 
 }  // namespace traccc::alpaka
diff --git a/device/alpaka/src/clusterization/clusterization_algorithm.cpp b/device/alpaka/src/clusterization/clusterization_algorithm.cpp
@@ -12,20 +12,26 @@
 #include "../utils/utils.hpp"
 
 // Project include(s)
+#include "traccc/clusterization/clustering_config.hpp"
 #include "traccc/clusterization/device/ccl_kernel.hpp"
 
 // System include(s).
 #include <algorithm>
+#include <mutex>
 
 namespace traccc::alpaka {
 
 struct CCLKernel {
     template <typename TAcc>
     ALPAKA_FN_ACC void operator()(
-        TAcc const& acc, const cell_collection_types::const_view cells_view,
+        TAcc const& acc, const clustering_config cfg,
+        const cell_collection_types::const_view cells_view,
         const cell_module_collection_types::const_view modules_view,
-        const device::details::index_t max_cells_per_partition,
-        const device::details::index_t target_cells_per_partition,
+        vecmem::data::vector_view<device::details::index_t> f_backup_view,
+        vecmem::data::vector_view<device::details::index_t> gf_backup_view,
+        vecmem::data::vector_view<unsigned char> adjc_backup_view,
+        vecmem::data::vector_view<device::details::index_t> adjv_backup_view,
+        uint32_t* backup_mutex_ptr,
         measurement_collection_types::view measurements_view,
         vecmem::data::vector_view<unsigned int> cell_links) const {
 
@@ -37,34 +43,49 @@ struct CCLKernel {
             ::alpaka::getWorkDiv<::alpaka::Block, ::alpaka::Threads>(acc)[0u];
 
         auto& partition_start =
-            ::alpaka::declareSharedVar<unsigned int, __COUNTER__>(acc);
+            ::alpaka::declareSharedVar<std::size_t, __COUNTER__>(acc);
         auto& partition_end =
-            ::alpaka::declareSharedVar<unsigned int, __COUNTER__>(acc);
-        auto& outi = ::alpaka::declareSharedVar<unsigned int, __COUNTER__>(acc);
+            ::alpaka::declareSharedVar<std::size_t, __COUNTER__>(acc);
+        auto& outi = ::alpaka::declareSharedVar<std::size_t, __COUNTER__>(acc);
 
         device::details::index_t* const shared_v =
             ::alpaka::getDynSharedMem<device::details::index_t>(acc);
         vecmem::data::vector_view<device::details::index_t> f_view{
-            max_cells_per_partition, shared_v};
+            static_cast<unsigned int>(cfg.max_partition_size()), shared_v};
         vecmem::data::vector_view<device::details::index_t> gf_view{
-            max_cells_per_partition, shared_v + max_cells_per_partition};
+            static_cast<unsigned int>(cfg.max_partition_size()),
+            shared_v + cfg.max_partition_size()};
+
+        vecmem::device_atomic_ref<uint32_t> backup_mutex(*backup_mutex_ptr);
 
         alpaka::barrier<TAcc> barry_r(&acc);
 
-        device::ccl_kernel(localThreadIdx, blockExtent, localBlockIdx,
-                           cells_view, modules_view, max_cells_per_partition,
-                           target_cells_per_partition, partition_start,
-                           partition_end, outi, f_view, gf_view, barry_r,
-                           measurements_view, cell_links);
+        device::ccl_kernel(
+            cfg, localThreadIdx, blockExtent, localBlockIdx, cells_view,
+            modules_view, partition_start, partition_end, outi, f_view, gf_view,
+            f_backup_view, gf_backup_view, adjc_backup_view, adjv_backup_view,
+            backup_mutex, barry_r, measurements_view, cell_links);
+    }
+};
+
+struct ZeroMutexKernel {  // Kernel functor: stores 0 through `ptr`.
+    template <typename TAcc>
+    ALPAKA_FN_ACC void operator()(TAcc const&, uint32_t* ptr) const {
+        *ptr = 0;  // Unconditional store; the accelerator handle is unused.
+    }
 };
 
 clusterization_algorithm::clusterization_algorithm(
     const traccc::memory_resource& mr, vecmem::copy& copy,
-    const unsigned short target_cells_per_partition)
-    : m_target_cells_per_partition(target_cells_per_partition),
+    const config_type& config)
+    : m_config(config),  // Own copy; initializers below read m_config.
       m_mr(mr),
-      m_copy(copy) {}
+      m_copy(copy),
+      m_f_backup(m_config.backup_size(), m_mr.main),
+      m_gf_backup(m_config.backup_size(), m_mr.main),
+      m_adjc_backup(m_config.backup_size(), m_mr.main),
+      m_adjv_backup(m_config.backup_size() * 8, m_mr.main),  // TODO: document 8
+      m_backup_mutex(vecmem::make_unique_alloc<unsigned int>(m_mr.main)) {}
 
 clusterization_algorithm::output_type clusterization_algorithm::operator()(
     const cell_collection_types::const_view& cells,
@@ -74,6 +95,13 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()(
     auto devAcc = ::alpaka::getDevByIdx(::alpaka::Platform<Acc>{}, 0u);
     auto queue = Queue{devAcc};
 
+    // Set up the mutex, if it is not already set up.
+    std::call_once(m_setup_once, [&queue, mutex_ptr = m_backup_mutex.get()]() {
+        auto workDiv = makeWorkDiv<Acc>(1, 1);
+        ::alpaka::exec<Acc>(queue, workDiv, ZeroMutexKernel{}, mutex_ptr);
+        ::alpaka::wait(queue);
+    });
+
     // Number of cells
     const cell_collection_types::view::size_type num_cells =
         m_copy.get().get_size(cells);
@@ -97,18 +125,20 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()(
     m_copy.get().setup(cell_links)->ignore();
 
     // Launch ccl kernel. Each thread will handle a single cell.
-    const device::details::ccl_kernel_helper helper{
-        m_target_cells_per_partition, num_cells};
+    std::size_t num_blocks =
+        (num_cells + (m_config.target_partition_size()) - 1) /
+        m_config.target_partition_size();
     static_assert(accSupportsMultiThreadBlocks<Acc>(),
                   "Clustering algorithm must be compiled for an accelerator "
                   "with support for multi-thread blocks.");
-    auto workDiv =
-        makeWorkDiv<Acc>(helper.num_partitions, helper.threads_per_partition);
+    auto workDiv = makeWorkDiv<Acc>(num_blocks, m_config.threads_per_partition);
 
     ::alpaka::exec<Acc>(
-        queue, workDiv, CCLKernel{}, cells, modules,
-        helper.max_cells_per_partition, m_target_cells_per_partition,
-        vecmem::get_data(measurements), vecmem::get_data(cell_links));
+        queue, workDiv, CCLKernel{}, m_config, cells, modules,
+        vecmem::get_data(m_f_backup), vecmem::get_data(m_gf_backup),
+        vecmem::get_data(m_adjc_backup), vecmem::get_data(m_adjv_backup),
+        m_backup_mutex.get(), vecmem::get_data(measurements),
+        vecmem::get_data(cell_links));
     ::alpaka::wait(queue);
 
     return measurements;
@@ -125,12 +155,9 @@ struct BlockSharedMemDynSizeBytes<traccc::alpaka::CCLKernel, TAcc> {
     ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(
         traccc::alpaka::CCLKernel const& /* kernel */,
         TVec const& /* blockThreadExtent */, TVec const& /* threadElemExtent */,
-        const traccc::cell_collection_types::const_view /* cells_view */,
-        const traccc::cell_module_collection_types::
-            const_view /* modules_view */,
-        const unsigned short max_cells_per_partition, TArgs const&... /* args */
+        const traccc::clustering_config config, TArgs const&... /* args */
         ) -> std::size_t {
-        return static_cast<std::size_t>(2 * max_cells_per_partition *
+        return static_cast<std::size_t>(2 * config.max_partition_size() *
                                         sizeof(unsigned short));
     }
 };

diff --git a/device/common/include/traccc/clusterization/device/aggregate_cluster.hpp b/device/common/include/traccc/clusterization/device/aggregate_cluster.hpp
@@ -33,8 +33,8 @@ TRACCC_HOST_DEVICE
 inline void aggregate_cluster(
     const cell_collection_types::const_device& cells,
     const cell_module_collection_types::const_device& modules,
-    const vecmem::data::vector_view<const unsigned short>& f_view,
-    unsigned int start, unsigned int end, unsigned short cid, measurement& out,
+    const vecmem::device_vector<details::index_t>& f_view, unsigned int start,
+    unsigned int end, unsigned short cid, measurement& out,
     vecmem::data::vector_view<unsigned int> cell_links, unsigned int link);
 
 }  // namespace traccc::device