Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve robustness and performance of CCL #595

Merged
merged 1 commit into from
Jul 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 78 additions & 0 deletions core/include/traccc/clusterization/clustering_config.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
/**
* traccc library, part of the ACTS project (R&D line)
*
* (c) 2024 CERN for the benefit of the ACTS project
*
* Mozilla Public License Version 2.0
*/

#pragma once

#include <cstddef>
#include <cstdint>

#include "traccc/definitions/qualifiers.hpp"

namespace traccc {
/**
* @brief Configuration type for massively parallel clustering algorithms.
*/
struct clustering_config {
    /**
     * @brief The desired number of threads per partition.
     *
     * This directly correlates to the block size on most algorithms, so don't
     * set this too low (which will reduce occupancy due to available thread
     * slots) or too high (which may not be supported on a device).
     */
    unsigned int threads_per_partition;

    /**
     * @brief The maximum number of cells per thread.
     *
     * This sets the maximum thread coarsening factor for the CCA algorithm.
     * Increasing this value increases shared memory usage and may decrease
     * occupancy. If this is too low, scratch space will need to be used which
     * may slow the algorithm down.
     */
    unsigned int max_cells_per_thread;

    /**
     * @brief The desired number of cells per thread.
     *
     * This sets the desired thread coarsening factor for the CCA algorithm.
     * Decreasing this may decrease occupancy. Increasing this increases the
     * probability that scratch space will need to be used.
     */
    unsigned int target_cells_per_thread;

    /**
     * @brief The upscaling factor for the scratch space.
     *
     * The scratch space will be large enough to support partitions this number
     * of times larger than the maximum partition size determined by
     * `threads_per_partition` and `max_cells_per_thread`.
     */
    unsigned int backup_size_multiplier;

    /**
     * @brief The maximum number of cells per partition.
     *
     * @return `threads_per_partition * max_cells_per_thread`, evaluated in
     *         `std::size_t` arithmetic.
     */
    TRACCC_HOST_DEVICE constexpr std::size_t max_partition_size() const {
        // Widen before multiplying: a product of two `unsigned int` values
        // would otherwise wrap modulo 2^32 before being converted to the
        // return type.
        return static_cast<std::size_t>(threads_per_partition) *
               static_cast<std::size_t>(max_cells_per_thread);
    }

    /**
     * @brief The target number of cells per partition.
     *
     * @return `threads_per_partition * target_cells_per_thread`, evaluated in
     *         `std::size_t` arithmetic.
     */
    TRACCC_HOST_DEVICE constexpr std::size_t target_partition_size() const {
        // Same widening as in max_partition_size() to avoid 32-bit wrap.
        return static_cast<std::size_t>(threads_per_partition) *
               static_cast<std::size_t>(target_cells_per_thread);
    }

    /**
     * @brief The total size of the scratch space, in number of cells.
     *
     * @return `max_partition_size() * backup_size_multiplier`.
     */
    TRACCC_HOST_DEVICE constexpr std::size_t backup_size() const {
        return max_partition_size() *
               static_cast<std::size_t>(backup_size_multiplier);
    }
};
} // namespace traccc
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

// System include(s).
#include <functional>
#include <variant>

namespace traccc::host {

Expand All @@ -33,6 +34,8 @@ class clusterization_algorithm
const cell_module_collection_types::const_view&)> {

public:
using config_type = std::monostate;

/// Clusterization algorithm constructor
///
/// @param mr The memory resource to use for the result objects
Expand Down
4 changes: 2 additions & 2 deletions device/alpaka/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
include( traccc-alpaka-functions )
include( traccc-compiler-options-cpp )

set(PUBLIC_LIBRARIES traccc::core detray::core detray::utils vecmem::core covfie::core)
set(PRIVATE_LIBRARIES traccc::device_common alpaka::alpaka traccc::Thrust)
set(PUBLIC_LIBRARIES traccc::core traccc::device_common detray::core detray::utils vecmem::core covfie::core)
set(PRIVATE_LIBRARIES alpaka::alpaka traccc::Thrust)

if(alpaka_ACC_GPU_CUDA_ENABLE)
enable_language(CUDA)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
#pragma once

// Project include(s).
#include "traccc/clusterization/clustering_config.hpp"
#include "traccc/clusterization/device/ccl_kernel_definitions.hpp"
#include "traccc/edm/cell.hpp"
#include "traccc/edm/measurement.hpp"
#include "traccc/edm/spacepoint.hpp"
Expand All @@ -17,6 +19,9 @@
// VecMem include(s).
#include <vecmem/utils/copy.hpp>

// System includes
#include <mutex>

namespace traccc::alpaka {

/// Algorithm performing hit clusterization
Expand All @@ -33,6 +38,9 @@ class clusterization_algorithm
const cell_module_collection_types::const_view&)> {

public:
/// Configuration type
using config_type = clustering_config;

/// Constructor for clusterization algorithm
///
/// @param mr The memory resource(s) to use in the algorithm
Expand All @@ -42,8 +50,7 @@ class clusterization_algorithm
/// partition
///
clusterization_algorithm(const traccc::memory_resource& mr,
vecmem::copy& copy,
const unsigned short target_cells_per_partition);
vecmem::copy& copy, const config_type& config);

/// Callable operator for clusterization algorithm
///
Expand All @@ -57,12 +64,18 @@ class clusterization_algorithm

private:
/// The average number of cells in each partition
using config_type = unsigned short;
config_type m_target_cells_per_partition;
config_type m_config;
/// The memory resource(s) to use
traccc::memory_resource m_mr;
/// The copy object to use
std::reference_wrapper<vecmem::copy> m_copy;
/// Memory reserved for edge cases
vecmem::data::vector_buffer<device::details::index_t> m_f_backup,
m_gf_backup;
vecmem::data::vector_buffer<unsigned char> m_adjc_backup;
vecmem::data::vector_buffer<device::details::index_t> m_adjv_backup;
vecmem::unique_alloc_ptr<unsigned int> m_backup_mutex;
mutable std::once_flag m_setup_once;
};

} // namespace traccc::alpaka
83 changes: 55 additions & 28 deletions device/alpaka/src/clusterization/clusterization_algorithm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,20 +12,26 @@
#include "../utils/utils.hpp"

// Project include(s)
#include "traccc/clusterization/clustering_config.hpp"
#include "traccc/clusterization/device/ccl_kernel.hpp"

// System include(s).
#include <algorithm>
#include <mutex>

namespace traccc::alpaka {

struct CCLKernel {
template <typename TAcc>
ALPAKA_FN_ACC void operator()(
TAcc const& acc, const cell_collection_types::const_view cells_view,
TAcc const& acc, const clustering_config cfg,
const cell_collection_types::const_view cells_view,
const cell_module_collection_types::const_view modules_view,
const device::details::index_t max_cells_per_partition,
const device::details::index_t target_cells_per_partition,
vecmem::data::vector_view<device::details::index_t> f_backup_view,
vecmem::data::vector_view<device::details::index_t> gf_backup_view,
vecmem::data::vector_view<unsigned char> adjc_backup_view,
vecmem::data::vector_view<device::details::index_t> adjv_backup_view,
uint32_t* backup_mutex_ptr,
measurement_collection_types::view measurements_view,
vecmem::data::vector_view<unsigned int> cell_links) const {

Expand All @@ -37,34 +43,49 @@ struct CCLKernel {
::alpaka::getWorkDiv<::alpaka::Block, ::alpaka::Threads>(acc)[0u];

auto& partition_start =
::alpaka::declareSharedVar<unsigned int, __COUNTER__>(acc);
::alpaka::declareSharedVar<std::size_t, __COUNTER__>(acc);
auto& partition_end =
::alpaka::declareSharedVar<unsigned int, __COUNTER__>(acc);
auto& outi = ::alpaka::declareSharedVar<unsigned int, __COUNTER__>(acc);
::alpaka::declareSharedVar<std::size_t, __COUNTER__>(acc);
auto& outi = ::alpaka::declareSharedVar<std::size_t, __COUNTER__>(acc);

device::details::index_t* const shared_v =
::alpaka::getDynSharedMem<device::details::index_t>(acc);
vecmem::data::vector_view<device::details::index_t> f_view{
max_cells_per_partition, shared_v};
static_cast<unsigned int>(cfg.max_partition_size()), shared_v};
vecmem::data::vector_view<device::details::index_t> gf_view{
max_cells_per_partition, shared_v + max_cells_per_partition};
static_cast<unsigned int>(cfg.max_partition_size()),
shared_v + cfg.max_partition_size()};

vecmem::device_atomic_ref<uint32_t> backup_mutex(*backup_mutex_ptr);

alpaka::barrier<TAcc> barry_r(&acc);

device::ccl_kernel(localThreadIdx, blockExtent, localBlockIdx,
cells_view, modules_view, max_cells_per_partition,
target_cells_per_partition, partition_start,
partition_end, outi, f_view, gf_view, barry_r,
measurements_view, cell_links);
device::ccl_kernel(
cfg, localThreadIdx, blockExtent, localBlockIdx, cells_view,
modules_view, partition_start, partition_end, outi, f_view, gf_view,
f_backup_view, gf_backup_view, adjc_backup_view, adjv_backup_view,
backup_mutex, barry_r, measurements_view, cell_links);
}
};

/// Alpaka kernel functor that clears a single 32-bit word.
///
/// It is launched once to reset the word used as the backup mutex.
struct ZeroMutexKernel {
    /// Write zero to the word behind @c ptr
    ///
    /// @param ptr Address of the word to reset (the accelerator handle
    ///            passed as first argument is not used)
    template <typename TAcc>
    ALPAKA_FN_ACC void operator()(TAcc const& /* acc */, uint32_t* ptr) const {
        ptr[0] = 0u;
    }
};

clusterization_algorithm::clusterization_algorithm(
const traccc::memory_resource& mr, vecmem::copy& copy,
const unsigned short target_cells_per_partition)
: m_target_cells_per_partition(target_cells_per_partition),
const config_type& config)
: m_config(config),
m_mr(mr),
m_copy(copy) {}
m_copy(copy),
m_f_backup(m_config.backup_size(), m_mr.main),
m_gf_backup(m_config.backup_size(), m_mr.main),
m_adjc_backup(m_config.backup_size(), m_mr.main),
m_adjv_backup(m_config.backup_size() * 8, m_mr.main),
m_backup_mutex(vecmem::make_unique_alloc<unsigned int>(m_mr.main)) {}

clusterization_algorithm::output_type clusterization_algorithm::operator()(
const cell_collection_types::const_view& cells,
Expand All @@ -74,6 +95,13 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()(
auto devAcc = ::alpaka::getDevByIdx(::alpaka::Platform<Acc>{}, 0u);
auto queue = Queue{devAcc};

// Setup the mutex, if it is not already setup.
std::call_once(m_setup_once, [&queue, mutex_ptr = m_backup_mutex.get()]() {
auto workDiv = makeWorkDiv<Acc>(1, 1);
::alpaka::exec<Acc>(queue, workDiv, ZeroMutexKernel{}, mutex_ptr);
::alpaka::wait(queue);
});

// Number of cells
const cell_collection_types::view::size_type num_cells =
m_copy.get().get_size(cells);
Expand All @@ -97,18 +125,20 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()(
m_copy.get().setup(cell_links)->ignore();

// Launch ccl kernel. Each thread will handle a single cell.
const device::details::ccl_kernel_helper helper{
m_target_cells_per_partition, num_cells};
std::size_t num_blocks =
(num_cells + (m_config.target_partition_size()) - 1) /
m_config.target_partition_size();
static_assert(accSupportsMultiThreadBlocks<Acc>(),
"Clustering algorithm must be compiled for an accelerator "
"with support for multi-thread blocks.");
auto workDiv =
makeWorkDiv<Acc>(helper.num_partitions, helper.threads_per_partition);
auto workDiv = makeWorkDiv<Acc>(num_blocks, m_config.threads_per_partition);

::alpaka::exec<Acc>(
queue, workDiv, CCLKernel{}, cells, modules,
helper.max_cells_per_partition, m_target_cells_per_partition,
vecmem::get_data(measurements), vecmem::get_data(cell_links));
queue, workDiv, CCLKernel{}, m_config, cells, modules,
vecmem::get_data(m_f_backup), vecmem::get_data(m_gf_backup),
vecmem::get_data(m_adjc_backup), vecmem::get_data(m_adjv_backup),
m_backup_mutex.get(), vecmem::get_data(measurements),
vecmem::get_data(cell_links));
::alpaka::wait(queue);

return measurements;
Expand All @@ -125,12 +155,9 @@ struct BlockSharedMemDynSizeBytes<traccc::alpaka::CCLKernel, TAcc> {
ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(
traccc::alpaka::CCLKernel const& /* kernel */,
TVec const& /* blockThreadExtent */, TVec const& /* threadElemExtent */,
const traccc::cell_collection_types::const_view /* cells_view */,
const traccc::cell_module_collection_types::
const_view /* modules_view */,
const unsigned short max_cells_per_partition, TArgs const&... /* args */
const traccc::clustering_config config, TArgs const&... /* args */
) -> std::size_t {
return static_cast<std::size_t>(2 * max_cells_per_partition *
return static_cast<std::size_t>(2 * config.max_partition_size() *
sizeof(unsigned short));
}
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@ TRACCC_HOST_DEVICE
inline void aggregate_cluster(
const cell_collection_types::const_device& cells,
const cell_module_collection_types::const_device& modules,
const vecmem::data::vector_view<const unsigned short>& f_view,
unsigned int start, unsigned int end, unsigned short cid, measurement& out,
const vecmem::device_vector<details::index_t>& f_view, unsigned int start,
unsigned int end, unsigned short cid, measurement& out,
vecmem::data::vector_view<unsigned int> cell_links, unsigned int link);

} // namespace traccc::device
Expand Down
Loading
Loading