Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SYCL Tweaks, main branch (2024.12.07.) #799

Merged
merged 4 commits into from
Dec 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions device/alpaka/src/utils/barrier.hpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/** TRACCC library, part of the ACTS project (R&D line)
*
* (c) 2023 CERN for the benefit of the ACTS project
* (c) 2023-2024 CERN for the benefit of the ACTS project
*
* Mozilla Public License Version 2.0
*/
Expand All @@ -21,22 +21,22 @@ struct barrier {
ALPAKA_FN_INLINE ALPAKA_FN_ACC barrier(const TAcc* acc) : m_acc(acc){};

ALPAKA_FN_ACC
void blockBarrier() { ::alpaka::syncBlockThreads(*m_acc); }
void blockBarrier() const { ::alpaka::syncBlockThreads(*m_acc); }

ALPAKA_FN_ACC
bool blockOr(bool predicate) {
bool blockOr(bool predicate) const {
return ::alpaka::syncBlockThreadsPredicate<::alpaka::BlockOr>(
*m_acc, predicate);
}

ALPAKA_FN_ACC
bool blockAnd(bool predicate) {
bool blockAnd(bool predicate) const {
return ::alpaka::syncBlockThreadsPredicate<::alpaka::BlockAnd>(
*m_acc, predicate);
}

ALPAKA_FN_ACC
bool blockCount(int threadCount) {
bool blockCount(int threadCount) const {
return ::alpaka::syncBlockThreadsPredicate<::alpaka::BlockCount>(
*m_acc, threadCount);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ TRACCC_DEVICE inline void ccl_kernel(
vecmem::data::vector_view<details::index_t> gf_backup_view,
vecmem::data::vector_view<unsigned char> adjc_backup_view,
vecmem::data::vector_view<details::index_t> adjv_backup_view,
vecmem::device_atomic_ref<uint32_t> backup_mutex, barrier_t& barrier,
vecmem::device_atomic_ref<uint32_t> backup_mutex, const barrier_t& barrier,
measurement_collection_types::view measurements_view,
vecmem::data::vector_view<unsigned int> cell_links);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ TRACCC_DEVICE inline void ccl_core(
const edm::silicon_cell_collection::const_device& cells_device,
const silicon_detector_description::const_device& det_descr,
measurement_collection_types::device measurements_device,
barrier_t& barrier) {
const barrier_t& barrier) {
const auto size =
static_cast<details::index_t>(partition_end - partition_start);

Expand Down Expand Up @@ -224,7 +224,7 @@ TRACCC_DEVICE inline void ccl_kernel(
vecmem::data::vector_view<details::index_t> gf_backup_view,
vecmem::data::vector_view<unsigned char> adjc_backup_view,
vecmem::data::vector_view<details::index_t> adjv_backup_view,
vecmem::device_atomic_ref<uint32_t> backup_mutex, barrier_t& barrier,
vecmem::device_atomic_ref<uint32_t> backup_mutex, const barrier_t& barrier,
measurement_collection_types::view measurements_view,
vecmem::data::vector_view<unsigned int> cell_links) {

Expand Down
5 changes: 3 additions & 2 deletions device/common/include/traccc/device/sort.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,9 @@ TRACCC_DEVICE void swap(T& a, T& b) {
*/
template <concepts::thread_id1 T, concepts::barrier B, std::movable K,
std::strict_weak_order<K, K> C>
TRACCC_DEVICE void blockOddEvenSort(T& thread_id, B& barrier, K* keys,
uint32_t num_keys, C&& comparison) {
TRACCC_DEVICE void blockOddEvenSort(const T& thread_id, const B& barrier,
K* keys, uint32_t num_keys,
C&& comparison) {
bool sorted;

do {
Expand Down
23 changes: 11 additions & 12 deletions device/cuda/src/utils/barrier.hpp
Original file line number Diff line number Diff line change
@@ -1,29 +1,28 @@
/** TRACCC library, part of the ACTS project (R&D line)
*
* (c) 2023 CERN for the benefit of the ACTS project
* (c) 2023-2024 CERN for the benefit of the ACTS project
*
* Mozilla Public License Version 2.0
*/

#pragma once

// Project include(s).
#include "traccc/definitions/qualifiers.hpp"

namespace traccc::cuda {

struct barrier {
TRACCC_DEVICE
void blockBarrier() { __syncthreads(); }
__device__ inline void blockBarrier() const { __syncthreads(); }

TRACCC_DEVICE
bool blockAnd(bool predicate) { return __syncthreads_and(predicate); }
__device__ inline bool blockAnd(bool predicate) const {
return __syncthreads_and(predicate);
}

TRACCC_DEVICE
bool blockOr(bool predicate) { return __syncthreads_or(predicate); }
__device__ inline bool blockOr(bool predicate) const {
return __syncthreads_or(predicate);
}

TRACCC_DEVICE
int blockCount(bool predicate) { return __syncthreads_count(predicate); }
__device__ inline int blockCount(bool predicate) const {
return __syncthreads_count(predicate);
}
Comment on lines -17 to +25
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, why? The previous version felt more consistent with the rest of the code.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why should we use a macro in a file that can only be used in CUDA device code? Much like how I removed TRACCC_DEVICE from the SYCL code (as it served really no purpose there), I thought this would also be a good simplification.

The macros are really only needed in the shared code. CUDA specific code should be free to use __device__, __global__, etc.

};

} // namespace traccc::cuda
43 changes: 23 additions & 20 deletions device/sycl/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,44 +13,47 @@ enable_language( SYCL )

# Set up the build of the traccc::sycl library.
traccc_add_library( traccc_sycl sycl TYPE SHARED
# Spacepoint formation algorithm.
# Clusterization algorithm(s).
"include/traccc/sycl/clusterization/clusterization_algorithm.hpp"
"src/clusterization/clusterization_algorithm.sycl"
# Seeding algorithm(s).
"include/traccc/sycl/seeding/silicon_pixel_spacepoint_formation_algorithm.hpp"
"src/seeding/silicon_pixel_spacepoint_formation_algorithm.cpp"
"src/seeding/silicon_pixel_spacepoint_formation_algorithm_default_detector.sycl"
"src/seeding/silicon_pixel_spacepoint_formation_algorithm_telescope_detector.sycl"
"src/seeding/silicon_pixel_spacepoint_formation.hpp"
# Track fitting algorithm.
"include/traccc/sycl/seeding/spacepoint_binning.hpp"
"src/seeding/spacepoint_binning.sycl"
"include/traccc/sycl/seeding/seed_finding.hpp"
"src/seeding/seed_finding.sycl"
"include/traccc/sycl/seeding/seeding_algorithm.hpp"
"src/seeding/seeding_algorithm.cpp"
"include/traccc/sycl/seeding/track_params_estimation.hpp"
"src/seeding/track_params_estimation.sycl"
# Track fitting algorithm(s).
"include/traccc/sycl/fitting/kalman_fitting_algorithm.hpp"
"src/fitting/kalman_fitting_algorithm.cpp"
"src/fitting/kalman_fitting_algorithm_constant_field_default_detector.sycl"
"src/fitting/kalman_fitting_algorithm_constant_field_telescope_detector.sycl"
"src/fitting/fit_tracks.hpp"
# header files
"include/traccc/sycl/seeding/seeding_algorithm.hpp"
"include/traccc/sycl/seeding/seed_finding.hpp"
"include/traccc/sycl/seeding/spacepoint_binning.hpp"
"include/traccc/sycl/seeding/track_params_estimation.hpp"
"include/traccc/sycl/utils/queue_wrapper.hpp"
"include/traccc/sycl/utils/calculate1DimNdRange.hpp"
# Utilities.
"include/traccc/sycl/utils/make_prefix_sum_buff.hpp"
# implementation files
"src/clusterization/clusterization_algorithm.sycl"
"src/seeding/seed_finding.sycl"
"src/seeding/seeding_algorithm.cpp"
"src/seeding/spacepoint_binning.sycl"
"src/seeding/track_params_estimation.sycl"
"src/utils/make_prefix_sum_buff.sycl"
"include/traccc/sycl/utils/queue_wrapper.hpp"
"src/utils/queue_wrapper.cpp"
"src/utils/barrier.hpp"
"src/utils/calculate1DimNdRange.hpp"
"src/utils/calculate1DimNdRange.sycl"
"src/utils/get_queue.hpp"
"src/utils/get_queue.sycl"
"src/utils/queue_wrapper.cpp"
"src/utils/calculate1DimNdRange.sycl"
"src/utils/make_prefix_sum_buff.sycl"
"src/utils/thread_id.hpp"
"src/sanity/contiguous_on.hpp"
"src/sanity/ordered_on.hpp"
)
target_link_libraries( traccc_sycl
PUBLIC traccc::core detray::core vecmem::core covfie::core
PRIVATE traccc::device_common vecmem::sycl oneDPL )
PUBLIC detray::core vecmem::core covfie::core
traccc::core traccc::device_common
PRIVATE vecmem::sycl oneDPL )

# Set up Thrust specifically for the traccc::sycl library.
thrust_create_target( traccc::sycl_thrust
Expand Down
48 changes: 0 additions & 48 deletions device/sycl/include/traccc/sycl/utils/thread_id.hpp

This file was deleted.

15 changes: 8 additions & 7 deletions device/sycl/src/clusterization/clusterization_algorithm.sycl
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@
#include "../sanity/ordered_on.hpp"
#include "../utils/barrier.hpp"
#include "../utils/get_queue.hpp"
#include "../utils/thread_id.hpp"
#include "traccc/clusterization/device/ccl_kernel_definitions.hpp"
#include "traccc/sycl/clusterization/clusterization_algorithm.hpp"
#include "traccc/sycl/utils/thread_id.hpp"
#include "traccc/utils/projections.hpp"
#include "traccc/utils/relations.hpp"

Expand Down Expand Up @@ -77,10 +77,11 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()(
}

assert(is_contiguous_on<edm::silicon_cell_collection::const_device>(
cell_module_projection(), m_mr.main, m_copy, m_queue, cells_view));
cell_module_projection(), m_mr.main, m_copy,
details::get_queue(m_queue), cells_view));
assert(is_ordered_on<edm::silicon_cell_collection::const_device>(
channel0_major_cell_order_relation(), m_mr.main, m_copy, m_queue,
cells_view));
channel0_major_cell_order_relation(), m_mr.main, m_copy,
details::get_queue(m_queue), cells_view));

std::size_t num_blocks =
(num_cells + m_config.target_partition_size() - 1) /
Expand Down Expand Up @@ -153,15 +154,15 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()(
*mutex_ptr);

// Barrier used in the algorithm.
traccc::sycl::barrier barry_r(item);
const sycl::thread_id1 thread_id(item);
const details::barrier barrier{item};
const details::thread_id thread_id{item};

// Run the algorithm for this thread.
device::ccl_kernel(cfg, thread_id, cells_view, det_descr,
partition_start, partition_end, outi,
f_view, gf_view, f_backup_view,
gf_backup_view, adjc_backup_view,
adjv_backup_view, backup_mutex, barry_r,
adjv_backup_view, backup_mutex, barrier,
measurements_view, cell_links_view);
});
})
Expand Down
2 changes: 1 addition & 1 deletion device/sycl/src/fitting/fit_tracks.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
#pragma once

// Local include(s).
#include "traccc/sycl/utils/calculate1DimNdRange.hpp"
#include "../utils/calculate1DimNdRange.hpp"

// Project include(s).
#include "traccc/edm/device/sort_key.hpp"
Expand Down
9 changes: 1 addition & 8 deletions device/sycl/src/sanity/contiguous_on.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,6 @@

#pragma once

// Project include(s).
#include "../utils/get_queue.hpp"
#include "traccc/sycl/utils/queue_wrapper.hpp"

// VecMem include(s).
#include <vecmem/memory/device_atomic_ref.hpp>
#include <vecmem/memory/memory_resource.hpp>
Expand Down Expand Up @@ -103,16 +99,13 @@ template <typename CONTAINER, std::semiregular P, typename VIEW>
requires std::regular_invocable<P,
decltype(std::declval<CONTAINER>().at(0))> bool
is_contiguous_on(P&& projection, vecmem::memory_resource& mr,
vecmem::copy& copy, queue_wrapper& queue_wrapper,
const VIEW& view) {
vecmem::copy& copy, ::sycl::queue& queue, const VIEW& view) {

// This should never be a performance-critical step, so we can keep the
// block size fixed.
constexpr int local_size = 512;
constexpr int local_size_2d = 32;

::sycl::queue& queue = details::get_queue(queue_wrapper);

// Grab the number of elements in our vector.
const typename VIEW::size_type n = copy.get_size(view);

Expand Down
4 changes: 1 addition & 3 deletions device/sycl/src/sanity/ordered_on.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -92,14 +92,12 @@ template <typename CONTAINER, std::semiregular R, typename VIEW>
requires std::regular_invocable<R, decltype(std::declval<CONTAINER>().at(0)),
decltype(std::declval<CONTAINER>().at(0))> bool
is_ordered_on(R&& relation, vecmem::memory_resource& mr, vecmem::copy& copy,
queue_wrapper& queue_wrapper, const VIEW& view) {
::sycl::queue& queue, const VIEW& view) {

// This should never be a performance-critical step, so we can keep the
// block size fixed.
constexpr int block_size = 512;

::sycl::queue& queue = details::get_queue(queue_wrapper);

// Grab the number of elements in our container.
const typename VIEW::size_type n = copy.get_size(view);

Expand Down
Loading
Loading