Skip to content

Commit 57096f8

Browse files
committed
Improve robustness and performance of CCL
This commit partially addresses #567. In the past, the CCL kernel was unable to deal with extremely large partitions. Although such partitions are very unlikely to occur, our ODD samples contain a few that are so large that they crash the code. This commit equips the CCL code with some scratch memory which it can reserve using a mutex. This gives it enough space to do its work in global memory. Although this is, of course, slower, it should happen very infrequently. Parameters can be tuned to determine that frequency. This commit also contains a few optimizations to the code which reduce the running time on a μ = 200 event from about 1100 microseconds to 700 microseconds on an RTX A5000.
1 parent 15baf6d commit 57096f8

21 files changed

+291
-102
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
/**
2+
* traccc library, part of the ACTS project (R&D line)
3+
*
4+
* (c) 2024 CERN for the benefit of the ACTS project
5+
*
6+
* Mozilla Public License Version 2.0
7+
*/
8+
9+
#pragma once
10+
11+
namespace traccc {
12+
/// Bundle of tunable parameters for the clusterization code.
///
/// Holds three unsigned integer settings. The constructor copies each of its
/// arguments into the member of the same name (without the leading
/// underscore).
struct clustering_config {
13+
/// Construct a configuration; every argument has a default value.
///
/// @param _target_cells_per_partition Value stored in
///        @c target_cells_per_partition (default: 2048)
/// @param _target_cells_per_thread Value stored in
///        @c target_cells_per_thread (default: 8)
/// @param _backup_partition_size Value stored in
///        @c backup_partition_size (default: 256 * 4096 = 1048576)
clustering_config(unsigned int _target_cells_per_partition = 2048,
14+
unsigned int _target_cells_per_thread = 8,
15+
unsigned int _backup_partition_size = 256 * 4096)
16+
: target_cells_per_partition(_target_cells_per_partition),
17+
target_cells_per_thread(_target_cells_per_thread),
18+
backup_partition_size(_backup_partition_size) {}
19+
20+
/// Presumably the target average number of cells per partition (thread
/// block) -- TODO confirm against the kernel launch code.
unsigned int target_cells_per_partition;
21+
/// Presumably the target average number of cells handled by one thread
/// -- TODO confirm against the kernel launch code.
unsigned int target_cells_per_thread;
22+
/// Presumably the capacity of the global-memory backup storage used for
/// partitions that are too large for the regular storage -- TODO confirm
/// the unit (cells vs. bytes) against the code that allocates it.
unsigned int backup_partition_size;
23+
};
24+
} // namespace traccc

core/include/traccc/clusterization/clusterization_algorithm.hpp

+3
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
#pragma once
99

1010
// Library include(s).
11+
#include "traccc/clusterization/clustering_config.hpp"
1112
#include "traccc/clusterization/measurement_creation_algorithm.hpp"
1213
#include "traccc/clusterization/sparse_ccl_algorithm.hpp"
1314
#include "traccc/edm/cell.hpp"
@@ -33,6 +34,8 @@ class clusterization_algorithm
3334
const cell_module_collection_types::const_view&)> {
3435

3536
public:
37+
using config_type = clustering_config;
38+
3639
/// Clusterization algorithm constructor
3740
///
3841
/// @param mr The memory resource to use for the result objects

device/common/include/traccc/clusterization/device/ccl_kernel.hpp

+29-17
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
#pragma once
99

1010
// Project include(s).
11+
#include "traccc/clusterization/device/ccl_kernel_definitions.hpp"
1112
#include "traccc/definitions/qualifiers.hpp"
1213
#include "traccc/edm/cell.hpp"
1314
#include "traccc/edm/measurement.hpp"
@@ -23,33 +24,29 @@
2324
namespace traccc::device {
2425

2526
namespace details {
26-
27-
/// These indices in clusterization will only range from 0 to
28-
/// max_cells_per_partition, so we only need a short
29-
using index_t = unsigned short;
30-
31-
static constexpr int TARGET_CELLS_PER_THREAD = 8;
32-
static constexpr int MAX_CELLS_PER_THREAD = 32;
27+
static constexpr int MAX_CELLS_PER_THREAD = 16;
3328

3429
/// Helper struct for calculating some of the input parameters of @c ccl_kernel
3530
struct ccl_kernel_helper {
36-
3731
/// Constructor setting the helper parameters
3832
///
3933
/// @param[in] target_cells_per_partition Target average number of cells per
40-
/// thread block
34+
/// thread block
35+
/// @param[in] target_cells_per_thread Target average number of cells per
36+
/// thread
4137
/// @param[in] n_cells Total number of cells
42-
///
4338
ccl_kernel_helper(index_t target_cells_per_partition,
44-
unsigned int n_cells) {
45-
39+
index_t target_cells_per_thread, unsigned int n_cells) {
40+
/// Shared memory size
4641
max_cells_per_partition =
4742
(target_cells_per_partition * MAX_CELLS_PER_THREAD +
48-
TARGET_CELLS_PER_THREAD - 1) /
49-
TARGET_CELLS_PER_THREAD;
43+
target_cells_per_thread - 1) /
44+
target_cells_per_thread;
45+
/// Block size
5046
threads_per_partition =
51-
(target_cells_per_partition + TARGET_CELLS_PER_THREAD - 1) /
52-
TARGET_CELLS_PER_THREAD;
47+
(target_cells_per_partition + target_cells_per_thread - 1) /
48+
target_cells_per_thread;
49+
/// Grid size
5350
num_partitions = (n_cells + target_cells_per_partition - 1) /
5451
target_cells_per_partition;
5552
}
@@ -81,6 +78,16 @@ struct ccl_kernel_helper {
8178
/// @param f_view array of "parent" indices for all cells in this partition
8279
/// @param gf_view array of "grandparent" indices for all cells in this
8380
/// partition
81+
/// @param f_backup_view global memory alternative to `f_view` for cases in
82+
/// which that array is not large enough
83+
/// @param gf_backup_view global memory alternative to `gf_view` for cases in
84+
/// which that array is not large enough
85+
/// @param adjc_backup_view global memory alternative to the adjacent cell
86+
/// count vector
87+
/// @param adjv_backup_view global memory alternative to the cell adjacency
88+
/// matrix fragment storage
89+
/// @param backup_mutex mutex lock to mediate control over the backup global
90+
/// memory data structures.
8491
/// @param barrier A generic object for block-wide synchronisation
8592
/// @param[out] measurements_view collection of measurements
8693
/// @param[out] cell_links collection of links to measurements each cell is
@@ -94,7 +101,12 @@ TRACCC_DEVICE inline void ccl_kernel(
94101
const details::index_t target_cells_per_partition,
95102
unsigned int& partition_start, unsigned int& partition_end,
96103
unsigned int& outi, vecmem::data::vector_view<details::index_t> f_view,
97-
vecmem::data::vector_view<details::index_t> gf_view, barrier_t& barrier,
104+
vecmem::data::vector_view<details::index_t> gf_view,
105+
vecmem::data::vector_view<details::index_t> f_backup_view,
106+
vecmem::data::vector_view<details::index_t> gf_backup_view,
107+
vecmem::data::vector_view<unsigned char> adjc_backup_view,
108+
vecmem::data::vector_view<details::index_t> adjv_backup_view,
109+
vecmem::device_atomic_ref<uint32_t> backup_mutex, barrier_t& barrier,
98110
measurement_collection_types::view measurements_view,
99111
vecmem::data::vector_view<unsigned int> cell_links);
100112

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
/**
2+
* traccc library, part of the ACTS project (R&D line)
3+
*
4+
* (c) 2024 CERN for the benefit of the ACTS project
5+
*
6+
* Mozilla Public License Version 2.0
7+
*/
8+
9+
#pragma once
10+
11+
namespace traccc::device::details {
12+
/// These indices in clusterization will only range from 0 to
13+
/// max_cells_per_partition, so we only need a short
///
/// NOTE(review): an unsigned short is only guaranteed to represent values up
/// to 65535, while the default backup_partition_size in clustering_config is
/// 256 * 4096. Confirm that partitions handled through the global-memory
/// backup storage can never contain more cells than this type can index.
14+
using index_t = unsigned short;
15+
} // namespace traccc::device::details

device/common/include/traccc/clusterization/device/impl/ccl_kernel.ipp

+73-32
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
#include "traccc/clusterization/device/aggregate_cluster.hpp"
1111
#include "traccc/clusterization/device/reduce_problem_cell.hpp"
12+
#include "vecmem/memory/device_atomic_ref.hpp"
1213

1314
namespace traccc::device {
1415

@@ -33,13 +34,13 @@ namespace traccc::device {
3334
/// @param[in] barrier A generic object for block-wide synchronisation
3435
///
3536
template <typename barrier_t>
36-
TRACCC_DEVICE void fast_sv_1(
37-
vecmem::device_vector<details::index_t>& f,
38-
vecmem::device_vector<details::index_t>& gf,
39-
unsigned char adjc[details::MAX_CELLS_PER_THREAD],
40-
details::index_t adjv[details::MAX_CELLS_PER_THREAD][8],
41-
const details::index_t tid, const details::index_t blckDim,
42-
barrier_t& barrier) {
37+
TRACCC_DEVICE void fast_sv_1(vecmem::device_vector<details::index_t>& f,
38+
vecmem::device_vector<details::index_t>& gf,
39+
unsigned char* adjc, details::index_t* adjv,
40+
details::index_t thread_cell_count,
41+
const details::index_t tid,
42+
const details::index_t blckDim,
43+
barrier_t& barrier) {
4344
/*
4445
* The algorithm finishes if an iteration leaves the arrays unchanged.
4546
* This variable will be set if a change is made, and dictates if another
@@ -61,13 +62,12 @@ TRACCC_DEVICE void fast_sv_1(
6162
* cluster ID if it is lower than ours, essentially merging the two
6263
* together.
6364
*/
64-
for (details::index_t tst = 0; tst < details::MAX_CELLS_PER_THREAD;
65-
++tst) {
65+
for (details::index_t tst = 0; tst < thread_cell_count; ++tst) {
6666
const details::index_t cid = tst * blckDim + tid;
6767

6868
__builtin_assume(adjc[tst] <= 8);
6969
for (unsigned char k = 0; k < adjc[tst]; ++k) {
70-
details::index_t q = gf.at(adjv[tst][k]);
70+
details::index_t q = gf.at(adjv[8 * tst + k]);
7171

7272
if (gf.at(cid) > q) {
7373
f.at(f.at(cid)) = q;
@@ -82,9 +82,7 @@ TRACCC_DEVICE void fast_sv_1(
8282
*/
8383
barrier.blockBarrier();
8484

85-
#pragma unroll
86-
for (details::index_t tst = 0; tst < details::MAX_CELLS_PER_THREAD;
87-
++tst) {
85+
for (details::index_t tst = 0; tst < thread_cell_count; ++tst) {
8886
const details::index_t cid = tst * blckDim + tid;
8987
/*
9088
* The second stage is shortcutting, which is an optimisation that
@@ -101,9 +99,7 @@ TRACCC_DEVICE void fast_sv_1(
10199
*/
102100
barrier.blockBarrier();
103101

104-
#pragma unroll
105-
for (details::index_t tst = 0; tst < details::MAX_CELLS_PER_THREAD;
106-
++tst) {
102+
for (details::index_t tst = 0; tst < thread_cell_count; ++tst) {
107103
const details::index_t cid = tst * blckDim + tid;
108104
/*
109105
* Update the array for the next generation, keeping track of any
@@ -135,17 +131,24 @@ TRACCC_DEVICE inline void ccl_kernel(
135131
const details::index_t target_cells_per_partition,
136132
unsigned int& partition_start, unsigned int& partition_end,
137133
unsigned int& outi, vecmem::data::vector_view<details::index_t> f_view,
138-
vecmem::data::vector_view<details::index_t> gf_view, barrier_t& barrier,
134+
vecmem::data::vector_view<details::index_t> gf_view,
135+
vecmem::data::vector_view<details::index_t> f_backup_view,
136+
vecmem::data::vector_view<details::index_t> gf_backup_view,
137+
vecmem::data::vector_view<unsigned char> adjc_backup_view,
138+
vecmem::data::vector_view<details::index_t> adjv_backup_view,
139+
vecmem::device_atomic_ref<uint32_t> backup_mutex, barrier_t& barrier,
139140
measurement_collection_types::view measurements_view,
140141
vecmem::data::vector_view<unsigned int> cell_links) {
141-
142142
// Construct device containers around the views.
143143
const cell_collection_types::const_device cells_device(cells_view);
144144
const cell_module_collection_types::const_device modules_device(
145145
modules_view);
146146
measurement_collection_types::device measurements_device(measurements_view);
147147
vecmem::device_vector<details::index_t> f(f_view);
148148
vecmem::device_vector<details::index_t> gf(gf_view);
149+
vecmem::device_vector<unsigned char> adjc_backup(adjc_backup_view);
150+
vecmem::device_vector<details::index_t> adjv_backup(adjv_backup_view);
151+
bool using_backup_memory = false;
149152

150153
const cell_collection_types::const_device::size_type num_cells =
151154
cells_device.size();
@@ -199,41 +202,71 @@ TRACCC_DEVICE inline void ccl_kernel(
199202
barrier.blockBarrier();
200203

201204
// Vector of indices of the adjacent cells
202-
details::index_t adjv[details::MAX_CELLS_PER_THREAD][8];
205+
details::index_t _adjv[details::MAX_CELLS_PER_THREAD * 8];
206+
details::index_t* adjv = _adjv;
207+
203208
/*
204209
* The number of adjacent cells for each cell must start at zero, to
205210
* avoid uninitialized memory. adjv does not need to be zeroed, as
206211
* we will only access those values if adjc indicates that the value
207212
* is set.
208213
*/
209214
// Number of adjacent cells
210-
unsigned char adjc[details::MAX_CELLS_PER_THREAD];
215+
unsigned char _adjc[details::MAX_CELLS_PER_THREAD];
216+
unsigned char* adjc = _adjc;
211217

212218
// It seems that sycl runs into undefined behaviour when calling
213219
// group synchronisation functions when some threads have already run
214220
// into a return. As such, we cannot use returns in this kernel.
215221

216222
// Get partition for this thread group
217223
const details::index_t size = partition_end - partition_start;
218-
assert(size <= max_cells_per_partition);
219224

220-
#pragma unroll
221-
for (details::index_t tst = 0; tst < details::MAX_CELLS_PER_THREAD; ++tst) {
225+
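// If our partition is too large, we need to handle this specific edge
226+
// case. The first thread of the block will attempt to enter a critical
227+
// section by obtaining a lock on a mutex in global memory. Once the lock
228+
// is obtained, we can use scratch space in global memory instead of the
229+
// shared memory. This can be done more efficiently, but this should be a
230+
// very rare edge case.
// NOTE(review): the spin loop below keeps iterating while
// compare_exchange_strong() returns true. If that call follows
// std::atomic semantics (true on a successful exchange, with the observed
// value written back into `false_int` on failure), the loop would exit
// without holding the lock -- confirm against the vecmem implementation.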
231+
if (size > max_cells_per_partition) {
232+
if (threadId == 0) {
233+
uint32_t false_int = 0;
234+
while (backup_mutex.compare_exchange_strong(false_int, 1u)) {
235+
}
236+
}
237+
238+
barrier.blockBarrier();
239+
240+
f = f_backup_view;
241+
gf = gf_backup_view;
242+
adjc = adjc_backup.data();
243+
adjv = adjv_backup.data();
244+
using_backup_memory = true;
245+
}
246+
247+
assert(size <= f.size());
248+
assert(size <= gf.size());
249+
250+
details::index_t thread_cell_count = 0;
251+
for (details::index_t cid;
252+
(cid = thread_cell_count * blckDim + threadId) < size;
253+
++thread_cell_count) {
254+
}
255+
256+
for (details::index_t tst = 0; tst < thread_cell_count; ++tst) {
222257
adjc[tst] = 0;
223258
}
224259

225-
for (details::index_t tst = 0, cid; (cid = tst * blckDim + threadId) < size;
226-
++tst) {
260+
for (details::index_t tst = 0; tst < thread_cell_count; ++tst) {
227261
/*
228262
* Look for adjacent cells to the current one.
229263
*/
230-
assert(tst < details::MAX_CELLS_PER_THREAD);
264+
const details::index_t cid = tst * blckDim + threadId;
231265
reduce_problem_cell(cells_device, cid, partition_start, partition_end,
232-
adjc[tst], adjv[tst]);
266+
adjc[tst], &adjv[8 * tst]);
233267
}
234268

235-
#pragma unroll
236-
for (details::index_t tst = 0; tst < details::MAX_CELLS_PER_THREAD; ++tst) {
269+
for (details::index_t tst = 0; tst < thread_cell_count; ++tst) {
237270
const details::index_t cid = tst * blckDim + threadId;
238271
/*
239272
* At the start, the values of f and gf should be equal to the
@@ -253,12 +286,13 @@ TRACCC_DEVICE inline void ccl_kernel(
253286
* Run FastSV algorithm, which will update the father index to that of
254287
* the cell belonging to the same cluster with the lowest index.
255288
*/
256-
fast_sv_1(f, gf, adjc, adjv, threadId, blckDim, barrier);
289+
fast_sv_1(f, gf, &adjc[0], &adjv[0], thread_cell_count, threadId, blckDim,
290+
barrier);
257291

258292
barrier.blockBarrier();
259293

260-
for (details::index_t tst = 0, cid; (cid = tst * blckDim + threadId) < size;
261-
++tst) {
294+
for (details::index_t tst = 0; tst < thread_cell_count; ++tst) {
295+
const details::index_t cid = tst * blckDim + threadId;
262296
if (f.at(cid) == cid) {
263297
// Add a new measurement to the output buffer. Remembering its
264298
// position inside of the container.
@@ -271,6 +305,13 @@ TRACCC_DEVICE inline void ccl_kernel(
271305
meas_pos);
272306
}
273307
}
308+
309+
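// Recall that we might be holding a mutex on some global memory. If we
310+
// are, make sure to release it here so that any future kernels trying to
311+
// use that memory don't get stuck in a loop forever.
// NOTE(review): no block-wide barrier is visible between the loop above and
// this release, so other threads of the block may still be accessing the
// backup memory when thread 0 unlocks it -- confirm whether a
// barrier.blockBarrier() is needed before the store.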
312+
if (threadId == 0 && using_backup_memory) {
313+
backup_mutex.store(0);
314+
}
274315
}
275316

276317
} // namespace traccc::device

device/common/include/traccc/clusterization/device/impl/reduce_problem_cell.ipp

+1-1
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ TRACCC_HOST_DEVICE
1919
inline void reduce_problem_cell(
2020
const cell_collection_types::const_device& cells, const unsigned short cid,
2121
const unsigned int start, const unsigned int end, unsigned char& adjc,
22-
unsigned short adjv[8]) {
22+
unsigned short* adjv) {
2323

2424
// Some sanity check(s).
2525
assert(start <= end);

device/common/include/traccc/clusterization/device/reduce_problem_cell.hpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ TRACCC_HOST_DEVICE
2828
inline void reduce_problem_cell(
2929
const cell_collection_types::const_device& cells, unsigned short cid,
3030
unsigned int start, unsigned int end, unsigned char& adjc,
31-
unsigned short adjv[8]);
31+
unsigned short* adjv);
3232

3333
} // namespace traccc::device
3434

0 commit comments

Comments
 (0)