@@ -9,6 +9,7 @@
 
 #include "traccc/clusterization/device/aggregate_cluster.hpp"
 #include "traccc/clusterization/device/reduce_problem_cell.hpp"
+#include "vecmem/memory/device_atomic_ref.hpp"
 
 namespace traccc::device {
 
@@ -33,13 +34,13 @@ namespace traccc::device {
 /// @param[in] barrier A generic object for block-wide synchronisation
 ///
 template <typename barrier_t>
-TRACCC_DEVICE void fast_sv_1(
-    vecmem::device_vector<details::index_t>& f,
-    vecmem::device_vector<details::index_t>& gf,
-    unsigned char adjc[details::MAX_CELLS_PER_THREAD],
-    details::index_t adjv[details::MAX_CELLS_PER_THREAD][8],
-    const details::index_t tid, const details::index_t blckDim,
-    barrier_t& barrier) {
+TRACCC_DEVICE void fast_sv_1(vecmem::device_vector<details::index_t>& f,
+                             vecmem::device_vector<details::index_t>& gf,
+                             unsigned char* adjc, details::index_t* adjv,
+                             details::index_t thread_cell_count,
+                             const details::index_t tid,
+                             const details::index_t blckDim,
+                             barrier_t& barrier) {
 
     /*
      * The algorithm finishes if an iteration leaves the arrays unchanged.
      * This variable will be set if a change is made, and dictates if another
@@ -61,13 +62,12 @@ TRACCC_DEVICE void fast_sv_1(
      * cluster ID if it is lower than ours, essentially merging the two
      * together.
      */
-    for (details::index_t tst = 0; tst < details::MAX_CELLS_PER_THREAD;
-         ++tst) {
+    for (details::index_t tst = 0; tst < thread_cell_count; ++tst) {
         const details::index_t cid = tst * blckDim + tid;
 
         __builtin_assume(adjc[tst] <= 8);
         for (unsigned char k = 0; k < adjc[tst]; ++k) {
-            details::index_t q = gf.at(adjv[tst][k]);
+            details::index_t q = gf.at(adjv[8 * tst + k]);
 
             if (gf.at(cid) > q) {
                 f.at(f.at(cid)) = q;
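
Aside: this hunk flattens the per-thread adjacency matrix, so entry (tst, k) of the old adjv[MAX_CELLS_PER_THREAD][8] array now lives at adjv[8 * tst + k]. This is ordinary row-major addressing with a fixed row width of 8, the maximum number of neighbours a cell can have; a minimal sketch of the mapping (the helper name is illustrative, not part of the commit):

    #include <cstddef>

    // adjv[tst][k] in the old two-dimensional layout corresponds to
    // adjv[8 * tst + k] in the flattened layout. k always stays below 8,
    // which the kernel encodes via __builtin_assume(adjc[tst] <= 8).
    inline std::size_t flat_neighbour_index(std::size_t tst, std::size_t k) {
        return 8 * tst + k;
    }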
@@ -82,9 +82,7 @@ TRACCC_DEVICE void fast_sv_1(
      */
     barrier.blockBarrier();
 
-#pragma unroll
-    for (details::index_t tst = 0; tst < details::MAX_CELLS_PER_THREAD;
-         ++tst) {
+    for (details::index_t tst = 0; tst < thread_cell_count; ++tst) {
         const details::index_t cid = tst * blckDim + tid;
         /*
          * The second stage is shortcutting, which is an optimisation that
@@ -101,9 +99,7 @@ TRACCC_DEVICE void fast_sv_1(
      */
     barrier.blockBarrier();
 
-#pragma unroll
-    for (details::index_t tst = 0; tst < details::MAX_CELLS_PER_THREAD;
-         ++tst) {
+    for (details::index_t tst = 0; tst < thread_cell_count; ++tst) {
         const details::index_t cid = tst * blckDim + tid;
         /*
          * Update the array for the next generation, keeping track of any
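
For orientation, the three barrier-separated loops above implement one generation of the FastSV connected-components update. A serial sketch of the same generation, with f as the parent array and gf as the grandparent array (illustrative host code, not the device implementation; the loop bodies not shown in these hunks are assumed from the surrounding comments, and the adjacency representation is an assumption):

    #include <vector>

    // One serial FastSV generation over the cells; returns true if anything
    // changed, i.e. whether another generation is required.
    bool fast_sv_generation(std::vector<unsigned>& f, std::vector<unsigned>& gf,
                            const std::vector<std::vector<unsigned>>& adj) {
        bool changed = false;
        // Stage 1: merging. Hook each cell onto the smallest grandparent
        // among its neighbours, joining adjacent clusters.
        for (unsigned c = 0; c < f.size(); ++c) {
            for (unsigned n : adj[c]) {
                if (gf[c] > gf[n]) {
                    f[f[c]] = gf[n];
                    f[c] = gf[n];
                }
            }
        }
        // Stage 2: shortcutting. Point each cell at its grandparent to keep
        // the trees shallow.
        for (unsigned c = 0; c < f.size(); ++c) {
            if (f[c] > gf[c]) {
                f[c] = gf[c];
            }
        }
        // Stage 3: refresh the grandparent array and detect convergence.
        for (unsigned c = 0; c < f.size(); ++c) {
            if (gf[c] != f[f[c]]) {
                gf[c] = f[f[c]];
                changed = true;
            }
        }
        return changed;
    }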
@@ -135,17 +131,24 @@ TRACCC_DEVICE inline void ccl_kernel(
     const details::index_t target_cells_per_partition,
     unsigned int& partition_start, unsigned int& partition_end,
     unsigned int& outi, vecmem::data::vector_view<details::index_t> f_view,
-    vecmem::data::vector_view<details::index_t> gf_view, barrier_t& barrier,
+    vecmem::data::vector_view<details::index_t> gf_view,
+    vecmem::data::vector_view<details::index_t> f_backup_view,
+    vecmem::data::vector_view<details::index_t> gf_backup_view,
+    vecmem::data::vector_view<unsigned char> adjc_backup_view,
+    vecmem::data::vector_view<details::index_t> adjv_backup_view,
+    vecmem::device_atomic_ref<uint32_t> backup_mutex, barrier_t& barrier,
     measurement_collection_types::view measurements_view,
     vecmem::data::vector_view<unsigned int> cell_links) {
-
     // Construct device containers around the views.
     const cell_collection_types::const_device cells_device(cells_view);
     const cell_module_collection_types::const_device modules_device(
         modules_view);
     measurement_collection_types::device measurements_device(measurements_view);
     vecmem::device_vector<details::index_t> f(f_view);
     vecmem::device_vector<details::index_t> gf(gf_view);
+    vecmem::device_vector<unsigned char> adjc_backup(adjc_backup_view);
+    vecmem::device_vector<details::index_t> adjv_backup(adjv_backup_view);
+    bool using_backup_memory = false;
 
     const cell_collection_types::const_device::size_type num_cells =
         cells_device.size();
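
The widened parameter list above means the caller must now provide device-resident backup storage and a single mutex word alongside the existing views. A minimal host-side sketch with vecmem (the buffer names and the backup_size constant are assumptions for illustration, not part of this commit):

    #include <vecmem/containers/data/vector_buffer.hpp>
    #include <vecmem/memory/cuda/device_memory_resource.hpp>

    // Hypothetical setup of the backup storage shared by all thread blocks.
    vecmem::cuda::device_memory_resource mr;
    const unsigned int backup_size = 1u << 20;  // assumed oversized capacity
    vecmem::data::vector_buffer<details::index_t> f_backup(backup_size, mr);
    vecmem::data::vector_buffer<details::index_t> gf_backup(backup_size, mr);
    vecmem::data::vector_buffer<unsigned char> adjc_backup(backup_size, mr);
    vecmem::data::vector_buffer<details::index_t> adjv_backup(backup_size * 8,
                                                              mr);
    // The mutex is a single zero-initialised uint32_t in device memory; the
    // kernel wrapper constructs the vecmem::device_atomic_ref<uint32_t> on top
    // of it.

A single mutex suffices because only the rare oversized partition takes this path, so contention on the backup storage is expected to be negligible.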
@@ -199,41 +202,77 @@ TRACCC_DEVICE inline void ccl_kernel(
     barrier.blockBarrier();
 
     // Vector of indices of the adjacent cells
-    details::index_t adjv[details::MAX_CELLS_PER_THREAD][8];
+    details::index_t _adjv[details::MAX_CELLS_PER_THREAD * 8];
+    details::index_t* adjv = _adjv;
+
     /*
      * The number of adjacent cells for each cell must start at zero, to
      * avoid uninitialized memory. adjv does not need to be zeroed, as
      * we will only access those values if adjc indicates that the value
      * is set.
      */
     // Number of adjacent cells
-    unsigned char adjc[details::MAX_CELLS_PER_THREAD];
+    unsigned char _adjc[details::MAX_CELLS_PER_THREAD];
+    unsigned char* adjc = _adjc;
 
     // It seems that sycl runs into undefined behaviour when calling
     // group synchronisation functions when some threads have already run
     // into a return. As such, we cannot use returns in this kernel.
 
     // Get partition for this thread group
     const details::index_t size = partition_end - partition_start;
-    assert(size <= max_cells_per_partition);
 
-#pragma unroll
-    for (details::index_t tst = 0; tst < details::MAX_CELLS_PER_THREAD; ++tst) {
+    // If our partition is too large, we need to handle this specific edge
+    // case. The first thread of the block will attempt to enter a critical
+    // section by obtaining a lock on a mutex in global memory. When this is
+    // obtained, we can use some memory in global memory instead of the shared
+    // memory. This can be done more efficiently, but this should be a very
+    // rare edge case.
+    if (size > max_cells_per_partition) {
+        if (threadId == 0) {
+            uint32_t false_int = 0;
+            // Spin until the lock is taken: compare_exchange_strong returns
+            // false on failure and overwrites false_int with the observed
+            // value, so reset it before retrying.
+            while (!backup_mutex.compare_exchange_strong(false_int, 1u)) {
+                false_int = 0;
+            }
+        }
+
+        barrier.blockBarrier();
+
+        f = f_backup_view;
+        gf = gf_backup_view;
+        adjc = adjc_backup.data();
+        adjv = adjv_backup.data();
+        using_backup_memory = true;
+    }
+
+    assert(size <= f.size());
+    assert(size <= gf.size());
+
+    // Count how many cells this thread owns under the strided assignment
+    // cid = tst * blckDim + threadId.
+    details::index_t thread_cell_count = 0;
+    for (details::index_t cid;
+         (cid = thread_cell_count * blckDim + threadId) < size;
+         ++thread_cell_count) {
+    }
+
+    for (details::index_t tst = 0; tst < thread_cell_count; ++tst) {
         adjc[tst] = 0;
     }
 
-    for (details::index_t tst = 0, cid; (cid = tst * blckDim + threadId) < size;
-         ++tst) {
+    for (details::index_t tst = 0; tst < thread_cell_count; ++tst) {
         /*
          * Look for adjacent cells to the current one.
          */
-        assert(tst < details::MAX_CELLS_PER_THREAD);
+        const details::index_t cid = tst * blckDim + threadId;
         reduce_problem_cell(cells_device, cid, partition_start, partition_end,
-                            adjc[tst], adjv[tst]);
+                            adjc[tst], &adjv[8 * tst]);
     }
 
-#pragma unroll
-    for (details::index_t tst = 0; tst < details::MAX_CELLS_PER_THREAD; ++tst) {
+    for (details::index_t tst = 0; tst < thread_cell_count; ++tst) {
         const details::index_t cid = tst * blckDim + threadId;
         /*
          * At the start, the values of f and gf should be equal to the
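
Note on the counting loop introduced above: it determines thread_cell_count by walking the strided indices until they leave the partition. The same number has a closed form, shown here purely as a cross-check of the loop's semantics (equivalent arithmetic, not a proposed change):

    // Equivalent closed form: thread threadId owns cells threadId,
    // threadId + blckDim, threadId + 2 * blckDim, ... strictly below size.
    const details::index_t thread_cell_count =
        (threadId < size) ? (size - threadId + blckDim - 1) / blckDim : 0;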
@@ -253,12 +292,13 @@ TRACCC_DEVICE inline void ccl_kernel(
      * Run FastSV algorithm, which will update the father index to that of
      * the cell belonging to the same cluster with the lowest index.
      */
-    fast_sv_1(f, gf, adjc, adjv, threadId, blckDim, barrier);
+    fast_sv_1(f, gf, &adjc[0], &adjv[0], thread_cell_count, threadId, blckDim,
+              barrier);
 
     barrier.blockBarrier();
 
-    for (details::index_t tst = 0, cid; (cid = tst * blckDim + threadId) < size;
-         ++tst) {
+    for (details::index_t tst = 0; tst < thread_cell_count; ++tst) {
+        const details::index_t cid = tst * blckDim + threadId;
         if (f.at(cid) == cid) {
             // Add a new measurement to the output buffer, remembering its
             // position inside the container.
@@ -271,6 +311,16 @@ TRACCC_DEVICE inline void ccl_kernel(
                              meas_pos);
         }
     }
+
+    // Recall that we might be holding a mutex on some global memory. If we
+    // are, make sure to release it here so that any future kernels trying to
+    // use that memory don't get stuck in a loop forever. The whole block must
+    // be done with the backup memory before it is handed over, hence the
+    // barrier.
+    barrier.blockBarrier();
+    if (threadId == 0 && using_backup_memory) {
+        backup_mutex.store(0);
+    }
 }
 
 }  // namespace traccc::device
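
Taken together, the acquisition at the top of ccl_kernel and this release implement a conventional spinlock over a word of global memory. Stripped of the surrounding kernel, the pattern looks as follows (a generic sketch around a vecmem::device_atomic_ref-style interface, not code from this commit):

    // Generic block-level spinlock over a uint32_t in global memory. One
    // designated thread acquires and releases; barriers fence the critical
    // section for the whole block.
    template <typename atomic_ref_t, typename barrier_t>
    TRACCC_DEVICE void locked_section(atomic_ref_t lock, unsigned int threadId,
                                      barrier_t& barrier) {
        if (threadId == 0) {
            // Acquire: spin until we swap 0 -> 1. On failure,
            // compare_exchange_strong writes the observed value into
            // 'expected', so reset it before retrying.
            uint32_t expected = 0u;
            while (!lock.compare_exchange_strong(expected, 1u)) {
                expected = 0u;
            }
        }
        barrier.blockBarrier();
        // ... critical section: the whole block may use the resource ...
        barrier.blockBarrier();
        if (threadId == 0) {
            // Release: hand the resource to the next waiting block.
            lock.store(0u);
        }
    }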