forked from acts-project/traccc
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfinding_algorithm.cu
640 lines (517 loc) · 26.8 KB
/
finding_algorithm.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
/** TRACCC library, part of the ACTS project (R&D line)
*
* (c) 2023-2024 CERN for the benefit of the ACTS project
*
* Mozilla Public License Version 2.0
*/
// Project include(s).
#include "../utils/cuda_error_handling.hpp"
#include "../utils/utils.hpp"
#include "traccc/cuda/finding/finding_algorithm.hpp"
#include "traccc/definitions/primitives.hpp"
#include "traccc/edm/device/finding_global_counter.hpp"
#include "traccc/finding/candidate_link.hpp"
#include "traccc/finding/device/add_links_for_holes.hpp"
#include "traccc/finding/device/apply_interaction.hpp"
#include "traccc/finding/device/build_tracks.hpp"
#include "traccc/finding/device/count_measurements.hpp"
#include "traccc/finding/device/find_tracks.hpp"
#include "traccc/finding/device/make_barcode_sequence.hpp"
#include "traccc/finding/device/propagate_to_next_surface.hpp"
#include "traccc/finding/device/prune_tracks.hpp"
// detray include(s).
#include "detray/core/detector.hpp"
#include "detray/core/detector_metadata.hpp"
#include "detray/detectors/bfield.hpp"
#include "detray/navigation/navigator.hpp"
#include "detray/propagator/rk_stepper.hpp"
// VecMem include(s).
#include <vecmem/containers/data/vector_buffer.hpp>
#include <vecmem/containers/device_vector.hpp>
#include <vecmem/containers/jagged_device_vector.hpp>
#include <vecmem/containers/vector.hpp>
// Thrust include(s).
#include <thrust/copy.h>
#include <thrust/execution_policy.h>
#include <thrust/fill.h>
#include <thrust/scan.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
// System include(s).
#include <vector>
namespace traccc::cuda {
namespace kernels {
/// CUDA kernel for running @c traccc::device::make_barcode_sequence
__global__ void make_barcode_sequence(
measurement_collection_types::const_view measurements_view,
vecmem::data::vector_view<detray::geometry::barcode> barcodes_view) {
int gid = threadIdx.x + blockIdx.x * blockDim.x;
device::make_barcode_sequence(gid, measurements_view, barcodes_view);
}
/// CUDA kernel for running @c traccc::device::apply_interaction
template <typename detector_t>
__global__ void apply_interaction(
typename detector_t::view_type det_data, const int n_params,
bound_track_parameters_collection_types::view params_view) {
int gid = threadIdx.x + blockIdx.x * blockDim.x;
device::apply_interaction<detector_t>(gid, det_data, n_params, params_view);
}
/// CUDA kernel for running @c traccc::device::count_measurements
__global__ void count_measurements(
bound_track_parameters_collection_types::const_view params_view,
vecmem::data::vector_view<const detray::geometry::barcode> barcodes_view,
vecmem::data::vector_view<const unsigned int> upper_bounds_view,
const unsigned int n_in_params,
vecmem::data::vector_view<unsigned int> n_measurements_view,
vecmem::data::vector_view<unsigned int> ref_meas_idx_view,
unsigned int& n_measurements_sum) {
int gid = threadIdx.x + blockIdx.x * blockDim.x;
device::count_measurements(
gid, params_view, barcodes_view, upper_bounds_view, n_in_params,
n_measurements_view, ref_meas_idx_view, n_measurements_sum);
}
/// CUDA kernel for running @c traccc::device::find_tracks
template <typename detector_t, typename config_t>
__global__ void find_tracks(
const config_t cfg, typename detector_t::view_type det_data,
measurement_collection_types::const_view measurements_view,
bound_track_parameters_collection_types::const_view in_params_view,
vecmem::data::vector_view<const unsigned int>
n_measurements_prefix_sum_view,
vecmem::data::vector_view<const unsigned int> ref_meas_idx_view,
vecmem::data::vector_view<const candidate_link> prev_links_view,
vecmem::data::vector_view<const unsigned int> prev_param_to_link_view,
const unsigned int step, const unsigned int n_max_candidates,
bound_track_parameters_collection_types::view out_params_view,
vecmem::data::vector_view<unsigned int> n_candidates_view,
vecmem::data::vector_view<candidate_link> links_view,
unsigned int& n_candidates) {
int gid = threadIdx.x + blockIdx.x * blockDim.x;
device::find_tracks<detector_t, config_t>(
gid, cfg, det_data, measurements_view, in_params_view,
n_measurements_prefix_sum_view, ref_meas_idx_view, prev_links_view,
prev_param_to_link_view, step, n_max_candidates, out_params_view,
n_candidates_view, links_view, n_candidates);
}
/// CUDA kernel for running @c traccc::device::add_links_for_holes
__global__ void add_links_for_holes(
vecmem::data::vector_view<const unsigned int> n_candidates_view,
bound_track_parameters_collection_types::const_view in_params_view,
vecmem::data::vector_view<const candidate_link> prev_links_view,
vecmem::data::vector_view<const unsigned int> prev_param_to_link_view,
const unsigned int step, const unsigned int n_max_candidates,
bound_track_parameters_collection_types::view out_params_view,
vecmem::data::vector_view<candidate_link> links_view,
unsigned int& n_total_candidates) {
int gid = threadIdx.x + blockIdx.x * blockDim.x;
device::add_links_for_holes(gid, n_candidates_view, in_params_view,
prev_links_view, prev_param_to_link_view, step,
n_max_candidates, out_params_view, links_view,
n_total_candidates);
}
/// CUDA kernel for running @c traccc::device::propagate_to_next_surface
template <typename propagator_t, typename bfield_t, typename config_t>
__global__ void propagate_to_next_surface(
const config_t cfg,
typename propagator_t::detector_type::view_type det_data,
bfield_t field_data,
vecmem::data::jagged_vector_view<typename propagator_t::intersection_type>
nav_candidates_buffer,
bound_track_parameters_collection_types::const_view in_params_view,
vecmem::data::vector_view<const candidate_link> links_view,
const unsigned int step, const unsigned int& n_candidates,
bound_track_parameters_collection_types::view out_params_view,
vecmem::data::vector_view<unsigned int> param_to_link_view,
vecmem::data::vector_view<typename candidate_link::link_index_type>
tips_view,
vecmem::data::vector_view<unsigned int> n_tracks_per_seed_view,
unsigned int& n_out_params) {
int gid = threadIdx.x + blockIdx.x * blockDim.x;
device::propagate_to_next_surface<propagator_t, bfield_t, config_t>(
gid, cfg, det_data, field_data, nav_candidates_buffer, in_params_view,
links_view, step, n_candidates, out_params_view, param_to_link_view,
tips_view, n_tracks_per_seed_view, n_out_params);
}
/// CUDA kernel for running @c traccc::device::build_tracks
template <typename config_t>
__global__ void build_tracks(
const config_t cfg,
measurement_collection_types::const_view measurements_view,
bound_track_parameters_collection_types::const_view seeds_view,
vecmem::data::jagged_vector_view<const candidate_link> links_view,
vecmem::data::jagged_vector_view<const unsigned int> param_to_link_view,
vecmem::data::vector_view<const typename candidate_link::link_index_type>
tips_view,
track_candidate_container_types::view track_candidates_view,
vecmem::data::vector_view<unsigned int> valid_indices_view,
unsigned int& n_valid_tracks) {
int gid = threadIdx.x + blockIdx.x * blockDim.x;
device::build_tracks(gid, cfg, measurements_view, seeds_view, links_view,
param_to_link_view, tips_view, track_candidates_view,
valid_indices_view, n_valid_tracks);
}
/// CUDA kernel for running @c traccc::device::prune_tracks
__global__ void prune_tracks(
track_candidate_container_types::const_view track_candidates_view,
vecmem::data::vector_view<const unsigned int> valid_indices_view,
track_candidate_container_types::view prune_candidates_view) {
int gid = threadIdx.x + blockIdx.x * blockDim.x;
device::prune_tracks(gid, track_candidates_view, valid_indices_view,
prune_candidates_view);
}
} // namespace kernels
template <typename stepper_t, typename navigator_t>
finding_algorithm<stepper_t, navigator_t>::finding_algorithm(
const config_type& cfg, const traccc::memory_resource& mr,
vecmem::copy& copy, stream& str)
: m_cfg(cfg),
m_mr(mr),
m_copy(copy),
m_stream(str),
m_warp_size(details::get_warp_size(str.device())) {}
template <typename stepper_t, typename navigator_t>
track_candidate_container_types::buffer
finding_algorithm<stepper_t, navigator_t>::operator()(
const typename detector_type::view_type& det_view,
const bfield_type& field_view,
const vecmem::data::jagged_vector_view<
typename navigator_t::intersection_type>& navigation_buffer,
const typename measurement_collection_types::view& measurements,
const bound_track_parameters_collection_types::buffer& seeds_buffer) const {
// Get a convenience variable for the stream that we'll be using.
cudaStream_t stream = details::get_stream(m_stream);
// Copy setup
m_copy.setup(seeds_buffer)->ignore();
m_copy.setup(navigation_buffer)->ignore();
const unsigned int n_seeds = m_copy.get_size(seeds_buffer);
// Prepare input parameters with seeds
bound_track_parameters_collection_types::buffer in_params_buffer(n_seeds,
m_mr.main);
bound_track_parameters_collection_types::device in_params(in_params_buffer);
bound_track_parameters_collection_types::const_device seeds(seeds_buffer);
thrust::copy(thrust::cuda::par.on(stream), seeds.begin(), seeds.end(),
in_params.begin());
// Number of tracks per seed
vecmem::data::vector_buffer<unsigned int> n_tracks_per_seed_buffer(
n_seeds, m_mr.main);
// Create a map for links
std::map<unsigned int, vecmem::data::vector_buffer<candidate_link>>
link_map;
// Create a map for parameter ID to link ID
std::map<unsigned int, vecmem::data::vector_buffer<unsigned int>>
param_to_link_map;
// Create a map for tip links
std::map<unsigned int, vecmem::data::vector_buffer<
typename candidate_link::link_index_type>>
tips_map;
// Link size
std::vector<std::size_t> n_candidates_per_step;
n_candidates_per_step.reserve(m_cfg.max_track_candidates_per_track);
std::vector<std::size_t> n_parameters_per_step;
n_parameters_per_step.reserve(m_cfg.max_track_candidates_per_track);
// Global counter object in Device memory
vecmem::unique_alloc_ptr<device::finding_global_counter>
global_counter_device =
vecmem::make_unique_alloc<device::finding_global_counter>(
m_mr.main);
// Global counter object in Host memory
device::finding_global_counter global_counter_host;
/*****************************************************************
* Measurement Operations
*****************************************************************/
measurement_collection_types::const_view::size_type n_measurements =
m_copy.get_size(measurements);
// Get copy of barcode uniques
measurement_collection_types::buffer uniques_buffer{n_measurements,
m_mr.main};
measurement_collection_types::device uniques(uniques_buffer);
measurement* end =
thrust::unique_copy(thrust::cuda::par.on(stream), measurements.ptr(),
measurements.ptr() + n_measurements,
uniques.begin(), measurement_equal_comp());
unsigned int n_modules = end - uniques.begin();
// Get upper bounds of unique elements
vecmem::data::vector_buffer<unsigned int> upper_bounds_buffer{n_modules,
m_mr.main};
vecmem::device_vector<unsigned int> upper_bounds(upper_bounds_buffer);
thrust::upper_bound(thrust::cuda::par.on(stream), measurements.ptr(),
measurements.ptr() + n_measurements, uniques.begin(),
uniques.begin() + n_modules, upper_bounds.begin(),
measurement_sort_comp());
/*****************************************************************
* Kernel1: Create barcode sequence
*****************************************************************/
vecmem::data::vector_buffer<detray::geometry::barcode> barcodes_buffer{
n_modules, m_mr.main};
unsigned int nThreads = m_warp_size * 2;
unsigned int nBlocks = (barcodes_buffer.size() + nThreads - 1) / nThreads;
kernels::make_barcode_sequence<<<nBlocks, nThreads, 0, stream>>>(
uniques_buffer, barcodes_buffer);
TRACCC_CUDA_ERROR_CHECK(cudaGetLastError());
for (unsigned int step = 0; step < m_cfg.max_track_candidates_per_track;
step++) {
// Previous step
const unsigned int prev_step = (step == 0 ? 0 : step - 1);
// Reset the number of tracks per seed
m_copy.memset(n_tracks_per_seed_buffer, 0)->ignore();
// Global counter object: Device -> Host
TRACCC_CUDA_ERROR_CHECK(
cudaMemcpyAsync(&global_counter_host, global_counter_device.get(),
sizeof(device::finding_global_counter),
cudaMemcpyDeviceToHost, stream));
m_stream.synchronize();
// Set the number of input parameters
const unsigned int n_in_params = (step == 0)
? in_params_buffer.size()
: global_counter_host.n_out_params;
// Terminate if there is no parameter to process.
if (n_in_params == 0) {
break;
}
// Reset the global counter
TRACCC_CUDA_ERROR_CHECK(
cudaMemsetAsync(global_counter_device.get(), 0,
sizeof(device::finding_global_counter), stream));
/*****************************************************************
* Kernel2: Apply material interaction
****************************************************************/
nThreads = m_warp_size * 2;
nBlocks = (n_in_params + nThreads - 1) / nThreads;
kernels::apply_interaction<detector_type>
<<<nBlocks, nThreads, 0, stream>>>(det_view, n_in_params,
in_params_buffer);
TRACCC_CUDA_ERROR_CHECK(cudaGetLastError());
/*****************************************************************
* Kernel3: Count the number of measurements per parameter
****************************************************************/
vecmem::data::vector_buffer<unsigned int> n_measurements_buffer(
n_in_params, m_mr.main);
vecmem::device_vector<unsigned int> n_measurements_device(
n_measurements_buffer);
thrust::fill(thrust::cuda::par.on(stream),
n_measurements_device.begin(), n_measurements_device.end(),
0u);
// Create a buffer for the first measurement index of parameter
vecmem::data::vector_buffer<unsigned int> ref_meas_idx_buffer(
n_in_params, m_mr.main);
nThreads = m_warp_size * 2;
nBlocks = (n_in_params + nThreads - 1) / nThreads;
kernels::count_measurements<<<nBlocks, nThreads, 0, stream>>>(
in_params_buffer, barcodes_buffer, upper_bounds_buffer, n_in_params,
n_measurements_buffer, ref_meas_idx_buffer,
(*global_counter_device).n_measurements_sum);
TRACCC_CUDA_ERROR_CHECK(cudaGetLastError());
// Global counter object: Device -> Host
TRACCC_CUDA_ERROR_CHECK(
cudaMemcpyAsync(&global_counter_host, global_counter_device.get(),
sizeof(device::finding_global_counter),
cudaMemcpyDeviceToHost, stream));
m_stream.synchronize();
// Create the buffer for the prefix sum of the number of measurements
// per parameter
vecmem::data::vector_buffer<unsigned int>
n_measurements_prefix_sum_buffer(n_in_params, m_mr.main);
vecmem::device_vector<unsigned int> n_measurements_prefix_sum(
n_measurements_prefix_sum_buffer);
thrust::inclusive_scan(
thrust::cuda::par.on(stream), n_measurements_device.begin(),
n_measurements_device.end(), n_measurements_prefix_sum.begin());
/*****************************************************************
* Kernel4: Find valid tracks
*****************************************************************/
// Buffer for kalman-updated parameters spawned by the measurement
// candidates
const unsigned int n_max_candidates =
n_in_params * m_cfg.max_num_branches_per_surface;
vecmem::data::vector_buffer<unsigned int> n_candidates_buffer{
n_in_params, m_mr.main};
vecmem::device_vector<unsigned int> n_candidates_device(
n_candidates_buffer);
thrust::fill(thrust::cuda::par.on(stream), n_candidates_device.begin(),
n_candidates_device.end(), 0u);
bound_track_parameters_collection_types::buffer updated_params_buffer(
n_in_params * m_cfg.max_num_branches_per_surface, m_mr.main);
// Create the link map
link_map[step] = {n_in_params * m_cfg.max_num_branches_per_surface,
m_mr.main};
m_copy.setup(link_map[step])->ignore();
nBlocks = (global_counter_host.n_measurements_sum +
nThreads * m_cfg.n_measurements_per_thread - 1) /
(nThreads * m_cfg.n_measurements_per_thread);
if (nBlocks > 0) {
kernels::find_tracks<detector_type, config_type>
<<<nBlocks, nThreads, 0, stream>>>(
m_cfg, det_view, measurements, in_params_buffer,
n_measurements_prefix_sum_buffer, ref_meas_idx_buffer,
link_map[prev_step], param_to_link_map[prev_step], step,
n_max_candidates, updated_params_buffer,
n_candidates_buffer, link_map[step],
(*global_counter_device).n_candidates);
TRACCC_CUDA_ERROR_CHECK(cudaGetLastError());
}
/*****************************************************************
* Kernel5: Add a dummy links in case of no branches
*****************************************************************/
nBlocks = (n_in_params + nThreads - 1) / nThreads;
if (nBlocks > 0) {
kernels::add_links_for_holes<<<nBlocks, nThreads, 0, stream>>>(
n_candidates_buffer, in_params_buffer, link_map[prev_step],
param_to_link_map[prev_step], step, n_max_candidates,
updated_params_buffer, link_map[step],
(*global_counter_device).n_candidates);
TRACCC_CUDA_ERROR_CHECK(cudaGetLastError());
}
// Global counter object: Device -> Host
TRACCC_CUDA_ERROR_CHECK(
cudaMemcpyAsync(&global_counter_host, global_counter_device.get(),
sizeof(device::finding_global_counter),
cudaMemcpyDeviceToHost, stream));
m_stream.synchronize();
/*****************************************************************
* Kernel6: Propagate to the next surface
*****************************************************************/
// Buffer for out parameters for the next step
bound_track_parameters_collection_types::buffer out_params_buffer(
global_counter_host.n_candidates, m_mr.main);
// Create the param to link ID map
param_to_link_map[step] = {global_counter_host.n_candidates, m_mr.main};
m_copy.setup(param_to_link_map[step])->ignore();
// Create the tip map
tips_map[step] = {global_counter_host.n_candidates, m_mr.main,
vecmem::data::buffer_type::resizable};
m_copy.setup(tips_map[step])->ignore();
nThreads = m_warp_size * 2;
if (global_counter_host.n_candidates > 0) {
nBlocks =
(global_counter_host.n_candidates + nThreads - 1) / nThreads;
kernels::propagate_to_next_surface<propagator_type, bfield_type,
config_type>
<<<nBlocks, nThreads, 0, stream>>>(
m_cfg, det_view, field_view, navigation_buffer,
updated_params_buffer, link_map[step], step,
(*global_counter_device).n_candidates, out_params_buffer,
param_to_link_map[step], tips_map[step],
n_tracks_per_seed_buffer,
(*global_counter_device).n_out_params);
TRACCC_CUDA_ERROR_CHECK(cudaGetLastError());
}
TRACCC_CUDA_ERROR_CHECK(
cudaMemcpyAsync(&global_counter_host, global_counter_device.get(),
sizeof(device::finding_global_counter),
cudaMemcpyDeviceToHost, stream));
m_stream.synchronize();
// Fill the candidate size vector
n_candidates_per_step.push_back(global_counter_host.n_candidates);
n_parameters_per_step.push_back(global_counter_host.n_out_params);
// Swap parameter buffer for the next step
in_params_buffer = std::move(out_params_buffer);
}
// Create link buffer
vecmem::data::jagged_vector_buffer<candidate_link> links_buffer(
n_candidates_per_step, m_mr.main, m_mr.host);
m_copy.setup(links_buffer)->ignore();
// Copy link map to link buffer
const auto n_steps = n_candidates_per_step.size();
for (unsigned int it = 0; it < n_steps; it++) {
vecmem::device_vector<candidate_link> in(link_map[it]);
vecmem::device_vector<candidate_link> out(
*(links_buffer.host_ptr() + it));
thrust::copy(thrust::cuda::par.on(stream), in.begin(),
in.begin() + n_candidates_per_step[it], out.begin());
}
// Create param_to_link
vecmem::data::jagged_vector_buffer<unsigned int> param_to_link_buffer(
n_parameters_per_step, m_mr.main, m_mr.host);
m_copy.setup(param_to_link_buffer)->ignore();
// Copy param_to_link map to param_to_link buffer
for (unsigned int it = 0; it < n_steps; it++) {
vecmem::device_vector<unsigned int> in(param_to_link_map[it]);
vecmem::device_vector<unsigned int> out(
*(param_to_link_buffer.host_ptr() + it));
thrust::copy(thrust::cuda::par.on(stream), in.begin(),
in.begin() + n_parameters_per_step[it], out.begin());
}
// Get the number of tips per step
std::vector<unsigned int> n_tips_per_step;
n_tips_per_step.reserve(n_steps);
for (unsigned int it = 0; it < n_steps; it++) {
n_tips_per_step.push_back(m_copy.get_size(tips_map[it]));
}
// Copy tips_map into the tips vector (D->D)
unsigned int n_tips_total =
std::accumulate(n_tips_per_step.begin(), n_tips_per_step.end(), 0);
vecmem::data::vector_buffer<typename candidate_link::link_index_type>
tips_buffer{n_tips_total, m_mr.main};
m_copy.setup(tips_buffer)->ignore();
vecmem::device_vector<typename candidate_link::link_index_type> tips(
tips_buffer);
unsigned int prefix_sum = 0;
for (unsigned int it = 0; it < n_steps; it++) {
vecmem::device_vector<typename candidate_link::link_index_type> in(
tips_map[it]);
const unsigned int n_tips = n_tips_per_step[it];
if (n_tips > 0) {
thrust::copy(thrust::cuda::par.on(stream), in.begin(),
in.begin() + n_tips, tips.begin() + prefix_sum);
prefix_sum += n_tips;
}
}
/*****************************************************************
* Kernel7: Build tracks
*****************************************************************/
// Create track candidate buffer
track_candidate_container_types::buffer track_candidates_buffer{
{n_tips_total, m_mr.main},
{std::vector<std::size_t>(n_tips_total,
m_cfg.max_track_candidates_per_track),
m_mr.main, m_mr.host, vecmem::data::buffer_type::resizable}};
m_copy.setup(track_candidates_buffer.headers)->ignore();
m_copy.setup(track_candidates_buffer.items)->ignore();
// Create buffer for valid indices
vecmem::data::vector_buffer<unsigned int> valid_indices_buffer(n_tips_total,
m_mr.main);
// @Note: nBlocks can be zero in case there is no tip. This happens when
// chi2_max config is set tightly and no tips are found
if (n_tips_total > 0) {
nThreads = m_warp_size * 2;
nBlocks = (n_tips_total + nThreads - 1) / nThreads;
kernels::build_tracks<<<nBlocks, nThreads, 0, stream>>>(
m_cfg, measurements, seeds_buffer, links_buffer,
param_to_link_buffer, tips_buffer, track_candidates_buffer,
valid_indices_buffer, (*global_counter_device).n_valid_tracks);
TRACCC_CUDA_ERROR_CHECK(cudaGetLastError());
}
// Global counter object: Device -> Host
TRACCC_CUDA_ERROR_CHECK(
cudaMemcpyAsync(&global_counter_host, global_counter_device.get(),
sizeof(device::finding_global_counter),
cudaMemcpyDeviceToHost, stream));
m_stream.synchronize();
// Create pruned candidate buffer
track_candidate_container_types::buffer prune_candidates_buffer{
{global_counter_host.n_valid_tracks, m_mr.main},
{std::vector<std::size_t>(global_counter_host.n_valid_tracks,
m_cfg.max_track_candidates_per_track),
m_mr.main, m_mr.host, vecmem::data::buffer_type::resizable}};
m_copy.setup(prune_candidates_buffer.headers)->ignore();
m_copy.setup(prune_candidates_buffer.items)->ignore();
if (global_counter_host.n_valid_tracks > 0) {
nThreads = m_warp_size * 2;
nBlocks =
(global_counter_host.n_valid_tracks + nThreads - 1) / nThreads;
kernels::prune_tracks<<<nBlocks, nThreads, 0, stream>>>(
track_candidates_buffer, valid_indices_buffer,
prune_candidates_buffer);
TRACCC_CUDA_ERROR_CHECK(cudaGetLastError());
}
return prune_candidates_buffer;
}
// Explicit template instantiation
using default_detector_type =
detray::detector<detray::default_metadata, detray::device_container_types>;
using default_stepper_type =
detray::rk_stepper<covfie::field<detray::bfield::const_bknd_t>::view_t,
traccc::default_algebra, detray::constrained_step<>>;
using default_navigator_type = detray::navigator<const default_detector_type>;
template class finding_algorithm<default_stepper_type, default_navigator_type>;
} // namespace traccc::cuda