10
10
#include "../sanity/ordered_on.hpp"
11
11
#include "../utils/barrier.hpp"
12
12
#include "../utils/get_queue.hpp"
13
+ #include "traccc/clusterization/device/ccl_debug_output.hpp"
13
14
#include "traccc/clusterization/device/ccl_kernel_definitions.hpp"
14
15
#include "traccc/sycl/clusterization/clusterization_algorithm.hpp"
15
16
#include "traccc/sycl/utils/thread_id.hpp"
@@ -113,9 +114,28 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()(
113
114
assert(m_config.max_cells_per_thread <=
114
115
device::details::CELLS_PER_THREAD_STACK_LIMIT);
115
116
117
+ // If necessary, allocate an object for storing the debug information
118
+ vecmem::unique_alloc_ptr<device::details::ccl_debug_output> debug_output;
119
+ cl::sycl::event evt_copy_debug_output_h2d;
120
+
121
+ if (m_config.enable_debug_output) {
122
+ debug_output =
123
+ vecmem::make_unique_alloc<device::details::ccl_debug_output>(
124
+ m_mr.main);
125
+
126
+ device::details::ccl_debug_output empty_output =
127
+ device::details::ccl_debug_output::init();
128
+
129
+ evt_copy_debug_output_h2d = details::get_queue(m_queue).memcpy(
130
+ debug_output.get(), &empty_output,
131
+ sizeof(device::details::ccl_debug_output));
132
+ }
133
+
116
134
// Run ccl kernel
117
- details::get_queue(m_queue)
118
- .submit([&](::sycl::handler& h) {
135
+ cl::sycl::event evt_run_kernel =
136
+ details::get_queue(m_queue).submit([&](::sycl::handler& h) {
137
+ h.depends_on(evt_copy_debug_output_h2d);
138
+
119
139
// Allocate shared memory for the kernel.
120
140
vecmem::sycl::local_accessor<std::size_t> shared_uint(3, h);
121
141
vecmem::sycl::local_accessor<device::details::index_t> shared_idx(
@@ -130,8 +150,8 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()(
130
150
gf_backup_view = vecmem::get_data(m_gf_backup),
131
151
adjc_backup_view = vecmem::get_data(m_adjc_backup),
132
152
adjv_backup_view = vecmem::get_data(m_adjv_backup),
133
- mutex_ptr = m_backup_mutex.get(),
134
- cfg = m_config ](::sycl::nd_item<1> item) {
153
+ mutex_ptr = m_backup_mutex.get(), cfg = m_config,
154
+ debug_output = debug_output.get() ](::sycl::nd_item<1> item) {
135
155
// Construct more readable variable names.
136
156
vecmem::data::vector_view<device::details::index_t> f_view{
137
157
static_cast<vector_size_t>(cfg.max_partition_size()),
@@ -152,15 +172,38 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()(
152
172
const sycl::thread_id1 thread_id(item);
153
173
154
174
// Run the algorithm for this thread.
155
- device::ccl_kernel(cfg, thread_id, cells_view, modules_view,
156
- partition_start, partition_end, outi ,
157
- f_view, gf_view, f_backup_view ,
158
- gf_backup_view, adjc_backup_view,
159
- adjv_backup_view, backup_mutex, barry_r,
160
- measurements_view, cell_links_view);
175
+ device::ccl_kernel(
176
+ cfg, thread_id, cells_view, modules_view ,
177
+ partition_start, partition_end, outi, f_view, gf_view,
178
+ f_backup_view, gf_backup_view, adjc_backup_view,
179
+ adjv_backup_view, backup_mutex, barry_r,
180
+ measurements_view, cell_links_view, debug_output );
161
181
});
162
- })
163
- .wait_and_throw();
182
+ });
183
+
184
+ cl::sycl::event evt_copy_debug_output_d2h;
185
+
186
+ if (debug_output) {
187
+ device::details::ccl_debug_output host_output;
188
+
189
+ evt_copy_debug_output_d2h = details::get_queue(m_queue).memcpy(
190
+ &host_output, debug_output.get(),
191
+ sizeof(device::details::ccl_debug_output), {evt_run_kernel});
192
+
193
+ evt_copy_debug_output_d2h.wait_and_throw();
194
+
195
+ if (host_output.num_oversized_partitions > 0) {
196
+ std::cout << "WARNING: @clusterization_algorithm: "
197
+ << "Clustering encountered "
198
+ << host_output.num_oversized_partitions
199
+ << " oversized partitions; if this number is too large, "
200
+ "it may cause performance problems."
201
+ << std::endl;
202
+ }
203
+ }
204
+
205
+ cl::sycl::event::wait_and_throw(
206
+ {evt_run_kernel, evt_copy_debug_output_d2h});
164
207
165
208
// Return the reconstructed measurements.
166
209
return measurements;
0 commit comments