Skip to content

Commit 23b3266

Browse files
committed
Add debug output to clustering algorithms
In #595, I equipped the CCA code with some edge case handling which allows it to handle oversized partitions. Although this makes sure the algorithm works, it also risks to slow down execution. In order to better understand how much performance we might be losing, this commit adds the ability for the SYCL and CUDA algorithms to print some warnings if they ever encounter this edge case.
1 parent 532d885 commit 23b3266

File tree

10 files changed

+171
-21
lines changed

10 files changed

+171
-21
lines changed

core/include/traccc/clusterization/clustering_config.hpp

+7
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,13 @@ struct clustering_config {
5454
*/
5555
unsigned int backup_size_multiplier;
5656

57+
/**
58+
* @brief Flag to enforce debug output.
59+
*
60+
* @warning This will slown down the clustering algorithm.
61+
*/
62+
bool enable_debug_output;
63+
5764
/**
5865
* @brief The maximum number of cells per partition.
5966
*/

device/alpaka/src/clusterization/clusterization_algorithm.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ struct CCLKernel {
6060
partition_start, partition_end, outi, f_view,
6161
gf_view, f_backup_view, gf_backup_view,
6262
adjc_backup_view, adjv_backup_view, backup_mutex,
63-
barry_r, measurements_view, cell_links);
63+
barry_r, measurements_view, cell_links, nullptr);
6464
}
6565
};
6666

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
/**
2+
* traccc library, part of the ACTS project (R&D line)
3+
*
4+
* (c) 2024 CERN for the benefit of the ACTS project
5+
*
6+
* Mozilla Public License Version 2.0
7+
*/
8+
9+
#pragma once
10+
11+
#include <cstdint>
12+
13+
namespace traccc::device::details {
14+
struct ccl_debug_output {
15+
uint32_t num_oversized_partitions;
16+
17+
static ccl_debug_output init() {
18+
ccl_debug_output rv;
19+
20+
rv.num_oversized_partitions = 0;
21+
22+
return rv;
23+
}
24+
};
25+
} // namespace traccc::device::details

device/common/include/traccc/clusterization/device/ccl_kernel.hpp

+4-1
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
// Project include(s).
1111
#include "traccc/clusterization/clustering_config.hpp"
12+
#include "traccc/clusterization/device/ccl_debug_output.hpp"
1213
#include "traccc/clusterization/device/ccl_kernel_definitions.hpp"
1314
#include "traccc/definitions/hints.hpp"
1415
#include "traccc/definitions/qualifiers.hpp"
@@ -53,6 +54,7 @@ namespace traccc::device {
5354
/// @param[out] measurements_view collection of measurements
5455
/// @param[out] cell_links collection of links to measurements each cell is
5556
/// put into
57+
/// @param[out] debug_output debug output location
5658
template <device::concepts::barrier barrier_t,
5759
device::concepts::thread_id1 thread_id_t>
5860
TRACCC_DEVICE inline void ccl_kernel(
@@ -68,7 +70,8 @@ TRACCC_DEVICE inline void ccl_kernel(
6870
vecmem::data::vector_view<details::index_t> adjv_backup_view,
6971
vecmem::device_atomic_ref<uint32_t> backup_mutex, barrier_t& barrier,
7072
measurement_collection_types::view measurements_view,
71-
vecmem::data::vector_view<unsigned int> cell_links);
73+
vecmem::data::vector_view<unsigned int> cell_links,
74+
details::ccl_debug_output* debug_output);
7275

7376
} // namespace traccc::device
7477

device/common/include/traccc/clusterization/device/impl/ccl_kernel.ipp

+10-1
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
#include "traccc/clusterization/clustering_config.hpp"
1313
#include "traccc/clusterization/device/aggregate_cluster.hpp"
14+
#include "traccc/clusterization/device/ccl_debug_output.hpp"
1415
#include "traccc/clusterization/device/ccl_kernel_definitions.hpp"
1516
#include "traccc/clusterization/device/reduce_problem_cell.hpp"
1617
#include "traccc/device/concepts/barrier.hpp"
@@ -220,7 +221,8 @@ TRACCC_DEVICE inline void ccl_kernel(
220221
vecmem::data::vector_view<details::index_t> adjv_backup_view,
221222
vecmem::device_atomic_ref<uint32_t> backup_mutex, barrier_t& barrier,
222223
measurement_collection_types::view measurements_view,
223-
vecmem::data::vector_view<unsigned int> cell_links) {
224+
vecmem::data::vector_view<unsigned int> cell_links,
225+
details::ccl_debug_output* debug_output) {
224226
// Construct device containers around the views.
225227
const cell_collection_types::const_device cells_device(cells_view);
226228
const cell_module_collection_types::const_device modules_device(
@@ -325,6 +327,13 @@ TRACCC_DEVICE inline void ccl_kernel(
325327
if (size > cfg.max_partition_size()) {
326328
if (thread_id.getLocalThreadIdX() == 0) {
327329
lock.lock();
330+
331+
if (debug_output) {
332+
vecmem::device_atomic_ref<uint32_t>
333+
num_oversized_partitions_atm(
334+
debug_output->num_oversized_partitions);
335+
num_oversized_partitions_atm.fetch_add(1);
336+
}
328337
}
329338

330339
barrier.blockBarrier();

device/cuda/src/clusterization/clusterization_algorithm.cu

+51-5
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,16 @@
66
*/
77

88
// CUDA Library include(s).
9+
#include <cuda_runtime_api.h>
10+
#include <driver_types.h>
11+
912
#include "../sanity/contiguous_on.cuh"
1013
#include "../sanity/ordered_on.cuh"
1114
#include "../utils/barrier.hpp"
1215
#include "../utils/cuda_error_handling.hpp"
1316
#include "../utils/utils.hpp"
1417
#include "traccc/clusterization/clustering_config.hpp"
18+
#include "traccc/clusterization/device/ccl_debug_output.hpp"
1519
#include "traccc/clusterization/device/ccl_kernel_definitions.hpp"
1620
#include "traccc/cuda/clusterization/clusterization_algorithm.hpp"
1721
#include "traccc/cuda/utils/thread_id.hpp"
@@ -21,6 +25,9 @@
2125
// Project include(s)
2226
#include "traccc/clusterization/device/ccl_kernel.hpp"
2327

28+
// System include
29+
#include <iostream>
30+
2431
// Vecmem include(s).
2532
#include <cstring>
2633
#include <vecmem/utils/copy.hpp>
@@ -40,7 +47,8 @@ __global__ void ccl_kernel(
4047
vecmem::data::vector_view<device::details::index_t> gf_backup_view,
4148
vecmem::data::vector_view<unsigned char> adjc_backup_view,
4249
vecmem::data::vector_view<device::details::index_t> adjv_backup_view,
43-
unsigned int* backup_mutex_ptr) {
50+
unsigned int* backup_mutex_ptr,
51+
device::details::ccl_debug_output* debug_output) {
4452

4553
__shared__ std::size_t partition_start, partition_end;
4654
__shared__ std::size_t outi;
@@ -62,7 +70,7 @@ __global__ void ccl_kernel(
6270
partition_start, partition_end, outi, f_view, gf_view,
6371
f_backup_view, gf_backup_view, adjc_backup_view,
6472
adjv_backup_view, backup_mutex, barry_r,
65-
measurements_view, cell_links);
73+
measurements_view, cell_links, debug_output);
6674
}
6775

6876
} // namespace kernels
@@ -132,14 +140,52 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()(
132140
assert(m_config.max_cells_per_thread <=
133141
device::details::CELLS_PER_THREAD_STACK_LIMIT);
134142

143+
// If necessary, allocate an object for storing the debug information
144+
vecmem::unique_alloc_ptr<device::details::ccl_debug_output> debug_output;
145+
146+
if (m_config.enable_debug_output) {
147+
debug_output =
148+
vecmem::make_unique_alloc<device::details::ccl_debug_output>(
149+
m_mr.main);
150+
151+
device::details::ccl_debug_output empty_output =
152+
device::details::ccl_debug_output::init();
153+
154+
TRACCC_CUDA_ERROR_CHECK(
155+
cudaMemcpyAsync(debug_output.get(), &empty_output,
156+
sizeof(device::details::ccl_debug_output),
157+
cudaMemcpyHostToDevice, stream));
158+
}
159+
135160
kernels::ccl_kernel<<<num_blocks, m_config.threads_per_partition,
136161
2 * m_config.max_partition_size() *
137162
sizeof(device::details::index_t),
138-
stream>>>(
139-
m_config, cells, modules, measurements, cell_links, m_f_backup,
140-
m_gf_backup, m_adjc_backup, m_adjv_backup, m_backup_mutex.get());
163+
stream>>>(m_config, cells, modules, measurements,
164+
cell_links, m_f_backup, m_gf_backup,
165+
m_adjc_backup, m_adjv_backup,
166+
m_backup_mutex.get(), debug_output.get());
141167
TRACCC_CUDA_ERROR_CHECK(cudaGetLastError());
142168

169+
if (debug_output) {
170+
device::details::ccl_debug_output host_output;
171+
172+
TRACCC_CUDA_ERROR_CHECK(
173+
cudaMemcpyAsync(&host_output, debug_output.get(),
174+
sizeof(device::details::ccl_debug_output),
175+
cudaMemcpyDeviceToHost, stream));
176+
177+
TRACCC_CUDA_ERROR_CHECK(cudaStreamSynchronize(stream));
178+
179+
if (host_output.num_oversized_partitions > 0) {
180+
std::cout << "WARNING: @clusterization_algorithm: "
181+
<< "Clustering encountered "
182+
<< host_output.num_oversized_partitions
183+
<< " oversized partitions; if this number is too large, "
184+
"it may cause performance problems."
185+
<< std::endl;
186+
}
187+
}
188+
143189
// Return the reconstructed measurements.
144190
return measurements;
145191
}

device/sycl/src/clusterization/clusterization_algorithm.sycl

+55-12
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#include "../sanity/ordered_on.hpp"
1111
#include "../utils/barrier.hpp"
1212
#include "../utils/get_queue.hpp"
13+
#include "traccc/clusterization/device/ccl_debug_output.hpp"
1314
#include "traccc/clusterization/device/ccl_kernel_definitions.hpp"
1415
#include "traccc/sycl/clusterization/clusterization_algorithm.hpp"
1516
#include "traccc/sycl/utils/thread_id.hpp"
@@ -113,9 +114,28 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()(
113114
assert(m_config.max_cells_per_thread <=
114115
device::details::CELLS_PER_THREAD_STACK_LIMIT);
115116

117+
// If necessary, allocate an object for storing the debug information
118+
vecmem::unique_alloc_ptr<device::details::ccl_debug_output> debug_output;
119+
cl::sycl::event evt_copy_debug_output_h2d;
120+
121+
if (m_config.enable_debug_output) {
122+
debug_output =
123+
vecmem::make_unique_alloc<device::details::ccl_debug_output>(
124+
m_mr.main);
125+
126+
device::details::ccl_debug_output empty_output =
127+
device::details::ccl_debug_output::init();
128+
129+
evt_copy_debug_output_h2d = details::get_queue(m_queue).memcpy(
130+
debug_output.get(), &empty_output,
131+
sizeof(device::details::ccl_debug_output));
132+
}
133+
116134
// Run ccl kernel
117-
details::get_queue(m_queue)
118-
.submit([&](::sycl::handler& h) {
135+
cl::sycl::event evt_run_kernel =
136+
details::get_queue(m_queue).submit([&](::sycl::handler& h) {
137+
h.depends_on(evt_copy_debug_output_h2d);
138+
119139
// Allocate shared memory for the kernel.
120140
vecmem::sycl::local_accessor<std::size_t> shared_uint(3, h);
121141
vecmem::sycl::local_accessor<device::details::index_t> shared_idx(
@@ -130,8 +150,8 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()(
130150
gf_backup_view = vecmem::get_data(m_gf_backup),
131151
adjc_backup_view = vecmem::get_data(m_adjc_backup),
132152
adjv_backup_view = vecmem::get_data(m_adjv_backup),
133-
mutex_ptr = m_backup_mutex.get(),
134-
cfg = m_config](::sycl::nd_item<1> item) {
153+
mutex_ptr = m_backup_mutex.get(), cfg = m_config,
154+
debug_output = debug_output.get()](::sycl::nd_item<1> item) {
135155
// Construct more readable variable names.
136156
vecmem::data::vector_view<device::details::index_t> f_view{
137157
static_cast<vector_size_t>(cfg.max_partition_size()),
@@ -152,15 +172,38 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()(
152172
const sycl::thread_id1 thread_id(item);
153173

154174
// Run the algorithm for this thread.
155-
device::ccl_kernel(cfg, thread_id, cells_view, modules_view,
156-
partition_start, partition_end, outi,
157-
f_view, gf_view, f_backup_view,
158-
gf_backup_view, adjc_backup_view,
159-
adjv_backup_view, backup_mutex, barry_r,
160-
measurements_view, cell_links_view);
175+
device::ccl_kernel(
176+
cfg, thread_id, cells_view, modules_view,
177+
partition_start, partition_end, outi, f_view, gf_view,
178+
f_backup_view, gf_backup_view, adjc_backup_view,
179+
adjv_backup_view, backup_mutex, barry_r,
180+
measurements_view, cell_links_view, debug_output);
161181
});
162-
})
163-
.wait_and_throw();
182+
});
183+
184+
cl::sycl::event evt_copy_debug_output_d2h;
185+
186+
if (debug_output) {
187+
device::details::ccl_debug_output host_output;
188+
189+
evt_copy_debug_output_d2h = details::get_queue(m_queue).memcpy(
190+
&host_output, debug_output.get(),
191+
sizeof(device::details::ccl_debug_output), {evt_run_kernel});
192+
193+
evt_copy_debug_output_d2h.wait_and_throw();
194+
195+
if (host_output.num_oversized_partitions > 0) {
196+
std::cout << "WARNING: @clusterization_algorithm: "
197+
<< "Clustering encountered "
198+
<< host_output.num_oversized_partitions
199+
<< " oversized partitions; if this number is too large, "
200+
"it may cause performance problems."
201+
<< std::endl;
202+
}
203+
}
204+
205+
cl::sycl::event::wait_and_throw(
206+
{evt_run_kernel, evt_copy_debug_output_d2h});
164207

165208
// Return the reconstructed measurements.
166209
return measurements;

examples/options/include/traccc/options/clusterization.hpp

+1
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ class clusterization
3737
unsigned int max_cells_per_thread;
3838
unsigned int target_cells_per_thread;
3939
unsigned int backup_size_multiplier;
40+
bool enable_debug_output;
4041
/// @}
4142

4243
/// Print the specific options of this class

examples/options/src/clusterization.cpp

+15-1
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,14 @@
1313
// System include(s).
1414
#include <iostream>
1515

16+
namespace {
17+
#ifndef NDEBUG
18+
constexpr bool enable_cca_debug_default = true;
19+
#else
20+
constexpr bool enable_cca_debug_default = false;
21+
#endif
22+
} // namespace
23+
1624
namespace traccc::opts {
1725

1826
clusterization::clusterization() : interface("Clusterization Options") {
@@ -33,6 +41,10 @@ clusterization::clusterization() : interface("Clusterization Options") {
3341
boost::program_options::value(&backup_size_multiplier)
3442
->default_value(256),
3543
"The size multiplier of the backup scratch space");
44+
m_desc.add_options()("cca-debug",
45+
boost::program_options::value(&enable_debug_output)
46+
->default_value(enable_cca_debug_default),
47+
"The size multiplier of the backup scratch space");
3648
}
3749

3850
clusterization::operator clustering_config() const {
@@ -42,6 +54,7 @@ clusterization::operator clustering_config() const {
4254
rv.max_cells_per_thread = max_cells_per_thread;
4355
rv.target_cells_per_thread = target_cells_per_thread;
4456
rv.backup_size_multiplier = backup_size_multiplier;
57+
rv.enable_debug_output = enable_debug_output;
4558

4659
return rv;
4760
}
@@ -54,7 +67,8 @@ std::ostream& clusterization::print_impl(std::ostream& out) const {
5467
out << " Threads per partition: " << threads_per_partition << "\n";
5568
out << " Target cells per thread: " << target_cells_per_thread << "\n";
5669
out << " Max cells per thread: " << max_cells_per_thread << "\n";
57-
out << " Scratch space size mult.: " << backup_size_multiplier;
70+
out << " Scratch space size mult.: " << backup_size_multiplier << "\n";
71+
out << " Debug output printing: " << enable_debug_output << "\n";
5872
return out;
5973
}
6074

tests/common/tests/cca_test.hpp

+2
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ inline traccc::clustering_config default_ccl_test_config() {
4343
rv.max_cells_per_thread = 16;
4444
rv.target_cells_per_thread = 8;
4545
rv.backup_size_multiplier = 256;
46+
rv.enable_debug_output = false;
4647

4748
return rv;
4849
}
@@ -54,6 +55,7 @@ inline traccc::clustering_config tiny_ccl_test_config() {
5455
rv.max_cells_per_thread = 1;
5556
rv.target_cells_per_thread = 1;
5657
rv.backup_size_multiplier = 16384;
58+
rv.enable_debug_output = false;
5759

5860
return rv;
5961
}

0 commit comments

Comments
 (0)