Skip to content

Commit 7cd882e

Browse files
authoredJun 18, 2024
Merge pull request acts-project#610 from stephenswat/feat/barrier_and_count
Add blockwise add and count in CUDA and SYCL
·
2 parents 5e8af79 + fbe62d8 commit 7cd882e

File tree

8 files changed

+335
-3
lines changed

8 files changed

+335
-3
lines changed
 

‎device/cuda/src/utils/barrier.hpp

+6
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,14 @@ struct barrier {
1616
TRACCC_DEVICE
1717
void blockBarrier() { __syncthreads(); }
1818

19+
TRACCC_DEVICE
20+
bool blockAnd(bool predicate) { return __syncthreads_and(predicate); }
21+
1922
TRACCC_DEVICE
2023
bool blockOr(bool predicate) { return __syncthreads_or(predicate); }
24+
25+
TRACCC_DEVICE
26+
int blockCount(bool predicate) { return __syncthreads_count(predicate); }
2127
};
2228

2329
} // namespace traccc::cuda

‎device/sycl/CMakeLists.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@ traccc_add_library( traccc_sycl sycl TYPE SHARED
2525
"include/traccc/sycl/utils/queue_wrapper.hpp"
2626
"include/traccc/sycl/utils/calculate1DimNdRange.hpp"
2727
"include/traccc/sycl/utils/make_prefix_sum_buff.hpp"
28-
"include/traccc/sycl/utils/barrier.hpp"
2928
# implementation files
3029
"src/clusterization/clusterization_algorithm.sycl"
3130
"src/clusterization/spacepoint_formation_algorithm.sycl"
@@ -35,6 +34,7 @@ traccc_add_library( traccc_sycl sycl TYPE SHARED
3534
"src/seeding/seeding_algorithm.cpp"
3635
"src/seeding/spacepoint_binning.sycl"
3736
"src/seeding/track_params_estimation.sycl"
37+
"src/utils/barrier.hpp"
3838
"src/utils/get_queue.hpp"
3939
"src/utils/get_queue.sycl"
4040
"src/utils/queue_wrapper.cpp"

‎device/sycl/src/clusterization/clusterization_algorithm.sycl

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,9 @@
66
*/
77

88
// Local include(s).
9+
#include "../utils/barrier.hpp"
910
#include "../utils/get_queue.hpp"
1011
#include "traccc/sycl/clusterization/clusterization_algorithm.hpp"
11-
#include "traccc/sycl/utils/barrier.hpp"
1212

1313
// Project include(s)
1414
#include "traccc/clusterization/device/ccl_kernel.hpp"

‎device/sycl/include/traccc/sycl/utils/barrier.hpp ‎device/sycl/src/utils/barrier.hpp

+17-1
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@
1010
// Project include(s).
1111
#include "traccc/definitions/qualifiers.hpp"
1212

13+
// SYCL includes
14+
#include <CL/sycl.hpp>
15+
1316
namespace traccc::sycl {
1417

1518
struct barrier {
@@ -18,14 +21,27 @@ struct barrier {
1821
TRACCC_DEVICE
1922
void blockBarrier() { m_item.barrier(); }
2023

24+
TRACCC_DEVICE
25+
bool blockAnd(bool predicate) {
26+
m_item.barrier();
27+
return ::sycl::all_of_group(m_item.get_group(), predicate);
28+
}
29+
2130
TRACCC_DEVICE
2231
bool blockOr(bool predicate) {
2332
m_item.barrier();
2433
return ::sycl::any_of_group(m_item.get_group(), predicate);
2534
}
2635

36+
TRACCC_DEVICE
37+
unsigned int blockCount(bool predicate) {
38+
m_item.barrier();
39+
return ::sycl::reduce_over_group(m_item.get_group(),
40+
predicate ? 1u : 0u, ::sycl::plus<>());
41+
}
42+
2743
private:
2844
::sycl::nd_item<1> m_item;
2945
};
3046

31-
} // namespace traccc::sycl
47+
} // namespace traccc::sycl

‎tests/cuda/CMakeLists.txt

+1
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ traccc_add_test(
2828

2929
# Define the sources for the test.
3030
test_basic.cu
31+
test_barrier.cu
3132
test_cca.cpp
3233
test_ckf_combinatorics_telescope.cpp
3334
test_ckf_toy_detector.cpp

‎tests/cuda/test_barrier.cu

+146
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
/**
2+
* traccc library, part of the ACTS project (R&D line)
3+
*
4+
* (c) 2024 CERN for the benefit of the ACTS project
5+
*
6+
* Mozilla Public License Version 2.0
7+
*/
8+
9+
#include <gtest/gtest.h>
10+
11+
#include <vecmem/memory/cuda/managed_memory_resource.hpp>
12+
#include <vecmem/memory/unique_ptr.hpp>
13+
14+
#include "../../device/cuda/src/utils/barrier.hpp"
15+
16+
__global__ void testBarrierAnd(bool* out) {
17+
traccc::cuda::barrier bar;
18+
19+
bool v;
20+
21+
v = bar.blockAnd(false);
22+
if (threadIdx.x == 0) {
23+
out[0] = v;
24+
}
25+
26+
v = bar.blockAnd(true);
27+
if (threadIdx.x == 0) {
28+
out[1] = v;
29+
}
30+
31+
v = bar.blockAnd(threadIdx.x % 2 == 0);
32+
if (threadIdx.x == 0) {
33+
out[2] = v;
34+
}
35+
36+
v = bar.blockAnd(threadIdx.x < 32);
37+
if (threadIdx.x == 0) {
38+
out[3] = v;
39+
}
40+
}
41+
42+
TEST(CUDABarrier, BarrierAnd) {
43+
vecmem::cuda::managed_memory_resource mr;
44+
constexpr std::size_t n_bools = 4;
45+
46+
vecmem::unique_alloc_ptr<bool[]> out =
47+
vecmem::make_unique_alloc<bool[]>(mr, n_bools);
48+
49+
testBarrierAnd<<<1, 1024>>>(out.get());
50+
51+
ASSERT_EQ(cudaGetLastError(), cudaSuccess);
52+
ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess);
53+
54+
EXPECT_FALSE(out.get()[0]);
55+
EXPECT_TRUE(out.get()[1]);
56+
EXPECT_FALSE(out.get()[2]);
57+
EXPECT_FALSE(out.get()[3]);
58+
}
59+
60+
__global__ void testBarrierOr(bool* out) {
61+
traccc::cuda::barrier bar;
62+
63+
bool v;
64+
65+
v = bar.blockOr(false);
66+
if (threadIdx.x == 0) {
67+
out[0] = v;
68+
}
69+
70+
v = bar.blockOr(true);
71+
if (threadIdx.x == 0) {
72+
out[1] = v;
73+
}
74+
75+
v = bar.blockOr(threadIdx.x % 2 == 0);
76+
if (threadIdx.x == 0) {
77+
out[2] = v;
78+
}
79+
80+
v = bar.blockOr(threadIdx.x < 32);
81+
if (threadIdx.x == 0) {
82+
out[3] = v;
83+
}
84+
}
85+
86+
TEST(CUDABarrier, BarrierOr) {
87+
vecmem::cuda::managed_memory_resource mr;
88+
constexpr std::size_t n_bools = 4;
89+
90+
vecmem::unique_alloc_ptr<bool[]> out =
91+
vecmem::make_unique_alloc<bool[]>(mr, n_bools);
92+
93+
testBarrierOr<<<1, 1024>>>(out.get());
94+
95+
ASSERT_EQ(cudaGetLastError(), cudaSuccess);
96+
ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess);
97+
98+
EXPECT_FALSE(out.get()[0]);
99+
EXPECT_TRUE(out.get()[1]);
100+
EXPECT_TRUE(out.get()[2]);
101+
EXPECT_TRUE(out.get()[3]);
102+
}
103+
104+
__global__ void testBarrierCount(int* out) {
105+
traccc::cuda::barrier bar;
106+
107+
int v;
108+
109+
v = bar.blockOr(false);
110+
if (threadIdx.x == 0) {
111+
out[0] = v;
112+
}
113+
114+
v = bar.blockOr(true);
115+
if (threadIdx.x == 0) {
116+
out[1] = v;
117+
}
118+
119+
v = bar.blockOr(threadIdx.x % 2 == 0);
120+
if (threadIdx.x == 0) {
121+
out[2] = v;
122+
}
123+
124+
v = bar.blockOr(threadIdx.x < 32);
125+
if (threadIdx.x == 0) {
126+
out[3] = v;
127+
}
128+
}
129+
130+
TEST(CUDABarrier, BarrierCount) {
131+
vecmem::cuda::managed_memory_resource mr;
132+
constexpr std::size_t n_ints = 4;
133+
134+
vecmem::unique_alloc_ptr<int[]> out =
135+
vecmem::make_unique_alloc<int[]>(mr, n_ints);
136+
137+
testBarrierCount<<<1, 1024>>>(out.get());
138+
139+
ASSERT_EQ(cudaGetLastError(), cudaSuccess);
140+
ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess);
141+
142+
EXPECT_EQ(out.get()[0], 0);
143+
EXPECT_EQ(out.get()[1], 1024);
144+
EXPECT_EQ(out.get()[2], 512);
145+
EXPECT_EQ(out.get()[3], 32);
146+
}

‎tests/sycl/CMakeLists.txt

+1
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ traccc_add_test(
1515
test_kalman_fitter_telescope.sycl
1616
test_clusterization.sycl
1717
test_spacepoint_formation.sycl
18+
test_barrier.sycl
1819

1920
LINK_LIBRARIES
2021
GTest::gtest_main

‎tests/sycl/test_barrier.sycl

+162
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
/**
2+
* traccc library, part of the ACTS project (R&D line)
3+
*
4+
* (c) 2024 CERN for the benefit of the ACTS project
5+
*
6+
* Mozilla Public License Version 2.0
7+
*/
8+
9+
#include <gtest/gtest.h>
10+
11+
#include <CL/sycl.hpp>
12+
#include <vecmem/memory/sycl/shared_memory_resource.hpp>
13+
#include <vecmem/memory/unique_ptr.hpp>
14+
15+
#include "../../device/sycl/src/utils/barrier.hpp"
16+
17+
TEST(SYCLBarrier, BarrierAnd) {
18+
vecmem::sycl::shared_memory_resource mr;
19+
20+
cl::sycl::queue queue;
21+
constexpr std::size_t n_bools = 4;
22+
23+
vecmem::unique_alloc_ptr<bool[]> out =
24+
vecmem::make_unique_alloc<bool[]>(mr, n_bools);
25+
26+
cl::sycl::nd_range test_range(cl::sycl::range<1>(128),
27+
cl::sycl::range<1>(128));
28+
29+
queue
30+
.submit([&, out = out.get()](cl::sycl::handler &h) {
31+
h.parallel_for<class BarrierAndTest>(
32+
test_range, [=](cl::sycl::nd_item<1> item) {
33+
traccc::sycl::barrier bar(item);
34+
35+
bool v;
36+
37+
v = bar.blockAnd(false);
38+
if (item.get_local_id() == 0) {
39+
out[0] = v;
40+
}
41+
42+
v = bar.blockAnd(true);
43+
if (item.get_local_id() == 0) {
44+
out[1] = v;
45+
}
46+
47+
v = bar.blockAnd(item.get_local_id() % 2 == 0);
48+
if (item.get_local_id() == 0) {
49+
out[2] = v;
50+
}
51+
52+
v = bar.blockAnd(item.get_local_id() < 32);
53+
if (item.get_local_id() == 0) {
54+
out[3] = v;
55+
}
56+
});
57+
})
58+
.wait_and_throw();
59+
60+
EXPECT_FALSE(out.get()[0]);
61+
EXPECT_TRUE(out.get()[1]);
62+
EXPECT_FALSE(out.get()[2]);
63+
EXPECT_FALSE(out.get()[3]);
64+
}
65+
66+
TEST(SYCLBarrier, BarrierOr) {
67+
vecmem::sycl::shared_memory_resource mr;
68+
69+
cl::sycl::queue queue;
70+
constexpr std::size_t n_bools = 4;
71+
72+
vecmem::unique_alloc_ptr<bool[]> out =
73+
vecmem::make_unique_alloc<bool[]>(mr, n_bools);
74+
75+
cl::sycl::nd_range test_range(cl::sycl::range<1>(128),
76+
cl::sycl::range<1>(128));
77+
78+
queue
79+
.submit([&, out = out.get()](cl::sycl::handler &h) {
80+
h.parallel_for<class BarrierOrTest>(
81+
test_range, [=](cl::sycl::nd_item<1> item) {
82+
traccc::sycl::barrier bar(item);
83+
84+
bool v;
85+
86+
v = bar.blockOr(false);
87+
if (item.get_local_id() == 0) {
88+
out[0] = v;
89+
}
90+
91+
v = bar.blockOr(true);
92+
if (item.get_local_id() == 0) {
93+
out[1] = v;
94+
}
95+
96+
v = bar.blockOr(item.get_local_id() % 2 == 0);
97+
if (item.get_local_id() == 0) {
98+
out[2] = v;
99+
}
100+
101+
v = bar.blockOr(item.get_local_id() < 32);
102+
if (item.get_local_id() == 0) {
103+
out[3] = v;
104+
}
105+
});
106+
})
107+
.wait_and_throw();
108+
109+
EXPECT_FALSE(out.get()[0]);
110+
EXPECT_TRUE(out.get()[1]);
111+
EXPECT_TRUE(out.get()[2]);
112+
EXPECT_TRUE(out.get()[3]);
113+
}
114+
115+
TEST(SYCLBarrier, BarrierCount) {
116+
vecmem::sycl::shared_memory_resource mr;
117+
118+
cl::sycl::queue queue;
119+
constexpr std::size_t n_ints = 4;
120+
121+
vecmem::unique_alloc_ptr<int[]> out =
122+
vecmem::make_unique_alloc<int[]>(mr, n_ints);
123+
124+
cl::sycl::nd_range test_range(cl::sycl::range<1>(128),
125+
cl::sycl::range<1>(128));
126+
127+
queue
128+
.submit([&, out = out.get()](cl::sycl::handler &h) {
129+
h.parallel_for<class BarrierCountTest>(
130+
test_range, [=](cl::sycl::nd_item<1> item) {
131+
traccc::sycl::barrier bar(item);
132+
133+
int v;
134+
135+
v = bar.blockCount(false);
136+
if (item.get_local_id() == 0) {
137+
out[0] = v;
138+
}
139+
140+
v = bar.blockCount(true);
141+
if (item.get_local_id() == 0) {
142+
out[1] = v;
143+
}
144+
145+
v = bar.blockCount(item.get_local_id() % 2 == 0);
146+
if (item.get_local_id() == 0) {
147+
out[2] = v;
148+
}
149+
150+
v = bar.blockCount(item.get_local_id() < 32);
151+
if (item.get_local_id() == 0) {
152+
out[3] = v;
153+
}
154+
});
155+
})
156+
.wait_and_throw();
157+
158+
EXPECT_EQ(out.get()[0], 0);
159+
EXPECT_EQ(out.get()[1], 128);
160+
EXPECT_EQ(out.get()[2], 64);
161+
EXPECT_EQ(out.get()[3], 32);
162+
}

0 commit comments

Comments
 (0)
Please sign in to comment.