diff --git a/SYCL/GroupAlgorithm/barrier.cpp b/SYCL/GroupAlgorithm/barrier.cpp new file mode 100644 index 0000000000..c4d9238acd --- /dev/null +++ b/SYCL/GroupAlgorithm/barrier.cpp @@ -0,0 +1,157 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_80 +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +// REQUIRES: cuda + +#include "CL/sycl.hpp" +#include +#include + +using namespace sycl; +using namespace sycl::ext::oneapi::experimental::cuda; + +void basic() { + queue q{}; + int N = 64; + std::vector data(N); + for (int i = 0; i < N; i++) { + data[i] = i; + } + { + buffer buf(data.data(), N); + + q.submit([&](handler &cgh) { + auto acc = buf.get_access(cgh); + accessor loc( + N, cgh); + accessor + loc_barrier(2, cgh); + cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> item) { + size_t idx = item.get_local_linear_id(); + loc[idx] = acc[idx]; + if (idx < 2) { + loc_barrier[idx].initialize(N); + } + item.barrier(access::fence_space::local_space); + for (int i = 0; i < N; i++) { + int val = loc[idx]; + barrier::arrival_token arr = loc_barrier[0].arrive(); + val += 1; + int dst_idx = (idx + 1) % N; + loc_barrier[0].wait(arr); + loc[dst_idx] = val; + loc_barrier[1].wait(loc_barrier[1].arrive()); + } + acc[idx] = loc[idx]; + }); + }); + } + for (int i = 0; i < N; i++) { + assert(data[i] == i + N); + } +} + +void interface() { + queue q{}; + int N = 64; + std::vector data(N, -1); + std::vector test1(N, -1); + std::vector test2(N, -1); + for (int i = 0; i < N; i++) { + data[i] = i; + } + { + buffer data_buf(data.data(), N); + buffer test1_buf(test1.data(), N); + buffer test2_buf(test2.data(), N); + + q.submit([&](handler &cgh) { + auto data_acc = data_buf.get_access(cgh); + auto test1_acc = test1_buf.get_access(cgh); + auto test2_acc = test2_buf.get_access(cgh); + accessor loc( + N, cgh); + accessor + loc_barrier(2, cgh); + cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> item) { + size_t idx = item.get_local_linear_id(); + if (idx == 0) { + loc_barrier[0].initialize(N); + } + if (idx == 1) { + loc_barrier[1].initialize(N * N); + } + item.barrier(access::fence_space::local_space); + + item.async_work_group_copy(loc.get_pointer(), data_acc.get_pointer(), + N); + loc_barrier[1].arrive_copy_async(); + barrier::arrival_token arr = loc_barrier[1].arrive_no_complete(N - 1); + loc_barrier[1].arrive_and_wait(); + + if (idx == 0) { + loc_barrier[0].invalidate(); + int *reused_barrier_space = (int *)(void *)loc_barrier.get_pointer(); + *reused_barrier_space = loc[0]; + loc[0] = 0; + } + item.barrier(access::fence_space::local_space); + if (idx == 1) { + int *reused_barrier_space = (int *)(void *)loc_barrier.get_pointer(); + loc[0] = *reused_barrier_space; + } + item.barrier(access::fence_space::local_space); + if (idx == 0) { + loc_barrier[0].initialize(N); + } + + int val = loc[idx]; + arr = loc_barrier[0].arrive(); + val = (val + 1) % N; + int dst_idx = (idx + 1) % N; + loc_barrier[0].wait(arr); + loc[dst_idx] = val; + loc_barrier[0].wait(loc_barrier[0].arrive()); + + item.async_work_group_copy(data_acc.get_pointer(), loc.get_pointer(), + N); + loc_barrier[1].arrive_copy_async_no_inc(); + loc_barrier[1].arrive_no_complete(N - 3); + arr = loc_barrier[1].arrive(); + test1_acc[idx] = loc_barrier[1].test_wait(arr); + arr = loc_barrier[1].arrive(); + item.barrier(access::fence_space::local_space); + test2_acc[idx] = loc_barrier[1].test_wait(arr); + loc_barrier[1].wait(arr); + + loc_barrier[1].arrive_no_complete(N - 6); + loc_barrier[1].arrive_and_drop_no_complete(5); + arr = loc_barrier[1].arrive_and_drop(); + loc_barrier[1].wait(arr); + + for (int i = 0; i < N - 6; i++) { + arr = loc_barrier[1].arrive(); + } + loc_barrier[1].wait(arr); + }); + }); + } + for (int i = 0; i < N; i++) { + assert(data[i] == i); + assert(test1[i] == 0); + assert(test2[i] == 1); + } +} + +int main() { + queue q; + if (!q.get_device().has(aspect::ext_oneapi_cuda_async_barrier)) { + std::cout << "Barrier is not supported by the device. Skipping test." + << std::endl; + return 0; + } + basic(); + interface(); + + return 0; +}