From 72ee78b14db5fc922d1175d1dfdf0658d2b27eda Mon Sep 17 00:00:00 2001 From: Tadej Ciglaric Date: Thu, 13 Jan 2022 11:57:16 +0000 Subject: [PATCH 1/7] added tests for async barrier --- SYCL/GroupAlgorithm/barrier.cpp | 157 ++++++++++++++++++++++++++++++++ 1 file changed, 157 insertions(+) create mode 100755 SYCL/GroupAlgorithm/barrier.cpp diff --git a/SYCL/GroupAlgorithm/barrier.cpp b/SYCL/GroupAlgorithm/barrier.cpp new file mode 100755 index 0000000000..4498ea2a8c --- /dev/null +++ b/SYCL/GroupAlgorithm/barrier.cpp @@ -0,0 +1,157 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_80 +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// XFAIL: opencl, esimd, level0, hip + +#include "CL/sycl.hpp" +//#include +#include +#include + +using namespace sycl; +using namespace sycl::ext::oneapi; + +void basic() { + queue q{}; + int N=64; + std::vector data(N); + for(int i=0;i buf(data.data(), N); + + q.submit([&](handler &cgh) { + auto acc = buf.get_access(cgh); + accessor + loc(N, cgh); + accessor + loc_barrier(2, cgh); + cgh.parallel_for( + nd_range<1>(N, N), [=](nd_item<1> item) { + size_t idx = item.get_local_linear_id(); + loc[idx]=acc[idx]; + if(idx<=2){ + loc_barrier[idx].initialize(N); + } + item.barrier(access::fence_space::local_space); + for(int i=0;i data(N,-1); + std::vector test1(N,-1); + std::vector test2(N,-1); + for(int i=0;i data_buf(data.data(), N); + buffer test1_buf(test1.data(), N); + buffer test2_buf(test2.data(), N); + + q.submit([&](handler &cgh) { + auto data_acc = data_buf.get_access(cgh); + auto test1_acc = test1_buf.get_access(cgh); + auto test2_acc = test2_buf.get_access(cgh); + accessor + loc(N, cgh); + accessor + loc_barrier(2, cgh); + stream s(1024*1024,1024,cgh); + cgh.parallel_for( + nd_range<1>(N, N), [=](nd_item<1> item) { + size_t idx = item.get_local_linear_id(); + if(idx==0){ + loc_barrier[0].initialize(N); + } + if(idx==1){ + loc_barrier[1].initialize(N*N); + } + item.barrier(access::fence_space::local_space); + + item.async_work_group_copy(loc.get_pointer(), data_acc.get_pointer(), N); + loc_barrier[1].arrive_copy_async(); + barrier::arrival_token arr = loc_barrier[1].arrive_no_complete(N-1); + loc_barrier[1].arrive_and_wait(); + + if(idx==0){ + loc_barrier[0].invalidate(); + int* reused_barrier_space = (int*)(void*)loc_barrier.get_pointer(); + *reused_barrier_space=loc[0]; + loc[0]=0; + } + item.barrier(access::fence_space::local_space); + if(idx==1){ + int* reused_barrier_space = (int*)(void*)loc_barrier.get_pointer(); + loc[0] = *reused_barrier_space; + } + item.barrier(access::fence_space::local_space); + if(idx==0){ + loc_barrier[0].initialize(N); + } + + int val = loc[idx]; + arr = loc_barrier[0].arrive(); + val = (val + 1) % N; + int dst_idx = (idx + 1) % N; + loc_barrier[0].wait(arr); + loc[dst_idx] = val; + loc_barrier[0].wait(loc_barrier[0].arrive()); + + item.async_work_group_copy(data_acc.get_pointer(), loc.get_pointer(), N); + loc_barrier[1].arrive_copy_async_no_inc(); + loc_barrier[1].arrive_no_complete(N-3); + arr = loc_barrier[1].arrive(); + test1_acc[idx] = loc_barrier[1].test_wait(arr); + arr = loc_barrier[1].arrive(); + item.barrier(access::fence_space::local_space); + test2_acc[idx] = loc_barrier[1].test_wait(arr); + loc_barrier[1].wait(arr); + + loc_barrier[1].arrive_no_complete(N-6); + loc_barrier[1].arrive_and_drop_no_complete(5); + arr = loc_barrier[1].arrive_and_drop(); + loc_barrier[1].wait(arr); + + for(int i=0;i Date: Thu, 13 Jan 2022 14:10:02 +0000 Subject: [PATCH 2/7] format --- SYCL/GroupAlgorithm/barrier.cpp | 206 ++++++++++++++++---------------- 1 file changed, 102 insertions(+), 104 deletions(-) diff --git a/SYCL/GroupAlgorithm/barrier.cpp b/SYCL/GroupAlgorithm/barrier.cpp index 4498ea2a8c..9fdb808d8f 100755 --- a/SYCL/GroupAlgorithm/barrier.cpp +++ b/SYCL/GroupAlgorithm/barrier.cpp @@ -7,62 +7,61 @@ #include "CL/sycl.hpp" //#include -#include #include +#include using namespace sycl; using namespace sycl::ext::oneapi; void basic() { queue q{}; - int N=64; + int N = 64; std::vector data(N); - for(int i=0;i buf(data.data(), N); q.submit([&](handler &cgh) { auto acc = buf.get_access(cgh); - accessor - loc(N, cgh); + accessor loc( + N, cgh); accessor loc_barrier(2, cgh); - cgh.parallel_for( - nd_range<1>(N, N), [=](nd_item<1> item) { - size_t idx = item.get_local_linear_id(); - loc[idx]=acc[idx]; - if(idx<=2){ - loc_barrier[idx].initialize(N); - } - item.barrier(access::fence_space::local_space); - for(int i=0;i(N, N), [=](nd_item<1> item) { + size_t idx = item.get_local_linear_id(); + loc[idx] = acc[idx]; + if (idx <= 2) { + loc_barrier[idx].initialize(N); + } + item.barrier(access::fence_space::local_space); + for (int i = 0; i < N; i++) { + int val = loc[idx]; + barrier::arrival_token arr = loc_barrier[0].arrive(); + val += 1; + int dst_idx = (idx + 1) % N; + loc_barrier[0].wait(arr); + loc[dst_idx] = val; + loc_barrier[1].wait(loc_barrier[1].arrive()); + } + acc[idx] = loc[idx]; + }); }); } - for(int i=0;i data(N,-1); - std::vector test1(N,-1); - std::vector test2(N,-1); - for(int i=0;i data(N, -1); + std::vector test1(N, -1); + std::vector test2(N, -1); + for (int i = 0; i < N; i++) { + data[i] = i; } { buffer data_buf(data.data(), N); @@ -73,82 +72,81 @@ void interface() { auto data_acc = data_buf.get_access(cgh); auto test1_acc = test1_buf.get_access(cgh); auto test2_acc = test2_buf.get_access(cgh); - accessor - loc(N, cgh); + accessor loc( + N, cgh); accessor loc_barrier(2, cgh); - stream s(1024*1024,1024,cgh); - cgh.parallel_for( - nd_range<1>(N, N), [=](nd_item<1> item) { - size_t idx = item.get_local_linear_id(); - if(idx==0){ - loc_barrier[0].initialize(N); - } - if(idx==1){ - loc_barrier[1].initialize(N*N); - } - item.barrier(access::fence_space::local_space); - - item.async_work_group_copy(loc.get_pointer(), data_acc.get_pointer(), N); - loc_barrier[1].arrive_copy_async(); - barrier::arrival_token arr = loc_barrier[1].arrive_no_complete(N-1); - loc_barrier[1].arrive_and_wait(); - - if(idx==0){ - loc_barrier[0].invalidate(); - int* reused_barrier_space = (int*)(void*)loc_barrier.get_pointer(); - *reused_barrier_space=loc[0]; - loc[0]=0; - } - item.barrier(access::fence_space::local_space); - if(idx==1){ - int* reused_barrier_space = (int*)(void*)loc_barrier.get_pointer(); - loc[0] = *reused_barrier_space; - } - item.barrier(access::fence_space::local_space); - if(idx==0){ - loc_barrier[0].initialize(N); - } - - int val = loc[idx]; - arr = loc_barrier[0].arrive(); - val = (val + 1) % N; - int dst_idx = (idx + 1) % N; - loc_barrier[0].wait(arr); - loc[dst_idx] = val; - loc_barrier[0].wait(loc_barrier[0].arrive()); - - item.async_work_group_copy(data_acc.get_pointer(), loc.get_pointer(), N); - loc_barrier[1].arrive_copy_async_no_inc(); - loc_barrier[1].arrive_no_complete(N-3); - arr = loc_barrier[1].arrive(); - test1_acc[idx] = loc_barrier[1].test_wait(arr); - arr = loc_barrier[1].arrive(); - item.barrier(access::fence_space::local_space); - test2_acc[idx] = loc_barrier[1].test_wait(arr); - loc_barrier[1].wait(arr); - - loc_barrier[1].arrive_no_complete(N-6); - loc_barrier[1].arrive_and_drop_no_complete(5); - arr = loc_barrier[1].arrive_and_drop(); - loc_barrier[1].wait(arr); - - for(int i=0;i(N, N), [=](nd_item<1> item) { + size_t idx = item.get_local_linear_id(); + if (idx == 0) { + loc_barrier[0].initialize(N); + } + if (idx == 1) { + loc_barrier[1].initialize(N * N); + } + item.barrier(access::fence_space::local_space); + + item.async_work_group_copy(loc.get_pointer(), data_acc.get_pointer(), + N); + loc_barrier[1].arrive_copy_async(); + barrier::arrival_token arr = loc_barrier[1].arrive_no_complete(N - 1); + loc_barrier[1].arrive_and_wait(); + + if (idx == 0) { + loc_barrier[0].invalidate(); + int *reused_barrier_space = (int *)(void *)loc_barrier.get_pointer(); + *reused_barrier_space = loc[0]; + loc[0] = 0; + } + item.barrier(access::fence_space::local_space); + if (idx == 1) { + int *reused_barrier_space = (int *)(void *)loc_barrier.get_pointer(); + loc[0] = *reused_barrier_space; + } + item.barrier(access::fence_space::local_space); + if (idx == 0) { + loc_barrier[0].initialize(N); + } + + int val = loc[idx]; + arr = loc_barrier[0].arrive(); + val = (val + 1) % N; + int dst_idx = (idx + 1) % N; + loc_barrier[0].wait(arr); + loc[dst_idx] = val; + loc_barrier[0].wait(loc_barrier[0].arrive()); + + item.async_work_group_copy(data_acc.get_pointer(), loc.get_pointer(), + N); + loc_barrier[1].arrive_copy_async_no_inc(); + loc_barrier[1].arrive_no_complete(N - 3); + arr = loc_barrier[1].arrive(); + test1_acc[idx] = loc_barrier[1].test_wait(arr); + arr = loc_barrier[1].arrive(); + item.barrier(access::fence_space::local_space); + test2_acc[idx] = loc_barrier[1].test_wait(arr); + loc_barrier[1].wait(arr); + + loc_barrier[1].arrive_no_complete(N - 6); + loc_barrier[1].arrive_and_drop_no_complete(5); + arr = loc_barrier[1].arrive_and_drop(); + loc_barrier[1].wait(arr); + + for (int i = 0; i < N - 6; i++) { + arr = loc_barrier[1].arrive(); + } + loc_barrier[1].wait(arr); + }); }); } - for(int i=0;i Date: Wed, 26 Jan 2022 07:34:37 +0000 Subject: [PATCH 3/7] updated namespace --- SYCL/GroupAlgorithm/barrier.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SYCL/GroupAlgorithm/barrier.cpp b/SYCL/GroupAlgorithm/barrier.cpp index 9fdb808d8f..550e3e0e1e 100755 --- a/SYCL/GroupAlgorithm/barrier.cpp +++ b/SYCL/GroupAlgorithm/barrier.cpp @@ -11,7 +11,7 @@ #include using namespace sycl; -using namespace sycl::ext::oneapi; +using namespace sycl::ext::oneapi::cuda; void basic() { queue q{}; From a1f3aae52855e46bbe65e3b5edfedffe569c4971 Mon Sep 17 00:00:00 2001 From: Tadej Ciglaric Date: Wed, 9 Feb 2022 09:55:03 +0000 Subject: [PATCH 4/7] adapt tests to making the extension CUDA-only --- SYCL/GroupAlgorithm/barrier.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/SYCL/GroupAlgorithm/barrier.cpp b/SYCL/GroupAlgorithm/barrier.cpp index 550e3e0e1e..da296f1d55 100755 --- a/SYCL/GroupAlgorithm/barrier.cpp +++ b/SYCL/GroupAlgorithm/barrier.cpp @@ -1,12 +1,9 @@ // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_80 -// RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out -// RUN: %ACC_RUN_PLACEHOLDER %t.out -// XFAIL: opencl, esimd, level0, hip +// REQUIRES: cuda #include "CL/sycl.hpp" -//#include #include #include From 08755e12bfa9cd6ebacb0a8e99386e2e054b8441 Mon Sep 17 00:00:00 2001 From: Tadej Ciglaric Date: Tue, 8 Mar 2022 08:46:42 +0000 Subject: [PATCH 5/7] changed namespace and added aspect check --- SYCL/GroupAlgorithm/barrier.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/SYCL/GroupAlgorithm/barrier.cpp b/SYCL/GroupAlgorithm/barrier.cpp index da296f1d55..0621f3f618 100755 --- a/SYCL/GroupAlgorithm/barrier.cpp +++ b/SYCL/GroupAlgorithm/barrier.cpp @@ -8,7 +8,7 @@ #include using namespace sycl; -using namespace sycl::ext::oneapi::cuda; +using namespace sycl::ext::oneapi::experimental::cuda; void basic() { queue q{}; @@ -145,6 +145,11 @@ void interface() { } int main() { + queue q; + if(!q.get_device().has(aspect::ext_oneapi_cuda_async_barrier)){ + std::cout << "Barrier is not supported by the device. Skipping test." << std::endl; + return 0; + } basic(); interface(); From ef5e17d10799cc5f7d291d53ffa2a541a224d55d Mon Sep 17 00:00:00 2001 From: Tadej Ciglaric Date: Tue, 8 Mar 2022 08:54:22 +0000 Subject: [PATCH 6/7] format --- SYCL/GroupAlgorithm/barrier.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) mode change 100755 => 100644 SYCL/GroupAlgorithm/barrier.cpp diff --git a/SYCL/GroupAlgorithm/barrier.cpp b/SYCL/GroupAlgorithm/barrier.cpp old mode 100755 new mode 100644 index 0621f3f618..fdf97c2271 --- a/SYCL/GroupAlgorithm/barrier.cpp +++ b/SYCL/GroupAlgorithm/barrier.cpp @@ -146,8 +146,9 @@ void interface() { int main() { queue q; - if(!q.get_device().has(aspect::ext_oneapi_cuda_async_barrier)){ - std::cout << "Barrier is not supported by the device. Skipping test." << std::endl; + if (!q.get_device().has(aspect::ext_oneapi_cuda_async_barrier)) { + std::cout << "Barrier is not supported by the device. Skipping test." + << std::endl; return 0; } basic(); From 1a7173166d3ddb3970424ba1bfd5883b3e7f092d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= Date: Tue, 3 May 2022 13:37:25 +0100 Subject: [PATCH 7/7] addressed review comments --- SYCL/GroupAlgorithm/barrier.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/SYCL/GroupAlgorithm/barrier.cpp b/SYCL/GroupAlgorithm/barrier.cpp index fdf97c2271..c4d9238acd 100644 --- a/SYCL/GroupAlgorithm/barrier.cpp +++ b/SYCL/GroupAlgorithm/barrier.cpp @@ -29,7 +29,7 @@ void basic() { cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> item) { size_t idx = item.get_local_linear_id(); loc[idx] = acc[idx]; - if (idx <= 2) { + if (idx < 2) { loc_barrier[idx].initialize(N); } item.barrier(access::fence_space::local_space); @@ -73,7 +73,6 @@ void interface() { N, cgh); accessor loc_barrier(2, cgh); - stream s(1024 * 1024, 1024, cgh); cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> item) { size_t idx = item.get_local_linear_id(); if (idx == 0) { @@ -155,4 +154,4 @@ int main() { interface(); return 0; -} \ No newline at end of file +}