From 0dc42423036fecf85871e0e705a2c506701ff559 Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Thu, 16 Feb 2023 11:32:40 -0800 Subject: [PATCH 1/4] [SYCL] Add basic tests for non-uniform groups Tests the ability to create an instance of each new group type, and the correctness of the core member functions. Signed-off-by: John Pennycook --- SYCL/NonUniformGroups/ballot_group.cpp | 59 ++++++++++++++++ SYCL/NonUniformGroups/cluster_group.cpp | 63 +++++++++++++++++ SYCL/NonUniformGroups/is_fixed_topology.cpp | 12 ++++ SYCL/NonUniformGroups/is_user_constructed.cpp | 14 ++++ SYCL/NonUniformGroups/opportunistic_group.cpp | 69 +++++++++++++++++++ SYCL/NonUniformGroups/tangle_group.cpp | 69 +++++++++++++++++++ 6 files changed, 286 insertions(+) create mode 100644 SYCL/NonUniformGroups/ballot_group.cpp create mode 100644 SYCL/NonUniformGroups/cluster_group.cpp create mode 100644 SYCL/NonUniformGroups/is_fixed_topology.cpp create mode 100644 SYCL/NonUniformGroups/is_user_constructed.cpp create mode 100644 SYCL/NonUniformGroups/opportunistic_group.cpp create mode 100644 SYCL/NonUniformGroups/tangle_group.cpp diff --git a/SYCL/NonUniformGroups/ballot_group.cpp b/SYCL/NonUniformGroups/ballot_group.cpp new file mode 100644 index 0000000000..903de7689b --- /dev/null +++ b/SYCL/NonUniformGroups/ballot_group.cpp @@ -0,0 +1,59 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// +// UNSUPPORTED: cuda || hip + +#include +#include +namespace syclex = sycl::ext::oneapi::experimental; + +class TestKernel; + +int main() { + sycl::queue Q; + + auto SGSizes = Q.get_device().get_info(); + if (std::find(SGSizes.begin(), SGSizes.end(), 32) == SGSizes.end()) { + std::cout << "Test skipped due to missing support for sub-group size 32." + << std::endl; + return 0; + } + + sycl::buffer MatchBuf{sycl::range{32}}; + sycl::buffer LeaderBuf{sycl::range{32}}; + + const auto NDR = sycl::nd_range<1>{32, 32}; + Q.submit([&](sycl::handler &CGH) { + sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only}; + sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only}; + const auto KernelFunc = + [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] { + auto WI = item.get_global_id(); + auto SG = item.get_sub_group(); + + // Split into odd and even work-items + bool Predicate = item.get_global_id() % 2 == 0; + auto BallotGroup = syclex::get_ballot_group(SG, Predicate); + + // Check function return values match Predicate + bool Match = true; + auto GroupID = (Predicate) ? 1 : 0; + Match &= (BallotGroup.get_group_id() == GroupID); + Match &= (BallotGroup.get_local_id() == SG.get_local_id() / 2); + Match &= (BallotGroup.get_group_range() == 2); + Match &= (BallotGroup.get_local_range() == 16); + MatchAcc[WI] = Match; + LeaderAcc[WI] = BallotGroup.leader(); + }; + CGH.parallel_for(NDR, KernelFunc); + }); + + sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only}; + sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only}; + for (int WI = 0; WI < 32; ++WI) { + assert(MatchAcc[WI] == true); + assert(LeaderAcc[WI] == (WI < 2)); + } + return 0; +} diff --git a/SYCL/NonUniformGroups/cluster_group.cpp b/SYCL/NonUniformGroups/cluster_group.cpp new file mode 100644 index 0000000000..72a38baca2 --- /dev/null +++ b/SYCL/NonUniformGroups/cluster_group.cpp @@ -0,0 +1,63 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// +// UNSUPPORTED: cuda || hip + +#include +#include +namespace syclex = sycl::ext::oneapi::experimental; + +template class TestKernel; + +template void test() { + sycl::queue Q; + + auto SGSizes = Q.get_device().get_info(); + if (std::find(SGSizes.begin(), SGSizes.end(), 32) == SGSizes.end()) { + std::cout << "Test skipped due to missing support for sub-group size 32." + << std::endl; + } + + sycl::buffer MatchBuf{sycl::range{32}}; + sycl::buffer LeaderBuf{sycl::range{32}}; + + const auto NDR = sycl::nd_range<1>{32, 32}; + Q.submit([&](sycl::handler &CGH) { + sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only}; + sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only}; + const auto KernelFunc = + [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] { + auto WI = item.get_global_id(); + auto SG = item.get_sub_group(); + + auto ClusterGroup = syclex::get_cluster_group(SG); + + bool Match = true; + Match &= (ClusterGroup.get_group_id() == (WI / ClusterSize)); + Match &= (ClusterGroup.get_local_id() == (WI % ClusterSize)); + Match &= (ClusterGroup.get_group_range() == (32 / ClusterSize)); + Match &= (ClusterGroup.get_local_range() == ClusterSize); + MatchAcc[WI] = Match; + LeaderAcc[WI] = ClusterGroup.leader(); + }; + CGH.parallel_for>(NDR, KernelFunc); + }); + + sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only}; + sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only}; + for (int WI = 0; WI < 32; ++WI) { + assert(MatchAcc[WI] == true); + assert(LeaderAcc[WI] == ((WI % ClusterSize) == 0)); + } +} + +int main() { + test<1>(); + test<2>(); + test<4>(); + test<8>(); + test<16>(); + test<32>(); + return 0; +} diff --git a/SYCL/NonUniformGroups/is_fixed_topology.cpp b/SYCL/NonUniformGroups/is_fixed_topology.cpp new file mode 100644 index 0000000000..f02f8da6ae --- /dev/null +++ b/SYCL/NonUniformGroups/is_fixed_topology.cpp @@ -0,0 +1,12 @@ +// RUN: %clangxx -fsycl fsyntax-only -fsycl-targets=%sycl_triple %s -o %t.out + +#include +namespace syclex = sycl::ext::oneapi::experimental; + +#ifdef SYCL_EXT_ONEAPI_ROOT_GROUP +static_assert(syclex::is_fixed_topology_group_v); +#endif +static_assert(syclex::is_fixed_topology_group_v>); +static_assert(syclex::is_fixed_topology_group_v>); +static_assert(syclex::is_fixed_topology_group_v>); +static_assert(syclex::is_fixed_topology_group_v); diff --git a/SYCL/NonUniformGroups/is_user_constructed.cpp b/SYCL/NonUniformGroups/is_user_constructed.cpp new file mode 100644 index 0000000000..120b2230ce --- /dev/null +++ b/SYCL/NonUniformGroups/is_user_constructed.cpp @@ -0,0 +1,14 @@ +// RUN: %clangxx -fsycl fsyntax-only -fsycl-targets=%sycl_triple %s -o %t.out + +#include +namespace syclex = sycl::ext::oneapi::experimental; + +static_assert( + syclex::is_user_constructed_group_v>); +static_assert(syclex::is_user_constructed_group_v< + syclex::cluster_group<1, sycl::sub_group>>); +static_assert(syclex::is_user_constructed_group_v< + syclex::cluster_group<2, sycl::sub_group>>); +static_assert( + syclex::is_user_constructed_group_v>); +static_assert(syclex::is_user_constructed_group_v); diff --git a/SYCL/NonUniformGroups/opportunistic_group.cpp b/SYCL/NonUniformGroups/opportunistic_group.cpp new file mode 100644 index 0000000000..56e3008cec --- /dev/null +++ b/SYCL/NonUniformGroups/opportunistic_group.cpp @@ -0,0 +1,69 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// +// UNSUPPORTED: cuda || hip + +#include +#include +namespace syclex = sycl::ext::oneapi::experimental; + +class TestKernel; + +int main() { + sycl::queue Q; + + auto SGSizes = Q.get_device().get_info(); + if (std::find(SGSizes.begin(), SGSizes.end(), 32) == SGSizes.end()) { + std::cout << "Test skipped due to missing support for sub-group size 32." + << std::endl; + return 0; + } + + sycl::buffer MatchBuf{sycl::range{32}}; + sycl::buffer LeaderBuf{sycl::range{32}}; + + const auto NDR = sycl::nd_range<1>{32, 32}; + Q.submit([&](sycl::handler &CGH) { + sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only}; + sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only}; + const auto KernelFunc = + [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] { + auto WI = item.get_global_id(); + auto SG = item.get_sub_group(); + + // Due to the unpredictable runtime behavior of opportunistic groups, + // some values may change from run to run. Check they're in expected + // ranges and consistent with other groups. + if (item.get_global_id() % 2 == 0) { + auto OpportunisticGroup = + syclex::this_kernel::get_opportunistic_group(); + + bool Match = true; + Match &= (OpportunisticGroup.get_group_id() == 0); + Match &= (OpportunisticGroup.get_local_id() < + OpportunisticGroup.get_local_range()); + Match &= (OpportunisticGroup.get_group_range() == 1); + Match &= (OpportunisticGroup.get_local_linear_range() <= + SG.get_local_linear_range()); + MatchAcc[WI] = Match; + LeaderAcc[WI] = OpportunisticGroup.leader(); + } + }; + CGH.parallel_for(NDR, KernelFunc); + }); + + sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only}; + sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only}; + uint32_t NumLeaders = 0; + for (int WI = 0; WI < 32; ++WI) { + if (WI % 2 == 0) { + assert(MatchAcc[WI] == true); + if (LeaderAcc[WI]) { + NumLeaders++; + } + } + } + assert(NumLeaders > 0); + return 0; +} diff --git a/SYCL/NonUniformGroups/tangle_group.cpp b/SYCL/NonUniformGroups/tangle_group.cpp new file mode 100644 index 0000000000..5f7325139b --- /dev/null +++ b/SYCL/NonUniformGroups/tangle_group.cpp @@ -0,0 +1,69 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// +// UNSUPPORTED: cuda || hip + +#include +#include +namespace syclex = sycl::ext::oneapi::experimental; + +class TestKernel; + +int main() { + sycl::queue Q; + + auto SGSizes = Q.get_device().get_info(); + if (std::find(SGSizes.begin(), SGSizes.end(), 32) == SGSizes.end()) { + std::cout << "Test skipped due to missing support for sub-group size 32." + << std::endl; + return 0; + } + + sycl::buffer MatchBuf{sycl::range{32}}; + sycl::buffer LeaderBuf{sycl::range{32}}; + + const auto NDR = sycl::nd_range<1>{32, 32}; + Q.submit([&](sycl::handler &CGH) { + sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only}; + sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only}; + const auto KernelFunc = + [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] { + auto WI = item.get_global_id(); + auto SG = item.get_sub_group(); + + // Split into odd and even work-items via control flow + // Branches deliberately duplicated to test impact of optimizations + if (item.get_global_id() % 2 == 0) { + auto TangleGroup = syclex::get_tangle_group(SG); + + bool Match = true; + Match &= (TangleGroup.get_group_id() == 0); + Match &= (TangleGroup.get_local_id() == SG.get_local_id() / 2); + Match &= (TangleGroup.get_group_range() == 1); + Match &= (TangleGroup.get_local_range() == 16); + MatchAcc[WI] = Match; + LeaderAcc[WI] = TangleGroup.leader(); + } else { + auto TangleGroup = syclex::get_tangle_group(SG); + + bool Match = true; + Match &= (TangleGroup.get_group_id() == 0); + Match &= (TangleGroup.get_local_id() == SG.get_local_id() / 2); + Match &= (TangleGroup.get_group_range() == 1); + Match &= (TangleGroup.get_local_range() == 16); + MatchAcc[WI] = Match; + LeaderAcc[WI] = TangleGroup.leader(); + } + }; + CGH.parallel_for(NDR, KernelFunc); + }); + + sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only}; + sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only}; + for (int WI = 0; WI < 32; ++WI) { + assert(MatchAcc[WI] == true); + assert(LeaderAcc[WI] == (WI < 2)); + } + return 0; +} From cb19d7b941cada11cae64ee92e8e18acbab36b7e Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Tue, 7 Mar 2023 13:03:38 -0800 Subject: [PATCH 2/4] Fix typo in -fsyntax-only flag Signed-off-by: John Pennycook --- SYCL/NonUniformGroups/is_fixed_topology.cpp | 2 +- SYCL/NonUniformGroups/is_user_constructed.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/SYCL/NonUniformGroups/is_fixed_topology.cpp b/SYCL/NonUniformGroups/is_fixed_topology.cpp index f02f8da6ae..b3b6cd5ba4 100644 --- a/SYCL/NonUniformGroups/is_fixed_topology.cpp +++ b/SYCL/NonUniformGroups/is_fixed_topology.cpp @@ -1,4 +1,4 @@ -// RUN: %clangxx -fsycl fsyntax-only -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsyntax-only -fsycl-targets=%sycl_triple %s -o %t.out #include namespace syclex = sycl::ext::oneapi::experimental; diff --git a/SYCL/NonUniformGroups/is_user_constructed.cpp b/SYCL/NonUniformGroups/is_user_constructed.cpp index 120b2230ce..a3f0085d8e 100644 --- a/SYCL/NonUniformGroups/is_user_constructed.cpp +++ b/SYCL/NonUniformGroups/is_user_constructed.cpp @@ -1,4 +1,4 @@ -// RUN: %clangxx -fsycl fsyntax-only -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsyntax-only -fsycl-targets=%sycl_triple %s -o %t.out #include namespace syclex = sycl::ext::oneapi::experimental; From 193803fbfa16e41aa20929c6eda4a5e43bb8a9a0 Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Wed, 8 Mar 2023 08:35:40 -0800 Subject: [PATCH 3/4] Add CPU to unsupported platforms --- SYCL/NonUniformGroups/ballot_group.cpp | 3 +-- SYCL/NonUniformGroups/cluster_group.cpp | 3 +-- SYCL/NonUniformGroups/opportunistic_group.cpp | 3 +-- SYCL/NonUniformGroups/tangle_group.cpp | 3 +-- 4 files changed, 4 insertions(+), 8 deletions(-) diff --git a/SYCL/NonUniformGroups/ballot_group.cpp b/SYCL/NonUniformGroups/ballot_group.cpp index 903de7689b..955744b390 100644 --- a/SYCL/NonUniformGroups/ballot_group.cpp +++ b/SYCL/NonUniformGroups/ballot_group.cpp @@ -1,8 +1,7 @@ // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -// RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out // -// UNSUPPORTED: cuda || hip +// UNSUPPORTED: cpu || cuda || hip #include #include diff --git a/SYCL/NonUniformGroups/cluster_group.cpp b/SYCL/NonUniformGroups/cluster_group.cpp index 72a38baca2..e1d7634191 100644 --- a/SYCL/NonUniformGroups/cluster_group.cpp +++ b/SYCL/NonUniformGroups/cluster_group.cpp @@ -1,8 +1,7 @@ // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -// RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out // -// UNSUPPORTED: cuda || hip +// UNSUPPORTED: cpu || cuda || hip #include #include diff --git a/SYCL/NonUniformGroups/opportunistic_group.cpp b/SYCL/NonUniformGroups/opportunistic_group.cpp index 56e3008cec..925340cee1 100644 --- a/SYCL/NonUniformGroups/opportunistic_group.cpp +++ b/SYCL/NonUniformGroups/opportunistic_group.cpp @@ -1,8 +1,7 @@ // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -// RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out // -// UNSUPPORTED: cuda || hip +// UNSUPPORTED: cpu || cuda || hip #include #include diff --git a/SYCL/NonUniformGroups/tangle_group.cpp b/SYCL/NonUniformGroups/tangle_group.cpp index 5f7325139b..bf102313d0 100644 --- a/SYCL/NonUniformGroups/tangle_group.cpp +++ b/SYCL/NonUniformGroups/tangle_group.cpp @@ -1,8 +1,7 @@ // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -// RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out // -// UNSUPPORTED: cuda || hip +// UNSUPPORTED: cpu || cuda || hip #include #include From 99d088170a078df177aade586794e0c8e794a148 Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Tue, 14 Mar 2023 10:39:20 -0700 Subject: [PATCH 4/4] Disable optimizations in tangle_group test --- SYCL/NonUniformGroups/tangle_group.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/SYCL/NonUniformGroups/tangle_group.cpp b/SYCL/NonUniformGroups/tangle_group.cpp index bf102313d0..172a73ebdc 100644 --- a/SYCL/NonUniformGroups/tangle_group.cpp +++ b/SYCL/NonUniformGroups/tangle_group.cpp @@ -1,4 +1,4 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fno-sycl-early-optimizations -fsycl-targets=%sycl_triple %s -o %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out // // UNSUPPORTED: cpu || cuda || hip @@ -33,6 +33,7 @@ int main() { // Split into odd and even work-items via control flow // Branches deliberately duplicated to test impact of optimizations + // This only reliably works with optimizations disabled right now if (item.get_global_id() % 2 == 0) { auto TangleGroup = syclex::get_tangle_group(SG);