Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

[SYCL] Add basic tests for non-uniform groups #1574

Open
wants to merge 4 commits into
base: intel
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 58 additions & 0 deletions SYCL/NonUniformGroups/ballot_group.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
//
// UNSUPPORTED: cpu || cuda || hip

#include <sycl/sycl.hpp>
#include <vector>
namespace syclex = sycl::ext::oneapi::experimental;

class TestKernel;

int main() {
sycl::queue Q;

auto SGSizes = Q.get_device().get_info<sycl::info::device::sub_group_sizes>();
if (std::find(SGSizes.begin(), SGSizes.end(), 32) == SGSizes.end()) {
std::cout << "Test skipped due to missing support for sub-group size 32."
<< std::endl;
return 0;
}

sycl::buffer<bool, 1> MatchBuf{sycl::range{32}};
sycl::buffer<bool, 1> LeaderBuf{sycl::range{32}};

const auto NDR = sycl::nd_range<1>{32, 32};
Q.submit([&](sycl::handler &CGH) {
sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only};
sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only};
const auto KernelFunc =
[=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] {
auto WI = item.get_global_id();
auto SG = item.get_sub_group();

// Split into odd and even work-items
bool Predicate = item.get_global_id() % 2 == 0;
auto BallotGroup = syclex::get_ballot_group(SG, Predicate);

// Check function return values match Predicate
bool Match = true;
auto GroupID = (Predicate) ? 1 : 0;
Match &= (BallotGroup.get_group_id() == GroupID);
Match &= (BallotGroup.get_local_id() == SG.get_local_id() / 2);
Match &= (BallotGroup.get_group_range() == 2);
Match &= (BallotGroup.get_local_range() == 16);
MatchAcc[WI] = Match;
LeaderAcc[WI] = BallotGroup.leader();
};
CGH.parallel_for<TestKernel>(NDR, KernelFunc);
});

sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only};
sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only};
for (int WI = 0; WI < 32; ++WI) {
assert(MatchAcc[WI] == true);
assert(LeaderAcc[WI] == (WI < 2));
}
return 0;
}
62 changes: 62 additions & 0 deletions SYCL/NonUniformGroups/cluster_group.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
//
// UNSUPPORTED: cpu || cuda || hip

#include <sycl/sycl.hpp>
#include <vector>
namespace syclex = sycl::ext::oneapi::experimental;

template <size_t ClusterSize> class TestKernel;

template <size_t ClusterSize> void test() {
sycl::queue Q;

auto SGSizes = Q.get_device().get_info<sycl::info::device::sub_group_sizes>();
if (std::find(SGSizes.begin(), SGSizes.end(), 32) == SGSizes.end()) {
std::cout << "Test skipped due to missing support for sub-group size 32."
<< std::endl;
}

sycl::buffer<bool, 1> MatchBuf{sycl::range{32}};
sycl::buffer<bool, 1> LeaderBuf{sycl::range{32}};

const auto NDR = sycl::nd_range<1>{32, 32};
Q.submit([&](sycl::handler &CGH) {
sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only};
sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only};
const auto KernelFunc =
[=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] {
auto WI = item.get_global_id();
auto SG = item.get_sub_group();

auto ClusterGroup = syclex::get_cluster_group<ClusterSize>(SG);

bool Match = true;
Match &= (ClusterGroup.get_group_id() == (WI / ClusterSize));
Match &= (ClusterGroup.get_local_id() == (WI % ClusterSize));
Match &= (ClusterGroup.get_group_range() == (32 / ClusterSize));
Match &= (ClusterGroup.get_local_range() == ClusterSize);
MatchAcc[WI] = Match;
LeaderAcc[WI] = ClusterGroup.leader();
};
CGH.parallel_for<TestKernel<ClusterSize>>(NDR, KernelFunc);
});

sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only};
sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only};
for (int WI = 0; WI < 32; ++WI) {
assert(MatchAcc[WI] == true);
assert(LeaderAcc[WI] == ((WI % ClusterSize) == 0));
}
}

int main() {
test<1>();
test<2>();
test<4>();
test<8>();
test<16>();
test<32>();
return 0;
}
12 changes: 12 additions & 0 deletions SYCL/NonUniformGroups/is_fixed_topology.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
// RUN: %clangxx -fsycl -fsyntax-only -fsycl-targets=%sycl_triple %s -o %t.out

#include <sycl/sycl.hpp>
namespace syclex = sycl::ext::oneapi::experimental;

#ifdef SYCL_EXT_ONEAPI_ROOT_GROUP
static_assert(syclex::is_fixed_topology_group_v<syclex::root_group>);
#endif
static_assert(syclex::is_fixed_topology_group_v<sycl::group<1>>);
static_assert(syclex::is_fixed_topology_group_v<sycl::group<2>>);
static_assert(syclex::is_fixed_topology_group_v<sycl::group<3>>);
static_assert(syclex::is_fixed_topology_group_v<sycl::sub_group>);
14 changes: 14 additions & 0 deletions SYCL/NonUniformGroups/is_user_constructed.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
// RUN: %clangxx -fsycl -fsyntax-only -fsycl-targets=%sycl_triple %s -o %t.out

#include <sycl/sycl.hpp>
namespace syclex = sycl::ext::oneapi::experimental;

static_assert(
syclex::is_user_constructed_group_v<syclex::ballot_group<sycl::sub_group>>);
static_assert(syclex::is_user_constructed_group_v<
syclex::cluster_group<1, sycl::sub_group>>);
static_assert(syclex::is_user_constructed_group_v<
syclex::cluster_group<2, sycl::sub_group>>);
static_assert(
syclex::is_user_constructed_group_v<syclex::tangle_group<sycl::sub_group>>);
static_assert(syclex::is_user_constructed_group_v<syclex::opportunistic_group>);
68 changes: 68 additions & 0 deletions SYCL/NonUniformGroups/opportunistic_group.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
//
// UNSUPPORTED: cpu || cuda || hip

#include <sycl/sycl.hpp>
#include <vector>
namespace syclex = sycl::ext::oneapi::experimental;

class TestKernel;

int main() {
sycl::queue Q;

auto SGSizes = Q.get_device().get_info<sycl::info::device::sub_group_sizes>();
if (std::find(SGSizes.begin(), SGSizes.end(), 32) == SGSizes.end()) {
std::cout << "Test skipped due to missing support for sub-group size 32."
<< std::endl;
return 0;
}

sycl::buffer<bool, 1> MatchBuf{sycl::range{32}};
sycl::buffer<bool, 1> LeaderBuf{sycl::range{32}};

const auto NDR = sycl::nd_range<1>{32, 32};
Q.submit([&](sycl::handler &CGH) {
sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only};
sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only};
const auto KernelFunc =
[=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] {
auto WI = item.get_global_id();
auto SG = item.get_sub_group();

// Due to the unpredictable runtime behavior of opportunistic groups,
// some values may change from run to run. Check they're in expected
// ranges and consistent with other groups.
if (item.get_global_id() % 2 == 0) {
auto OpportunisticGroup =
syclex::this_kernel::get_opportunistic_group();

bool Match = true;
Match &= (OpportunisticGroup.get_group_id() == 0);
Match &= (OpportunisticGroup.get_local_id() <
OpportunisticGroup.get_local_range());
Match &= (OpportunisticGroup.get_group_range() == 1);
Match &= (OpportunisticGroup.get_local_linear_range() <=
SG.get_local_linear_range());
MatchAcc[WI] = Match;
LeaderAcc[WI] = OpportunisticGroup.leader();
}
};
CGH.parallel_for<TestKernel>(NDR, KernelFunc);
});

sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only};
sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only};
uint32_t NumLeaders = 0;
for (int WI = 0; WI < 32; ++WI) {
if (WI % 2 == 0) {
assert(MatchAcc[WI] == true);
if (LeaderAcc[WI]) {
NumLeaders++;
}
}
}
assert(NumLeaders > 0);
return 0;
}
69 changes: 69 additions & 0 deletions SYCL/NonUniformGroups/tangle_group.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
// RUN: %clangxx -fsycl -fno-sycl-early-optimizations -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
//
// UNSUPPORTED: cpu || cuda || hip

#include <sycl/sycl.hpp>
#include <vector>
namespace syclex = sycl::ext::oneapi::experimental;

class TestKernel;

int main() {
sycl::queue Q;

auto SGSizes = Q.get_device().get_info<sycl::info::device::sub_group_sizes>();
if (std::find(SGSizes.begin(), SGSizes.end(), 32) == SGSizes.end()) {
std::cout << "Test skipped due to missing support for sub-group size 32."
<< std::endl;
return 0;
}

sycl::buffer<bool, 1> MatchBuf{sycl::range{32}};
sycl::buffer<bool, 1> LeaderBuf{sycl::range{32}};

const auto NDR = sycl::nd_range<1>{32, 32};
Q.submit([&](sycl::handler &CGH) {
sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only};
sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only};
const auto KernelFunc =
[=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] {
auto WI = item.get_global_id();
auto SG = item.get_sub_group();

// Split into odd and even work-items via control flow
// Branches deliberately duplicated to test impact of optimizations
// This only reliably works with optimizations disabled right now
if (item.get_global_id() % 2 == 0) {
auto TangleGroup = syclex::get_tangle_group(SG);

bool Match = true;
Match &= (TangleGroup.get_group_id() == 0);
Match &= (TangleGroup.get_local_id() == SG.get_local_id() / 2);
Match &= (TangleGroup.get_group_range() == 1);
Match &= (TangleGroup.get_local_range() == 16);
MatchAcc[WI] = Match;
LeaderAcc[WI] = TangleGroup.leader();
} else {
auto TangleGroup = syclex::get_tangle_group(SG);

bool Match = true;
Match &= (TangleGroup.get_group_id() == 0);
Match &= (TangleGroup.get_local_id() == SG.get_local_id() / 2);
Match &= (TangleGroup.get_group_range() == 1);
Match &= (TangleGroup.get_local_range() == 16);
MatchAcc[WI] = Match;
LeaderAcc[WI] = TangleGroup.leader();
}
};
CGH.parallel_for<TestKernel>(NDR, KernelFunc);
});

sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only};
sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only};
for (int WI = 0; WI < 32; ++WI) {
assert(MatchAcc[WI] == true);
assert(LeaderAcc[WI] == (WI < 2));
}
return 0;
}