Skip to content
This repository has been archived by the owner on Mar 28, 2023. It is now read-only.

[SYCL] Add tests for reductions skipping reducer combines #1697

Open
wants to merge 2 commits into
base: intel
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions SYCL/Reduction/reduction_big_data.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,9 @@ int test(queue &Q, T Identity) {

// Initialize.
BinaryOperation BOp;
T CorrectOut;
initInputData(InBuf, CorrectOut, BOp, NWorkItems);
std::optional<T> CorrectOutOpt;
initInputData(InBuf, CorrectOutOpt, BOp, NWorkItems);
T CorrectOut = *CorrectOutOpt;

// Compute.
Q.submit([&](handler &CGH) {
Expand Down
4 changes: 3 additions & 1 deletion SYCL/Reduction/reduction_nd_N_vars.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,9 @@ struct Red {
}

void init() {
initInputData(InBuf, CorrectOut, BOp, NWorkItems);
std::optional<T> CorrectOutOpt;
initInputData(InBuf, CorrectOutOpt, BOp, NWorkItems);
CorrectOut = *CorrectOutOpt;
if (!PropList.template has_property<
property::reduction::initialize_to_identity>())
CorrectOut = BOp(CorrectOut, InitVal);
Expand Down
66 changes: 66 additions & 0 deletions SYCL/Reduction/reduction_nd_reducer_skip.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// RUN: %ACC_RUN_PLACEHOLDER %t.out
//
// Group algorithms are not supported on Nvidia.
// XFAIL: hip_nvidia

// This test performs basic checks of parallel_for(nd_range, reduction, func)
// with reductions initialized with a one element buffer. Additionally, some
// reducers will not be written to.

#include "reduction_utils.hpp"

using namespace sycl;

int NumErrors = 0;

template <typename T> class SkipEvenName;
template <typename T> class SkipOddName;
template <typename T> class SkipAllName;

template <typename Name, typename T, class BinaryOperation>
void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, size_t WGSize,
size_t NWItems) {
nd_range<1> NDRange(range<1>{NWItems}, range<1>{WGSize});
NumErrors += test<SkipEvenName<Name>, T>(Q, Identity, Init, BOp, NDRange,
property_list{}, SkipEvenOp{});
NumErrors += test<SkipOddName<Name>, T>(Q, Identity, Init, BOp, NDRange,
property_list{}, SkipOddOp{});
NumErrors += test<SkipAllName<Name>, T>(Q, Identity, Init, BOp, NDRange,
property_list{}, SkipAllOp{});
}

int main() {
queue Q;
printDeviceInfo(Q);

// Check some non power-of-two work-group sizes.
tests<class A1, int>(Q, 0, 99, std::plus<int>{}, 1, 7);
tests<class A2, int>(Q, 0, 99, std::plus<int>{}, 49, 49 * 5);

// Try some power-of-two work-group sizes.
tests<class B1, int>(Q, 0, 99, std::plus<>{}, 1, 32);
tests<class B2, int>(Q, 1, 99, std::multiplies<>{}, 4, 32);
tests<class B3, int>(Q, 0, 99, std::bit_or<>{}, 8, 128);
tests<class B4, int>(Q, 0, 99, std::bit_xor<>{}, 16, 256);
tests<class B5, int>(Q, ~0, 99, std::bit_and<>{}, 32, 256);
tests<class B6, int>(Q, (std::numeric_limits<int>::max)(), -99,
ext::oneapi::minimum<>{}, 64, 256);
tests<class B7, int>(Q, (std::numeric_limits<int>::min)(), 99,
ext::oneapi::maximum<>{}, 128, 256);
tests<class B8, int>(Q, 0, 99, std::plus<>{}, 256, 256);

// Check with various types.
tests<class C1, float>(Q, 1, 99, std::multiplies<>{}, 8, 24);
tests<class C2, short>(Q, 0x7fff, -99, ext::oneapi::minimum<>{}, 8, 256);
tests<class C3, unsigned char>(Q, 0, 99, ext::oneapi::maximum<>{}, 8, 256);

// Check with CUSTOM type.
using CV = CustomVec<long long>;
tests<class D1, CV>(Q, CV(0), CV(99), CustomVecPlus<long long>{}, 8, 256);

printFinalStatus(NumErrors);
return NumErrors;
}
65 changes: 65 additions & 0 deletions SYCL/Reduction/reduction_range_1d_reducer_skip.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_60
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// RUN: %ACC_RUN_PLACEHOLDER %t.out

// This test performs basic checks of parallel_for(range<1>, reduction, func)
// with reductions initialized with a one element buffer. Additionally, some
// reducers will not be written to.

#include "reduction_utils.hpp"

using namespace sycl;

int NumErrors = 0;

template <typename T> class SkipEvenName;
template <typename T> class SkipOddName;
template <typename T> class SkipAllName;

template <typename Name, typename T, typename... ArgTys>
void tests(ArgTys &&...Args) {
NumErrors += test<SkipEvenName<Name>, T>(std::forward<ArgTys>(Args)...,
property_list{}, SkipEvenOp{});
NumErrors += test<SkipOddName<Name>, T>(std::forward<ArgTys>(Args)...,
property_list{}, SkipOddOp{});
NumErrors += test<SkipAllName<Name>, T>(std::forward<ArgTys>(Args)...,
property_list{}, SkipAllOp{});
}

int main() {
queue Q;
printDeviceInfo(Q);
size_t MaxWGSize =
Q.get_device().get_info<info::device::max_work_group_size>();

constexpr access::mode RW = access::mode::read_write;
// Fast-reduce and Fast-atomics. Try various range types/sizes.
tests<class A1, int>(Q, 0, 99, std::plus<int>{}, range<1>(1));
tests<class A2, int>(Q, 0, 99, std::plus<>{}, range<1>(2));
tests<class A3, int>(Q, 0, 99, std::plus<>{}, range<1>(7));
tests<class A4, int>(Q, 0, 99, std::plus<>{}, range<1>(64));
tests<class A5, int>(Q, 0, 99, std::plus<>{}, range<1>(MaxWGSize * 2));
tests<class A6, int>(Q, 0, 99, std::plus<>{}, range<1>(MaxWGSize * 2 + 5));

// Check with CUSTOM type.
tests<class B1, CustomVec<long long>>(Q, 0, 99, CustomVecPlus<long long>{},
range<1>(256));
tests<class B2, CustomVec<long long>>(Q, 0, 99, CustomVecPlus<long long>{},
range<1>(MaxWGSize * 3));
tests<class B3, CustomVec<long long>>(Q, 99, CustomVecPlus<long long>{},
range<1>(72));

// Check with identityless operations.
tests<class C1, int>(Q, 99, PlusWithoutIdentity<int>{}, range<1>(1));
tests<class C2, int>(Q, 99, PlusWithoutIdentity<int>{}, range<1>(2));
tests<class C3, int>(Q, 99, PlusWithoutIdentity<int>{}, range<1>(7));
tests<class C4, int>(Q, 99, PlusWithoutIdentity<int>{}, range<1>(64));
tests<class C5, int>(Q, 99, PlusWithoutIdentity<int>{},
range<1>(MaxWGSize * 2));
tests<class C6, int>(Q, 99, PlusWithoutIdentity<int>{},
range<1>(MaxWGSize * 2 + 5));

printFinalStatus(NumErrors);
return NumErrors;
}
69 changes: 69 additions & 0 deletions SYCL/Reduction/reduction_range_2d_dw_reducer_skip.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_60
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out

// TODO: accelerator may not suport atomics required by the current
// implementation. Enable testing when implementation is fixed.
// RUNx: %ACC_RUN_PLACEHOLDER %t.out

// This test performs basic checks of parallel_for(range<2>, reduction, func)
// with reductions initialized with a one element buffer. Additionally, some
// reducers will not be written to.

#include "reduction_utils.hpp"

using namespace sycl;

int NumErrors = 0;

template <typename T> class SkipEvenName;
template <typename T> class SkipOddName;
template <typename T> class SkipAllName;

template <typename Name, typename T, typename... ArgTys>
void tests(ArgTys &&...Args) {
NumErrors += test<SkipEvenName<Name>, T>(std::forward<ArgTys>(Args)...,
property_list{}, SkipEvenOp{});
NumErrors += test<SkipOddName<Name>, T>(std::forward<ArgTys>(Args)...,
property_list{}, SkipOddOp{});
NumErrors += test<SkipAllName<Name>, T>(std::forward<ArgTys>(Args)...,
property_list{}, SkipAllOp{});
}

int main() {
queue Q;
printDeviceInfo(Q);
size_t MaxWGSize =
Q.get_device().get_info<info::device::max_work_group_size>();

tests<class A1, int>(Q, 0, 99, std::plus<>{}, range<2>{1, 1});
tests<class A2, int>(Q, 0, 99, std::plus<>{}, range<2>{2, 2});
tests<class A3, int>(Q, 0, 99, std::plus<>{}, range<2>{2, 3});
tests<class A4, int>(Q, 0, 99, std::plus<>{}, range<2>{MaxWGSize, 1});
tests<class A5, int64_t>(Q, 0, 99, std::plus<>{}, range<2>{1, MaxWGSize});
tests<class A6, int64_t>(Q, 0, 99, std::plus<>{}, range<2>{2, MaxWGSize * 2});
tests<class A7, int64_t>(Q, 0, 99, std::plus<>{}, range<2>{MaxWGSize * 3, 7});
tests<class A8, int64_t>(Q, 0, 99, std::plus<>{}, range<2>{3, MaxWGSize * 3});

tests<class B1, CustomVec<long long>>(Q, 0, 99, CustomVecPlus<long long>{},
range<2>{33, MaxWGSize});
tests<class B2, CustomVec<long long>>(Q, 99, CustomVecPlus<long long>{},
range<2>{33, MaxWGSize});

tests<class C1, int>(Q, 99, PlusWithoutIdentity<int>{}, range<2>{1, 1});
tests<class C2, int>(Q, 99, PlusWithoutIdentity<int>{}, range<2>{2, 2});
tests<class C3, int>(Q, 99, PlusWithoutIdentity<int>{}, range<2>{2, 3});
tests<class C4, int>(Q, 99, PlusWithoutIdentity<int>{},
range<2>{MaxWGSize, 1});
tests<class C5, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
range<2>{1, MaxWGSize});
tests<class C6, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
range<2>{2, MaxWGSize * 2});
tests<class C7, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
range<2>{MaxWGSize * 3, 7});
tests<class C8, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
range<2>{3, MaxWGSize * 3});

printFinalStatus(NumErrors);
return NumErrors;
}
82 changes: 82 additions & 0 deletions SYCL/Reduction/reduction_range_3d_rw_reducer_skip.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_60
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out

// TODO: accelerator may not suport atomics required by the current
// implementation. Enable testing when implementation is fixed.
// RUNx: %ACC_RUN_PLACEHOLDER %t.out

// This test performs basic checks of parallel_for(range<3>, reduction, func)
// with reductions initialized with a one element buffer. Additionally, some
// reducers will not be written to.

#include "reduction_utils.hpp"

using namespace sycl;

int NumErrors = 0;

template <typename T> class SkipEvenName;
template <typename T> class SkipOddName;
template <typename T> class SkipAllName;

template <typename Name, typename T, typename... ArgTys>
void tests(ArgTys &&...Args) {
NumErrors += test<SkipEvenName<Name>, T>(std::forward<ArgTys>(Args)...,
property_list{}, SkipEvenOp{});
NumErrors += test<SkipOddName<Name>, T>(std::forward<ArgTys>(Args)...,
property_list{}, SkipOddOp{});
NumErrors += test<SkipAllName<Name>, T>(std::forward<ArgTys>(Args)...,
property_list{}, SkipAllOp{});
}

int main() {
queue Q;
printDeviceInfo(Q);
size_t MaxWGSize =
Q.get_device().get_info<info::device::max_work_group_size>();

tests<class A1, int>(Q, 0, 99, std::plus<>{}, range<3>{1, 1, 1});
tests<class A2, int>(Q, 0, 99, std::plus<>{}, range<3>{2, 2, 2});
tests<class A3, int>(Q, 0, 99, std::plus<>{}, range<3>{2, 3, 4});

tests<class A4, int64_t>(Q, 0, 99, std::plus<>{},
range<3>{1, 1, MaxWGSize + 1});
tests<class A5, int64_t>(Q, 0, 99, std::plus<>{},
range<3>{1, MaxWGSize + 1, 1});
tests<class A6, int64_t>(Q, 0, 99, std::plus<>{},
range<3>{MaxWGSize + 1, 1, 1});

tests<class A7, int64_t>(Q, 0, 99, std::plus<>{},
range<3>{2, 5, MaxWGSize * 2});
tests<class A8, int64_t>(Q, 0, 99, std::plus<>{},
range<3>{3, MaxWGSize * 3, 2});
tests<class A9, int64_t>(Q, 0, 99, std::plus<>{},
range<3>{MaxWGSize * 3, 8, 4});

tests<class B1, CustomVec<long long>>(Q, 0, 99, CustomVecPlus<long long>{},
range<3>{2, 33, MaxWGSize});
tests<class B2, CustomVec<long long>>(Q, 99, CustomVecPlus<long long>{},
range<3>{2, 33, MaxWGSize});

tests<class C1, int>(Q, 99, PlusWithoutIdentity<int>{}, range<3>{1, 1, 1});
tests<class C2, int>(Q, 99, PlusWithoutIdentity<int>{}, range<3>{2, 2, 2});
tests<class C3, int>(Q, 99, PlusWithoutIdentity<int>{}, range<3>{2, 3, 4});

tests<class C4, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
range<3>{1, 1, MaxWGSize + 1});
tests<class C5, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
range<3>{1, MaxWGSize + 1, 1});
tests<class C6, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
range<3>{MaxWGSize + 1, 1, 1});

tests<class C7, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
range<3>{2, 5, MaxWGSize * 2});
tests<class C8, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
range<3>{3, MaxWGSize * 3, 2});
tests<class C9, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
range<3>{MaxWGSize * 3, 8, 4});

printFinalStatus(NumErrors);
return NumErrors;
}
4 changes: 3 additions & 1 deletion SYCL/Reduction/reduction_range_N_vars.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,9 @@ struct Red {
}

void init() {
initInputData(InBuf, CorrectOut, BOp, NWorkItems);
std::optional<T> CorrectOutOpt;
initInputData(InBuf, CorrectOutOpt, BOp, NWorkItems);
CorrectOut = *CorrectOutOpt;
if (!PropList.template has_property<
property::reduction::initialize_to_identity>())
CorrectOut = BOp(CorrectOut, InitVal);
Expand Down
6 changes: 3 additions & 3 deletions SYCL/Reduction/reduction_usm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,12 @@ int test(queue &Q, OptionalIdentity<T, HasIdentity> Identity, T Init,
}

// Initialize.
T CorrectOut;
std::optional<T> CorrectOutOpt;
BinaryOperation BOp;

buffer<T, 1> InBuf(NWItems);
initInputData(InBuf, CorrectOut, BOp, NWItems);
CorrectOut = BOp(CorrectOut, Init);
initInputData(InBuf, CorrectOutOpt, BOp, NWItems);
T CorrectOut = BOp(*CorrectOutOpt, Init);

// Compute.
Q.submit([&](handler &CGH) {
Expand Down
5 changes: 3 additions & 2 deletions SYCL/Reduction/reduction_usm_dw.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,12 @@ int test(queue &Q, T Identity, T Init, size_t WGSize, size_t NWItems,
}

// Initialize.
T CorrectOut;
std::optional<T> CorrectOutOpt;
BinaryOperation BOp;

buffer<T, 1> InBuf(NWItems);
initInputData(InBuf, CorrectOut, BOp, NWItems);
initInputData(InBuf, CorrectOutOpt, BOp, NWItems);
T CorrectOut = *CorrectOutOpt;

// Compute.
Q.submit([&](handler &CGH) {
Expand Down
Loading