intel · steffenlarsen · Mar 27, 2023 · Mar 27, 2023
@@ -49,8 +49,9 @@ int test(queue &Q, T Identity) {
 
   // Initialize.
   BinaryOperation BOp;
-  T CorrectOut;
-  initInputData(InBuf, CorrectOut, BOp, NWorkItems);
+  std::optional<T> CorrectOutOpt;
+  initInputData(InBuf, CorrectOutOpt, BOp, NWorkItems);
+  T CorrectOut = *CorrectOutOpt;
 
   // Compute.
   Q.submit([&](handler &CGH) {

@@ -48,7 +48,9 @@ struct Red {
   }
 
   void init() {
-    initInputData(InBuf, CorrectOut, BOp, NWorkItems);
+    std::optional<T> CorrectOutOpt;
+    initInputData(InBuf, CorrectOutOpt, BOp, NWorkItems);
+    CorrectOut = *CorrectOutOpt;
     if (!PropList.template has_property<
             property::reduction::initialize_to_identity>())
       CorrectOut = BOp(CorrectOut, InitVal);

@@ -0,0 +1,66 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+//
+// Group algorithms are not supported on Nvidia.
+// XFAIL: hip_nvidia
+
+// This test performs basic checks of parallel_for(nd_range, reduction, func)
+// with reductions initialized with a one element buffer. Additionally, some
+// reducers will not be written to.
+
+#include "reduction_utils.hpp"
+
+using namespace sycl;
+
+int NumErrors = 0;
+
+template <typename T> class SkipEvenName;
+template <typename T> class SkipOddName;
+template <typename T> class SkipAllName;
+
+template <typename Name, typename T, class BinaryOperation>
+void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, size_t WGSize,
+           size_t NWItems) {
+  nd_range<1> NDRange(range<1>{NWItems}, range<1>{WGSize});
+  NumErrors += test<SkipEvenName<Name>, T>(Q, Identity, Init, BOp, NDRange,
+                                           property_list{}, SkipEvenOp{});
+  NumErrors += test<SkipOddName<Name>, T>(Q, Identity, Init, BOp, NDRange,
+                                          property_list{}, SkipOddOp{});
+  NumErrors += test<SkipAllName<Name>, T>(Q, Identity, Init, BOp, NDRange,
+                                          property_list{}, SkipAllOp{});
+}
+
+int main() {
+  queue Q;
+  printDeviceInfo(Q);
+
+  // Check some non power-of-two work-group sizes.
+  tests<class A1, int>(Q, 0, 99, std::plus<int>{}, 1, 7);
+  tests<class A2, int>(Q, 0, 99, std::plus<int>{}, 49, 49 * 5);
+
+  // Try some power-of-two work-group sizes.
+  tests<class B1, int>(Q, 0, 99, std::plus<>{}, 1, 32);
+  tests<class B2, int>(Q, 1, 99, std::multiplies<>{}, 4, 32);
+  tests<class B3, int>(Q, 0, 99, std::bit_or<>{}, 8, 128);
+  tests<class B4, int>(Q, 0, 99, std::bit_xor<>{}, 16, 256);
+  tests<class B5, int>(Q, ~0, 99, std::bit_and<>{}, 32, 256);
+  tests<class B6, int>(Q, (std::numeric_limits<int>::max)(), -99,
+                       ext::oneapi::minimum<>{}, 64, 256);
+  tests<class B7, int>(Q, (std::numeric_limits<int>::min)(), 99,
+                       ext::oneapi::maximum<>{}, 128, 256);
+  tests<class B8, int>(Q, 0, 99, std::plus<>{}, 256, 256);
+
+  // Check with various types.
+  tests<class C1, float>(Q, 1, 99, std::multiplies<>{}, 8, 24);
+  tests<class C2, short>(Q, 0x7fff, -99, ext::oneapi::minimum<>{}, 8, 256);
+  tests<class C3, unsigned char>(Q, 0, 99, ext::oneapi::maximum<>{}, 8, 256);
+
+  // Check with CUSTOM type.
+  using CV = CustomVec<long long>;
+  tests<class D1, CV>(Q, CV(0), CV(99), CustomVecPlus<long long>{}, 8, 256);
+
+  printFinalStatus(NumErrors);
+  return NumErrors;
+}
@@ -0,0 +1,65 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_60
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+// This test performs basic checks of parallel_for(range<1>, reduction, func)
+// with reductions initialized with a one element buffer. Additionally, some
+// reducers will not be written to.
+
+#include "reduction_utils.hpp"
+
+using namespace sycl;
+
+int NumErrors = 0;
+
+template <typename T> class SkipEvenName;
+template <typename T> class SkipOddName;
+template <typename T> class SkipAllName;
+
+template <typename Name, typename T, typename... ArgTys>
+void tests(ArgTys &&...Args) {
+  NumErrors += test<SkipEvenName<Name>, T>(std::forward<ArgTys>(Args)...,
+                                           property_list{}, SkipEvenOp{});
+  NumErrors += test<SkipOddName<Name>, T>(std::forward<ArgTys>(Args)...,
+                                          property_list{}, SkipOddOp{});
+  NumErrors += test<SkipAllName<Name>, T>(std::forward<ArgTys>(Args)...,
+                                          property_list{}, SkipAllOp{});
+}
+
+int main() {
+  queue Q;
+  printDeviceInfo(Q);
+  size_t MaxWGSize =
+      Q.get_device().get_info<info::device::max_work_group_size>();
+
+  constexpr access::mode RW = access::mode::read_write;
+  // Fast-reduce and Fast-atomics. Try various range types/sizes.
+  tests<class A1, int>(Q, 0, 99, std::plus<int>{}, range<1>(1));
+  tests<class A2, int>(Q, 0, 99, std::plus<>{}, range<1>(2));
+  tests<class A3, int>(Q, 0, 99, std::plus<>{}, range<1>(7));
+  tests<class A4, int>(Q, 0, 99, std::plus<>{}, range<1>(64));
+  tests<class A5, int>(Q, 0, 99, std::plus<>{}, range<1>(MaxWGSize * 2));
+  tests<class A6, int>(Q, 0, 99, std::plus<>{}, range<1>(MaxWGSize * 2 + 5));
+
+  // Check with CUSTOM type.
+  tests<class B1, CustomVec<long long>>(Q, 0, 99, CustomVecPlus<long long>{},
+                                        range<1>(256));
+  tests<class B2, CustomVec<long long>>(Q, 0, 99, CustomVecPlus<long long>{},
+                                        range<1>(MaxWGSize * 3));
+  tests<class B3, CustomVec<long long>>(Q, 99, CustomVecPlus<long long>{},
+                                        range<1>(72));
+
+  // Check with identityless operations.
+  tests<class C1, int>(Q, 99, PlusWithoutIdentity<int>{}, range<1>(1));
+  tests<class C2, int>(Q, 99, PlusWithoutIdentity<int>{}, range<1>(2));
+  tests<class C3, int>(Q, 99, PlusWithoutIdentity<int>{}, range<1>(7));
+  tests<class C4, int>(Q, 99, PlusWithoutIdentity<int>{}, range<1>(64));
+  tests<class C5, int>(Q, 99, PlusWithoutIdentity<int>{},
+                       range<1>(MaxWGSize * 2));
+  tests<class C6, int>(Q, 99, PlusWithoutIdentity<int>{},
+                       range<1>(MaxWGSize * 2 + 5));
+
+  printFinalStatus(NumErrors);
+  return NumErrors;
+}
@@ -0,0 +1,69 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_60
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+// TODO: accelerator may not suport atomics required by the current
+// implementation. Enable testing when implementation is fixed.
+// RUNx: %ACC_RUN_PLACEHOLDER %t.out
+
+// This test performs basic checks of parallel_for(range<2>, reduction, func)
+// with reductions initialized with a one element buffer. Additionally, some
+// reducers will not be written to.
+
+#include "reduction_utils.hpp"
+
+using namespace sycl;
+
+int NumErrors = 0;
+
+template <typename T> class SkipEvenName;
+template <typename T> class SkipOddName;
+template <typename T> class SkipAllName;
+
+template <typename Name, typename T, typename... ArgTys>
+void tests(ArgTys &&...Args) {
+  NumErrors += test<SkipEvenName<Name>, T>(std::forward<ArgTys>(Args)...,
+                                           property_list{}, SkipEvenOp{});
+  NumErrors += test<SkipOddName<Name>, T>(std::forward<ArgTys>(Args)...,
+                                          property_list{}, SkipOddOp{});
+  NumErrors += test<SkipAllName<Name>, T>(std::forward<ArgTys>(Args)...,
+                                          property_list{}, SkipAllOp{});
+}
+
+int main() {
+  queue Q;
+  printDeviceInfo(Q);
+  size_t MaxWGSize =
+      Q.get_device().get_info<info::device::max_work_group_size>();
+
+  tests<class A1, int>(Q, 0, 99, std::plus<>{}, range<2>{1, 1});
+  tests<class A2, int>(Q, 0, 99, std::plus<>{}, range<2>{2, 2});
+  tests<class A3, int>(Q, 0, 99, std::plus<>{}, range<2>{2, 3});
+  tests<class A4, int>(Q, 0, 99, std::plus<>{}, range<2>{MaxWGSize, 1});
+  tests<class A5, int64_t>(Q, 0, 99, std::plus<>{}, range<2>{1, MaxWGSize});
+  tests<class A6, int64_t>(Q, 0, 99, std::plus<>{}, range<2>{2, MaxWGSize * 2});
+  tests<class A7, int64_t>(Q, 0, 99, std::plus<>{}, range<2>{MaxWGSize * 3, 7});
+  tests<class A8, int64_t>(Q, 0, 99, std::plus<>{}, range<2>{3, MaxWGSize * 3});
+
+  tests<class B1, CustomVec<long long>>(Q, 0, 99, CustomVecPlus<long long>{},
+                                        range<2>{33, MaxWGSize});
+  tests<class B2, CustomVec<long long>>(Q, 99, CustomVecPlus<long long>{},
+                                        range<2>{33, MaxWGSize});
+
+  tests<class C1, int>(Q, 99, PlusWithoutIdentity<int>{}, range<2>{1, 1});
+  tests<class C2, int>(Q, 99, PlusWithoutIdentity<int>{}, range<2>{2, 2});
+  tests<class C3, int>(Q, 99, PlusWithoutIdentity<int>{}, range<2>{2, 3});
+  tests<class C4, int>(Q, 99, PlusWithoutIdentity<int>{},
+                       range<2>{MaxWGSize, 1});
+  tests<class C5, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
+                           range<2>{1, MaxWGSize});
+  tests<class C6, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
+                           range<2>{2, MaxWGSize * 2});
+  tests<class C7, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
+                           range<2>{MaxWGSize * 3, 7});
+  tests<class C8, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
+                           range<2>{3, MaxWGSize * 3});
+
+  printFinalStatus(NumErrors);
+  return NumErrors;
+}
@@ -0,0 +1,82 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_60
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+// TODO: accelerator may not suport atomics required by the current
+// implementation. Enable testing when implementation is fixed.
+// RUNx: %ACC_RUN_PLACEHOLDER %t.out
+
+// This test performs basic checks of parallel_for(range<3>, reduction, func)
+// with reductions initialized with a one element buffer. Additionally, some
+// reducers will not be written to.
+
+#include "reduction_utils.hpp"
+
+using namespace sycl;
+
+int NumErrors = 0;
+
+template <typename T> class SkipEvenName;
+template <typename T> class SkipOddName;
+template <typename T> class SkipAllName;
+
+template <typename Name, typename T, typename... ArgTys>
+void tests(ArgTys &&...Args) {
+  NumErrors += test<SkipEvenName<Name>, T>(std::forward<ArgTys>(Args)...,
+                                           property_list{}, SkipEvenOp{});
+  NumErrors += test<SkipOddName<Name>, T>(std::forward<ArgTys>(Args)...,
+                                          property_list{}, SkipOddOp{});
+  NumErrors += test<SkipAllName<Name>, T>(std::forward<ArgTys>(Args)...,
+                                          property_list{}, SkipAllOp{});
+}
+
+int main() {
+  queue Q;
+  printDeviceInfo(Q);
+  size_t MaxWGSize =
+      Q.get_device().get_info<info::device::max_work_group_size>();
+
+  tests<class A1, int>(Q, 0, 99, std::plus<>{}, range<3>{1, 1, 1});
+  tests<class A2, int>(Q, 0, 99, std::plus<>{}, range<3>{2, 2, 2});
+  tests<class A3, int>(Q, 0, 99, std::plus<>{}, range<3>{2, 3, 4});
+
+  tests<class A4, int64_t>(Q, 0, 99, std::plus<>{},
+                           range<3>{1, 1, MaxWGSize + 1});
+  tests<class A5, int64_t>(Q, 0, 99, std::plus<>{},
+                           range<3>{1, MaxWGSize + 1, 1});
+  tests<class A6, int64_t>(Q, 0, 99, std::plus<>{},
+                           range<3>{MaxWGSize + 1, 1, 1});
+
+  tests<class A7, int64_t>(Q, 0, 99, std::plus<>{},
+                           range<3>{2, 5, MaxWGSize * 2});
+  tests<class A8, int64_t>(Q, 0, 99, std::plus<>{},
+                           range<3>{3, MaxWGSize * 3, 2});
+  tests<class A9, int64_t>(Q, 0, 99, std::plus<>{},
+                           range<3>{MaxWGSize * 3, 8, 4});
+
+  tests<class B1, CustomVec<long long>>(Q, 0, 99, CustomVecPlus<long long>{},
+                                        range<3>{2, 33, MaxWGSize});
+  tests<class B2, CustomVec<long long>>(Q, 99, CustomVecPlus<long long>{},
+                                        range<3>{2, 33, MaxWGSize});
+
+  tests<class C1, int>(Q, 99, PlusWithoutIdentity<int>{}, range<3>{1, 1, 1});
+  tests<class C2, int>(Q, 99, PlusWithoutIdentity<int>{}, range<3>{2, 2, 2});
+  tests<class C3, int>(Q, 99, PlusWithoutIdentity<int>{}, range<3>{2, 3, 4});
+
+  tests<class C4, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
+                           range<3>{1, 1, MaxWGSize + 1});
+  tests<class C5, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
+                           range<3>{1, MaxWGSize + 1, 1});
+  tests<class C6, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
+                           range<3>{MaxWGSize + 1, 1, 1});
+
+  tests<class C7, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
+                           range<3>{2, 5, MaxWGSize * 2});
+  tests<class C8, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
+                           range<3>{3, MaxWGSize * 3, 2});
+  tests<class C9, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
+                           range<3>{MaxWGSize * 3, 8, 4});
+
+  printFinalStatus(NumErrors);
+  return NumErrors;
+}
@@ -48,7 +48,9 @@ struct Red {
   }
 
   void init() {
-    initInputData(InBuf, CorrectOut, BOp, NWorkItems);
+    std::optional<T> CorrectOutOpt;
+    initInputData(InBuf, CorrectOutOpt, BOp, NWorkItems);
+    CorrectOut = *CorrectOutOpt;
     if (!PropList.template has_property<
             property::reduction::initialize_to_identity>())
       CorrectOut = BOp(CorrectOut, InitVal);

@@ -38,12 +38,12 @@ int test(queue &Q, OptionalIdentity<T, HasIdentity> Identity, T Init,
   }
 
   // Initialize.
-  T CorrectOut;
+  std::optional<T> CorrectOutOpt;
   BinaryOperation BOp;
 
   buffer<T, 1> InBuf(NWItems);
-  initInputData(InBuf, CorrectOut, BOp, NWItems);
-  CorrectOut = BOp(CorrectOut, Init);
+  initInputData(InBuf, CorrectOutOpt, BOp, NWItems);
+  T CorrectOut = BOp(*CorrectOutOpt, Init);
 
   // Compute.
   Q.submit([&](handler &CGH) {

@@ -39,11 +39,12 @@ int test(queue &Q, T Identity, T Init, size_t WGSize, size_t NWItems,
   }
 
   // Initialize.
-  T CorrectOut;
+  std::optional<T> CorrectOutOpt;
   BinaryOperation BOp;
 
   buffer<T, 1> InBuf(NWItems);
-  initInputData(InBuf, CorrectOut, BOp, NWItems);
+  initInputData(InBuf, CorrectOutOpt, BOp, NWItems);
+  T CorrectOut = *CorrectOutOpt;
 
   // Compute.
   Q.submit([&](handler &CGH) {