intel · v-klochkov · Apr 28, 2020 · Apr 24, 2020 · Apr 24, 2020 · Apr 25, 2020
@@ -24,11 +24,7 @@ namespace sycl {
 #else
 namespace __sycl_std = __host_std;
 #endif
-} // namespace sycl
-} // __SYCL_INLINE_NAMESPACE(cl)
 
-__SYCL_INLINE_NAMESPACE(cl) {
-namespace sycl {
 /* ----------------- 4.13.3 Math functions. ---------------------------------*/
 // genfloat acos (genfloat x)
 template <typename T>
@@ -731,7 +727,8 @@ detail::enable_if_t<detail::is_geninteger<T>::value, T> clz(T x) __NOEXC {
 namespace intel {
 // geninteger ctz (geninteger x)
 template <typename T>
-detail::enable_if_t<detail::is_geninteger<T>::value, T> ctz(T x) __NOEXC {
+sycl::detail::enable_if_t<sycl::detail::is_geninteger<T>::value, T>
+ctz(T x) __NOEXC {
   return __sycl_std::__invoke_ctz<T>(x);
 }
 } // namespace intel

@@ -46,7 +46,7 @@ class interop_handler {
 
 public:
   using QueueImplPtr = std::shared_ptr<detail::queue_impl>;
-  using ReqToMem = std::pair<detail::Requirement*, pi_mem>;
+  using ReqToMem = std::pair<detail::Requirement *, pi_mem>;
 
   interop_handler(std::vector<ReqToMem> MemObjs, QueueImplPtr Queue)
       : MQueue(std::move(Queue)), MMemObjs(std::move(MemObjs)) {}

@@ -81,7 +81,7 @@ device_func_ptr_holder_t get_device_func_ptr(FuncType F, const char *FuncName,
         PI_INVALID_OPERATION);
   }
 
-  return detail::getDeviceFunctionPointerImpl(D, P, FuncName);
+  return sycl::detail::getDeviceFunctionPointerImpl(D, P, FuncName);
 }
 } // namespace intel
 } // namespace sycl

@@ -342,13 +342,39 @@ class reduction_impl {
            "Only scalar/1-element reductions are supported now.");
   }
 
+  accessor<T, buffer_dim, access::mode::discard_read_write,
+           access::target::local>
+  getReadWriteLocalAcc(size_t Size, handler &CGH) {
+    return accessor<T, buffer_dim, access::mode::discard_read_write,
+                    access::target::local>(Size, CGH);
+  }
+
+  accessor<T, buffer_dim, access::mode::read>
+  getReadAccToPreviousPartialReds(handler &CGH) const {
+    CGH.addReduction(MOutBufPtr);
+    return accessor<T, buffer_dim, access::mode::read>(*MOutBufPtr, CGH);
+  }
+
+  accessor_type getWriteAccForPartialReds(size_t Size, size_t RunNumber,
+                                          handler &CGH) {
+    if (Size == 1) {
+      if (RunNumber > 0)
+        CGH.associateWithHandler(this->MAcc);
+      return this->MAcc;
+    }
+    // Create a new output buffer and return an accessor to it.
+    MOutBufPtr = std::make_shared<buffer<T, buffer_dim>>(range<1>(Size));
+    CGH.addReduction(MOutBufPtr);
+    return accessor_type(*MOutBufPtr, CGH);
+  }
   /// User's accessor to where the reduction must be written.
   accessor_type MAcc;
 
 private:
   /// Identity of the BinaryOperation.
   /// The result of BinaryOperation(X, MIdentity) is equal to X for any X.
   const T MIdentity;
+  shared_ptr_class<buffer<T, buffer_dim>> MOutBufPtr;
 };
 
 } // namespace detail

@@ -351,6 +351,11 @@ class queue_impl {
   /// \return a native handle.
   pi_native_handle getNative() const;
 
+  /// Stores an event that should be associated with the queue
+  ///
+  /// \param Event is the event to be stored
+  void addEvent(event Event);
+
 private:
   /// Performs command group submission to the queue.
   ///
@@ -362,8 +367,9 @@ class queue_impl {
                     shared_ptr_class<queue_impl> Self,
                     const detail::code_location &Loc) {
     handler Handler(std::move(Self), MHostQueue);
+    Handler.saveCodeLoc(Loc);
     CGF(Handler);
-    event Event = Handler.finalize(Loc);
+    event Event = Handler.finalize();
     addEvent(Event);
     return Event;
   }
@@ -377,11 +383,6 @@ class queue_impl {
   void instrumentationEpilog(void *TelementryEvent, string_class &Name,
                              int32_t StreamID, uint64_t IId);
 
-  /// Stores an event that should be associated with the queue
-  ///
-  /// \param Event is the event to be stored
-  void addEvent(event Event);
-
   /// Stores a USM operation event that should be associated with the queue
   ///
   /// \param Event is the event to be stored

@@ -13,12 +13,24 @@
 #include <CL/sycl/handler.hpp>
 #include <CL/sycl/info/info_desc.hpp>
 #include <detail/kernel_impl.hpp>
+#include <detail/queue_impl.hpp>
 #include <detail/scheduler/scheduler.hpp>
 
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
-event handler::finalize(const cl::sycl::detail::code_location &Payload) {
-  sycl::event EventRet;
+
+void handler::addEventToQueue(shared_ptr_class<detail::queue_impl> Queue,
+                              cl::sycl::event Event) {
+  Queue->addEvent(std::move(Event));
+}
+
+event handler::finalize() {
+  // This block of code is needed only for reduction implementation.
+  // It is harmless (does nothing) for everything else.
+  if (MIsFinalized)
+    return MLastEvent;
+  MIsFinalized = true;
+
   unique_ptr_class<detail::CG> CommandGroup;
   switch (MCGType) {
   case detail::CG::KERNEL:
@@ -29,52 +41,52 @@ event handler::finalize(const cl::sycl::detail::code_location &Payload) {
         std::move(MSharedPtrStorage), std::move(MRequirements),
         std::move(MEvents), std::move(MArgs), std::move(MKernelName),
         std::move(MOSModuleHandle), std::move(MStreamStorage), MCGType,
-        Payload));
+        MCodeLoc));
     break;
   }
   case detail::CG::INTEROP_TASK_CODEPLAY:
     CommandGroup.reset(new detail::CGInteropTask(
         std::move(MInteropTask), std::move(MArgsStorage),
         std::move(MAccStorage), std::move(MSharedPtrStorage),
-        std::move(MRequirements), std::move(MEvents), MCGType, Payload));
+        std::move(MRequirements), std::move(MEvents), MCGType, MCodeLoc));
     break;
   case detail::CG::COPY_ACC_TO_PTR:
   case detail::CG::COPY_PTR_TO_ACC:
   case detail::CG::COPY_ACC_TO_ACC:
     CommandGroup.reset(new detail::CGCopy(
         MCGType, MSrcPtr, MDstPtr, std::move(MArgsStorage),
         std::move(MAccStorage), std::move(MSharedPtrStorage),
-        std::move(MRequirements), std::move(MEvents), Payload));
+        std::move(MRequirements), std::move(MEvents), MCodeLoc));
     break;
   case detail::CG::FILL:
     CommandGroup.reset(new detail::CGFill(
         std::move(MPattern), MDstPtr, std::move(MArgsStorage),
         std::move(MAccStorage), std::move(MSharedPtrStorage),
-        std::move(MRequirements), std::move(MEvents), Payload));
+        std::move(MRequirements), std::move(MEvents), MCodeLoc));
     break;
   case detail::CG::UPDATE_HOST:
     CommandGroup.reset(new detail::CGUpdateHost(
         MDstPtr, std::move(MArgsStorage), std::move(MAccStorage),
         std::move(MSharedPtrStorage), std::move(MRequirements),
-        std::move(MEvents), Payload));
+        std::move(MEvents), MCodeLoc));
     break;
   case detail::CG::COPY_USM:
     CommandGroup.reset(new detail::CGCopyUSM(
         MSrcPtr, MDstPtr, MLength, std::move(MArgsStorage),
         std::move(MAccStorage), std::move(MSharedPtrStorage),
-        std::move(MRequirements), std::move(MEvents), Payload));
+        std::move(MRequirements), std::move(MEvents), MCodeLoc));
     break;
   case detail::CG::FILL_USM:
     CommandGroup.reset(new detail::CGFillUSM(
         std::move(MPattern), MDstPtr, MLength, std::move(MArgsStorage),
         std::move(MAccStorage), std::move(MSharedPtrStorage),
-        std::move(MRequirements), std::move(MEvents), Payload));
+        std::move(MRequirements), std::move(MEvents), MCodeLoc));
     break;
   case detail::CG::PREFETCH_USM:
     CommandGroup.reset(new detail::CGPrefetchUSM(
         MDstPtr, MLength, std::move(MArgsStorage), std::move(MAccStorage),
         std::move(MSharedPtrStorage), std::move(MRequirements),
-        std::move(MEvents), Payload));
+        std::move(MEvents), MCodeLoc));
     break;
   case detail::CG::NONE:
     throw runtime_error("Command group submitted without a kernel or a "
@@ -88,8 +100,8 @@ event handler::finalize(const cl::sycl::detail::code_location &Payload) {
   detail::EventImplPtr Event = detail::Scheduler::getInstance().addCG(
       std::move(CommandGroup), std::move(MQueue));
 
-  EventRet = detail::createSyclObjFromImpl<event>(Event);
-  return EventRet;
+  MLastEvent = detail::createSyclObjFromImpl<event>(Event);
+  return MLastEvent;
 }
 
 void handler::processArg(void *Ptr, const detail::kernel_param_kind_t &Kind,

@@ -3232,7 +3232,8 @@ _ZN2cl4sycl7handler10processArgEPvRKNS0_6detail19kernel_param_kind_tEimRmb
 _ZN2cl4sycl7handler13getKernelNameB5cxx11Ev
 _ZN2cl4sycl7handler18extractArgsAndReqsEv
 _ZN2cl4sycl7handler28extractArgsAndReqsFromLambdaEPcmPKNS0_6detail19kernel_param_desc_tE
-_ZN2cl4sycl7handler8finalizeERKNS0_6detail13code_locationE
+_ZN2cl4sycl7handler15addEventToQueueESt10shared_ptrINS0_6detail10queue_implEENS0_5eventE
+_ZN2cl4sycl7handler8finalizeEv
 _ZN2cl4sycl7program17build_with_sourceENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEES7_
 _ZN2cl4sycl7program19compile_with_sourceENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEES7_
 _ZN2cl4sycl7program22build_with_kernel_nameENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEES7_l

@@ -43,7 +43,11 @@ int main() {
   check_size<device_selector, 8>();
   check_size<event, 16>();
   check_size<gpu_selector, 8>();
-  check_size<handler, 472>();
+#ifdef _MSC_VER
+  check_size<handler, 520>();
+#else
+  check_size<handler, 528>();
+#endif
   check_size<image<1>, 16>();
   check_size<kernel, 16>();
   check_size<platform, 16>();

@@ -0,0 +1,121 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUNx: env SYCL_DEVICE_TYPE=HOST %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+// This test performs basic checks of parallel_for(nd_range, reduction, func)
+// with reduction and conditional increment of the reduction variable.
+
+#include <CL/sycl.hpp>
+#include <cassert>
+
+using namespace cl::sycl;
+
+template <typename T, class BinaryOperation>
+void initInputData(buffer<T, 1> &InBuf, T &ExpectedOut, T Identity,
+                   BinaryOperation BOp, size_t N) {
+  ExpectedOut = Identity;
+  auto In = InBuf.template get_access<access::mode::write>();
+  for (int I = 0; I < N; ++I) {
+    if (std::is_same<BinaryOperation, std::multiplies<T>>::value)
+      In[I] = 1 + (((I % 37) == 0) ? 1 : 0);
+    else
+      In[I] = I + 1 + 1.1;
+
+    if (I < 2)
+      ExpectedOut = BOp(ExpectedOut, 99);
+    else if (I % 3)
+      ExpectedOut = BOp(ExpectedOut, In[I]);
+    else
+      ; // do nothing.
+  }
+};
+
+template <typename T, int Dim, class BinaryOperation>
+class SomeClass;
+
+template <typename T>
+struct Vec {
+  Vec() : X(0), Y(0) {}
+  Vec(T X, T Y) : X(X), Y(Y) {}
+  Vec(T V) : X(V), Y(V) {}
+  bool operator==(const Vec &P) const {
+    return P.X == X && P.Y == Y;
+  }
+  bool operator!=(const Vec &P) const {
+    return !(*this == P);
+  }
+  T X;
+  T Y;
+};
+template <typename T>
+bool operator==(const Vec<T> &A, const Vec<T> &B) {
+  return A.X == B.X && A.Y == B.Y;
+}
+template <typename T>
+std::ostream &operator<<(std::ostream &OS, const Vec<T> &P) {
+  return OS << "(" << P.X << ", " << P.Y << ")";
+}
+
+template <class T>
+struct VecPlus {
+  using P = Vec<T>;
+  P operator()(const P &A, const P &B) const {
+    return P(A.X + B.X, A.Y + B.Y);
+  }
+};
+
+template <typename T, int Dim, class BinaryOperation>
+void test(T Identity, size_t WGSize, size_t NWItems) {
+  buffer<T, 1> InBuf(NWItems);
+  buffer<T, 1> OutBuf(1);
+
+  // Initialize.
+  BinaryOperation BOp;
+  T CorrectOut;
+  initInputData(InBuf, CorrectOut, Identity, BOp, NWItems);
+
+  // Compute.
+  queue Q;
+  Q.submit([&](handler &CGH) {
+    auto In = InBuf.template get_access<access::mode::read>(CGH);
+    accessor<T, Dim, access::mode::discard_write, access::target::global_buffer>
+        Out(OutBuf, CGH);
+    auto Redu = intel::reduction(Out, Identity, BOp);
+
+    range<1> GlobalRange(NWItems);
+    range<1> LocalRange(WGSize);
+    nd_range<1> NDRange(GlobalRange, LocalRange);
+    CGH.parallel_for<SomeClass<T, Dim, BinaryOperation>>(
+        NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) {
+          size_t I = NDIt.get_global_linear_id();
+          if (I < 2)
+            Sum.combine(T(99));
+          else if (I % 3)
+            Sum.combine(In[I]);
+          else
+            ; // do nothing.
+        });
+  });
+
+  // Check correctness.
+  auto Out = OutBuf.template get_access<access::mode::read>();
+  T ComputedOut = *(Out.get_pointer());
+  if (ComputedOut != CorrectOut) {
+    std::cout << "NWItems = " << NWItems << ", WGSize = " << WGSize << "\n";
+    std::cout << "Computed value: " << ComputedOut
+              << ", Expected value: " << CorrectOut << "\n";
+    assert(0 && "Wrong value.");
+  }
+}
+
+int main() {
+  test<int, 0, intel::plus<int>>(0, 2, 2);
+  test<int, 1, intel::plus<int>>(0, 7, 7);
+  test<int, 0, intel::plus<int>>(0, 2, 64);
+  test<short, 1, intel::plus<short>>(0, 16, 256);
+
+  std::cout << "Test passed\n";
+  return 0;
+}