[SYCL] Add run_on_host_intel method to handler.

romanovvlad · romanovvlad · commit 5e16cf42c61a · 2019-09-05T16:56:02.000+03:00
run_on_host_intel allows inserting a task that runs regular host code into a
SYCL DAG.

Signed-off-by: Vlad Romanov <vlad.romanov@intel.com>
diff --git a/sycl/include/CL/sycl/detail/cg.hpp b/sycl/include/CL/sycl/detail/cg.hpp
@@ -325,7 +325,8 @@ class CG {
     COPY_PTR_TO_ACC,
     COPY_ACC_TO_ACC,
     FILL,
-    UPDATE_HOST
+    UPDATE_HOST,
+    RUN_ON_HOST_INTEL
   };
 
   CG(CGTYPE Type, std::vector<std::vector<char>> ArgsStorage,
@@ -340,10 +341,6 @@ class CG {
 
   CG(CG &&CommandGroup) = default;
 
-  std::vector<Requirement *> getRequirements() const { return MRequirements; }
-
-  std::vector<detail::EventImplPtr> getEvents() const { return MEvents; }
-
   CGTYPE getType() { return MType; }
 
   virtual ~CG() = default;
@@ -358,6 +355,8 @@ class CG {
   std::vector<detail::AccessorImplPtr> MAccStorage;
   // Storage for shared_ptrs.
   std::vector<std::shared_ptr<const void>> MSharedPtrStorage;
+
+public:
   // List of requirements that specify which memory is needed for the command
   // group to be executed.
   std::vector<Requirement *> MRequirements;
@@ -385,14 +384,18 @@ class CGExecKernel : public CG {
                std::vector<detail::EventImplPtr> Events,
                std::vector<ArgDesc> Args, std::string KernelName,
                detail::OSModuleHandle OSModuleHandle,
-               std::vector<std::shared_ptr<detail::stream_impl>> Streams)
-      : CG(KERNEL, std::move(ArgsStorage), std::move(AccStorage),
+               std::vector<std::shared_ptr<detail::stream_impl>> Streams,
+               CGTYPE Type)
+      : CG(Type, std::move(ArgsStorage), std::move(AccStorage),
            std::move(SharedPtrStorage), std::move(Requirements),
            std::move(Events)),
         MNDRDesc(std::move(NDRDesc)), MHostKernel(std::move(HKernel)),
         MSyclKernel(std::move(SyclKernel)), MArgs(std::move(Args)),
         MKernelName(std::move(KernelName)), MOSModuleHandle(OSModuleHandle),
-        MStreams(std::move(Streams)) {}
+        MStreams(std::move(Streams)) {
+    assert((getType() == RUN_ON_HOST_INTEL || getType() == KERNEL) &&
+           "Wrong type of exec kernel CG.");
+  }
 
   std::vector<ArgDesc> getArguments() const { return MArgs; }
   std::string getKernelName() const { return MKernelName; }
diff --git a/sycl/include/CL/sycl/detail/pi.def b/sycl/include/CL/sycl/detail/pi.def
@@ -78,6 +78,7 @@ _PI_API(piSamplerRetain)
 _PI_API(piSamplerRelease)
 // Queue commands
 _PI_API(piEnqueueKernelLaunch)
+_PI_API(piEnqueueNativeKernel)
 _PI_API(piEnqueueEventsWait)
 _PI_API(piEnqueueMemBufferRead)
 _PI_API(piEnqueueMemBufferReadRect)
diff --git a/sycl/include/CL/sycl/detail/pi.h b/sycl/include/CL/sycl/detail/pi.h
@@ -691,6 +691,18 @@ pi_result piEnqueueKernelLaunch(
   const pi_event *  event_wait_list,
   pi_event *        event);
 
+pi_result piEnqueueNativeKernel(
+  pi_queue         queue,
+  void             (*user_func)(void *),
+  void *           args,
+  size_t           cb_args,
+  pi_uint32        num_mem_objects,
+  const pi_mem *   mem_list,
+  const void **    args_mem_loc,
+  pi_uint32        num_events_in_wait_list,
+  const pi_event * event_wait_list,
+  pi_event *       event);
+
 pi_result piEnqueueEventsWait(
   pi_queue          command_queue,
   pi_uint32         num_events_in_wait_list,
diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp
@@ -355,12 +355,13 @@ class handler {
     std::unique_ptr<detail::CG> CommandGroup;
     switch (MCGType) {
     case detail::CG::KERNEL:
+    case detail::CG::RUN_ON_HOST_INTEL:
       CommandGroup.reset(new detail::CGExecKernel(
           std::move(MNDRDesc), std::move(MHostKernel), std::move(MSyclKernel),
           std::move(MArgsStorage), std::move(MAccStorage),
           std::move(MSharedPtrStorage), std::move(MRequirements),
           std::move(MEvents), std::move(MArgs), std::move(MKernelName),
-          std::move(MOSModuleHandle), std::move(MStreamStorage)));
+          std::move(MOSModuleHandle), std::move(MStreamStorage), MCGType));
       break;
     case detail::CG::COPY_ACC_TO_PTR:
     case detail::CG::COPY_PTR_TO_ACC:
@@ -671,6 +672,16 @@ class handler {
 #endif
   }
 
+  // Similar to single_task, but the passed lambda is executed on the host.
+  template <typename FuncT> void run_on_host_intel(FuncT Func) {
+    MNDRDesc.set(range<1>{1});
+
+    MArgs = std::move(MAssociatedAccesors);
+    MHostKernel.reset(
+        new detail::HostKernel<FuncT, void, 1>(std::move(Func)));
+    MCGType = detail::CG::RUN_ON_HOST_INTEL;
+  }
+
   // parallel_for version with a kernel represented as a lambda + range and
   // offset that specify global size and global offset correspondingly.
   template <typename KernelName = csd::auto_name, typename KernelType, int Dims>
diff --git a/sycl/source/detail/pi_opencl.cpp b/sycl/source/detail/pi_opencl.cpp
@@ -310,6 +310,7 @@ _PI_CL(piSamplerRetain,         clRetainSampler)
 _PI_CL(piSamplerRelease,        clReleaseSampler)
 // Queue commands
 _PI_CL(piEnqueueKernelLaunch,        clEnqueueNDRangeKernel)
+_PI_CL(piEnqueueNativeKernel,        clEnqueueNativeKernel)
 _PI_CL(piEnqueueEventsWait,          clEnqueueMarkerWithWaitList)
 _PI_CL(piEnqueueMemBufferRead,       clEnqueueReadBuffer)
 _PI_CL(piEnqueueMemBufferReadRect,   clEnqueueReadBufferRect)
diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp
@@ -533,6 +533,20 @@ static void adjustNDRangePerKernel(NDRDescT &NDR, RT::PiKernel Kernel,
   NDR.set(NDR.Dims, nd_range<3>(NDR.NumWorkGroups * WGSize, WGSize));
 }
 
+// Stores the memory pointers from the blob into the task's requirements and
+// calls the host kernel. It is passed to piEnqueueNativeKernel, which requires
+// that the passed function takes one void* argument.
+void DispatchNativeKernel(void *Blob) {
+  // The first value is a pointer to the corresponding CGExecKernel object.
+  CGExecKernel *HostTask = *(CGExecKernel **)Blob;
+
+  // The other values are pointers to the buffers.
+  void **NextArg = (void **)Blob + 1;
+  for (detail::Requirement *Req : HostTask->MRequirements)
+    Req->MData = *(NextArg++);
+  HostTask->MHostKernel->call(HostTask->MNDRDesc);
+}
+
 cl_int ExecCGCommand::enqueueImp() {
   std::vector<RT::PiEvent> RawEvents =
       Command::prepareEvents(detail::getSyclObjImpl(MQueue->get_context()));
@@ -606,6 +620,68 @@ cl_int ExecCGCommand::enqueueImp() {
                         Event);
     return CL_SUCCESS;
   }
+  case CG::CGTYPE::RUN_ON_HOST_INTEL: {
+    CGExecKernel *HostTask = (CGExecKernel *)MCommandGroup.get();
+
+    // piEnqueueNativeKernel takes an arguments blob, together with its size in
+    // bytes, which is passed to the user function. Reserve extra space for the
+    // pointer to CGExecKernel, stored first, to restore context.
+    std::vector<void *> ArgsBlob(HostTask->MArgs.size() + 1);
+    ArgsBlob[0] = (void *)HostTask;
+    void **NextArg = ArgsBlob.data() + 1;
+
+    if (MQueue->is_host()) {
+      for (ArgDesc &Arg : HostTask->MArgs) {
+        assert(Arg.MType == kernel_param_kind_t::kind_accessor);
+
+        Requirement *Req = (Requirement *)(Arg.MPtr);
+        AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
+
+        *NextArg = AllocaCmd->getMemAllocation();
+        NextArg++;
+      }
+
+      if (!RawEvents.empty())
+        PI_CALL(RT::piEventsWait(RawEvents.size(), &RawEvents[0]));
+      DispatchNativeKernel((void*)ArgsBlob.data());
+      return CL_SUCCESS;
+    }
+
+    std::vector<pi_mem> Buffers;
+    // piEnqueueNativeKernel requires an additional array of pointers into the
+    // args blob; the values those pointers point to are replaced with actual
+    // pointers to the memory before execution of the user function.
+    std::vector<void*> MemLocs;
+
+    for (ArgDesc &Arg : HostTask->MArgs) {
+      assert(Arg.MType == kernel_param_kind_t::kind_accessor);
+
+      Requirement *Req = (Requirement *)(Arg.MPtr);
+      AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
+      pi_mem MemArg = (pi_mem)AllocaCmd->getMemAllocation();
+
+      Buffers.push_back(MemArg);
+      MemLocs.push_back(NextArg);
+      NextArg++;
+    }
+
+    pi_result Error = PI_CALL_RESULT(RT::piEnqueueNativeKernel(
+        MQueue->getHandleRef(), DispatchNativeKernel, (void *)ArgsBlob.data(),
+        ArgsBlob.size() * sizeof(ArgsBlob[0]), Buffers.size(), Buffers.data(),
+        (const void **)MemLocs.data(), RawEvents.size(),
+        RawEvents.empty() ? nullptr : RawEvents.data(), &Event));
+
+    switch (Error) {
+    case PI_INVALID_OPERATION:
+      throw cl::sycl::runtime_error(
+          "Device doesn't support run_on_host_intel tasks.", Error);
+    case PI_SUCCESS:
+      return Error;
+    default:
+      throw cl::sycl::runtime_error(
+          "Enqueueing run_on_host_intel task has failed.", Error);
+    }
+  }
   case CG::CGTYPE::KERNEL: {
     CGExecKernel *ExecKernel = (CGExecKernel *)MCommandGroup.get();
 
diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp
@@ -592,8 +592,8 @@ void Scheduler::GraphBuilder::markModifiedIfWrite(
 Command *
 Scheduler::GraphBuilder::addCG(std::unique_ptr<detail::CG> CommandGroup,
                                QueueImplPtr Queue) {
-  std::vector<Requirement *> Reqs = CommandGroup->getRequirements();
-  std::vector<detail::EventImplPtr> Events = CommandGroup->getEvents();
+  const std::vector<Requirement *> &Reqs = CommandGroup->MRequirements;
+  const std::vector<detail::EventImplPtr> &Events = CommandGroup->MEvents;
   std::unique_ptr<ExecCGCommand> NewCmd(
       new ExecCGCommand(std::move(CommandGroup), Queue));
   if (!NewCmd)
diff --git a/sycl/test/basic_tests/handler/run_on_host_intel.cpp b/sycl/test/basic_tests/handler/run_on_host_intel.cpp
@@ -0,0 +1,53 @@
+// RUN: %clangxx -fsycl %s -o %t.out -lOpenCL
+// RUN: env SYCL_DEVICE_TYPE=HOST %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+
+//==-- run_on_host_intel.cpp -----------------------------------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "CL/sycl/access/access.hpp"
+#include <CL/sycl.hpp>
+
+#include "../../helpers.hpp"
+
+using namespace cl;
+
+template <typename SrcAccType, typename DstAccType>
+void copyAndAdd(SrcAccType SrcAcc, DstAccType DstAcc, int Var) {
+  for (int I = 0; I < (int)DstAcc.get_count(); ++I)
+    DstAcc[I] = Var + SrcAcc[I];
+}
+
+int main() {
+  constexpr size_t BufSize = 4;
+  int data1[BufSize] = {-1, -1, -1, -1};
+  sycl::buffer<int, 1> SrcBuf(data1, sycl::range<1>{BufSize});
+  sycl::buffer<int, 1> DstBuf(sycl::range<1>{BufSize});
+
+  TestQueue Queue{sycl::default_selector{}};
+  Queue.submit([&](sycl::handler &CGH) {
+    auto SrcAcc = SrcBuf.get_access<sycl::access::mode::read>(CGH);
+    auto DstAcc = DstBuf.get_access<sycl::access::mode::write>(CGH);
+    const int Var = 43;
+
+    CGH.run_on_host_intel([=]() { copyAndAdd(SrcAcc, DstAcc, Var); });
+  });
+  // Every element should be Var + SrcAcc[I], i.e. 43 + (-1) = 42.
+  auto DstAcc = DstBuf.template get_access<sycl::access::mode::read_write>();
+  const int Expected = 42;
+  for (int I = 0; I < (int)DstAcc.get_count(); ++I)
+    if (DstAcc[I] != Expected) {
+      std::cerr << "Mismatch. Elem " << I << ". Expected: " << Expected
+                << ", Got: " << DstAcc[I] << std::endl;
+      return 1;
+    }
+
+  std::cout << "Success" << std::endl;
+
+  return 0;
+}
diff --git a/sycl/test/basic_tests/image_api.cpp b/sycl/test/basic_tests/image_api.cpp
@@ -130,7 +130,8 @@ int main() {
         std::move(Handler.MAccStorage), std::move(Handler.MSharedPtrStorage),
         std::move(Handler.MRequirements), /*DepsEvents*/ {},
         std::move(Handler.MArgs), std::move(Handler.MKernelName),
-        std::move(Handler.MOSModuleHandle), std::move(Handler.MStreamStorage)));
+        std::move(Handler.MOSModuleHandle), std::move(Handler.MStreamStorage),
+        d::CG::KERNEL));
 
     d::EventImplPtr Event = d::Scheduler::getInstance().addCG(
         std::move(CommandGroup), d::getSyclObjImpl(Queue));