@@ -533,6 +533,20 @@ static void adjustNDRangePerKernel(NDRDescT &NDR, RT::PiKernel Kernel,
533533 NDR.set (NDR.Dims , nd_range<3 >(NDR.NumWorkGroups * WGSize, WGSize));
534534}
535535
536+ // The function initialize accessors and calls lambda.
537+ // The function is used as argument to piEnqueueNativeKernel which requires
538+ // that the passed function takes one void* argument.
539+ void DispatchNativeKernel (void *Blob) {
540+ // First value is a pointer to Corresponding CGExecKernel object.
541+ CGExecKernel *HostTask = *(CGExecKernel **)Blob;
542+
543+ // Other value are pointer to the buffers.
544+ void **NextArg = (void **)Blob + 1 ;
545+ for (detail::Requirement *Req : HostTask->MRequirements )
546+ Req->MData = *(NextArg++);
547+ HostTask->MHostKernel ->call (HostTask->MNDRDesc );
548+ }
549+
536550cl_int ExecCGCommand::enqueueImp () {
537551 std::vector<RT::PiEvent> RawEvents =
538552 Command::prepareEvents (detail::getSyclObjImpl (MQueue->get_context ()));
@@ -606,6 +620,68 @@ cl_int ExecCGCommand::enqueueImp() {
606620 Event);
607621 return CL_SUCCESS;
608622 }
623+ case CG::CGTYPE::RUN_ON_HOST_INTEL: {
624+ CGExecKernel *HostTask = (CGExecKernel *)MCommandGroup.get ();
625+
626+ // piEnqueueNativeKernel takes arguments blob which is passes to user
627+ // function.
628+ // Reserve extra space for the pointer to CGExecKernel to restore context.
629+ std::vector<void *> ArgsBlob (HostTask->MArgs .size () + 1 );
630+ ArgsBlob[0 ] = (void *)HostTask;
631+ void **NextArg = ArgsBlob.data () + 1 ;
632+
633+ if (MQueue->is_host ()) {
634+ for (ArgDesc &Arg : HostTask->MArgs ) {
635+ assert (Arg.MType == kernel_param_kind_t ::kind_accessor);
636+
637+ Requirement *Req = (Requirement *)(Arg.MPtr );
638+ AllocaCommandBase *AllocaCmd = getAllocaForReq (Req);
639+
640+ *NextArg = AllocaCmd->getMemAllocation ();
641+ NextArg++;
642+ }
643+
644+ if (!RawEvents.empty ())
645+ PI_CALL (RT::piEventsWait (RawEvents.size (), &RawEvents[0 ]));
646+ DispatchNativeKernel ((void *)ArgsBlob.data ());
647+ return CL_SUCCESS;
648+ }
649+
650+ std::vector<pi_mem> Buffers;
651+ // piEnqueueNativeKernel requires additional array of pointers to args blob,
652+ // values that pointers point to are replaced with actual pointers to the
653+ // memory before execution of user function.
654+ std::vector<void *> MemLocs;
655+
656+ for (ArgDesc &Arg : HostTask->MArgs ) {
657+ assert (Arg.MType == kernel_param_kind_t ::kind_accessor);
658+
659+ Requirement *Req = (Requirement *)(Arg.MPtr );
660+ AllocaCommandBase *AllocaCmd = getAllocaForReq (Req);
661+ pi_mem MemArg = (pi_mem)AllocaCmd->getMemAllocation ();
662+
663+ Buffers.push_back (MemArg);
664+ MemLocs.push_back (NextArg);
665+ NextArg++;
666+ }
667+
668+ pi_result Error = PI_CALL_RESULT (RT::piEnqueueNativeKernel (
669+ MQueue->getHandleRef (), DispatchNativeKernel, (void *)ArgsBlob.data (),
670+ HostTask->MArgs [0 ].MSize , Buffers.size (), Buffers.data (),
671+ (const void **)MemLocs.data (), RawEvents.size (),
672+ RawEvents.empty () ? nullptr : RawEvents.data (), &Event));
673+
674+ switch (Error) {
675+ case PI_INVALID_OPERATION:
676+ throw cl::sycl::runtime_error (
677+ " Device doesn't support run_on_host_intel tasks." , Error);
678+ case PI_SUCCESS:
679+ return Error;
680+ default :
681+ throw cl::sycl::runtime_error (
682+ " Enqueueing run_on_host_intel task has failed." , Error);
683+ }
684+ }
609685 case CG::CGTYPE::KERNEL: {
610686 CGExecKernel *ExecKernel = (CGExecKernel *)MCommandGroup.get ();
611687
0 commit comments