diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 321f51e8863c7..6c44cbb43fa63 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -187,11 +187,12 @@ class type_erased_cgfo_ty { public: template - type_erased_cgfo_ty(T &f) - // NOTE: Even if `T` is a pointer to a function, `&f` is a pointer to a + type_erased_cgfo_ty(T &&f) + // NOTE: Even if `f` is a pointer to a function, `&f` is a pointer to a // pointer to a function and as such can be casted to `void *` (pointer to // a function cannot be casted). - : object(static_cast(&f)), invoker_f(&invoker::call) {} + : object(static_cast(&f)), + invoker_f(&invoker>::call) {} ~type_erased_cgfo_ty() = default; type_erased_cgfo_ty(const type_erased_cgfo_ty &) = delete; @@ -3878,14 +3879,6 @@ class HandlerAccess { Handler.parallel_for_impl(Range, Props, Kernel); } - template struct dependent { - using type = T; - }; - template - using dependent_queue_t = typename dependent::type; - template - using dependent_handler_t = typename dependent::type; - // pre/postProcess are used only for reductions right now, but the // abstractions they provide aren't reduction-specific. The main problem they // solve is @@ -3901,71 +3894,16 @@ class HandlerAccess { // inside control group function object (lambda above) so we resort to a // somewhat hacky way of creating multiple `handler`s and manual finalization // of them (instead of the one in `queue::submit`). - // - // Overloads with `queue &q` are provided in case the caller has it created - // already to avoid unnecessary reference count increments associated with - // `handler::getQueue()`. - template - static void preProcess(handler &CGH, dependent_queue_t &q, - FunctorTy Func) { - bool EventNeeded = !q.is_in_order(); - handler AuxHandler(getSyclObjImpl(q), EventNeeded); - AuxHandler.copyCodeLoc(CGH); - std::forward(Func)(AuxHandler); - auto E = AuxHandler.finalize(); - assert(!CGH.MIsFinalized && - "Can't do pre-processing if the command has been enqueued already!"); - if (EventNeeded) - CGH.depends_on(E); - } + __SYCL_EXPORT static void preProcess(handler &CGH, type_erased_cgfo_ty F); + __SYCL_EXPORT static void postProcess(handler &CGH, type_erased_cgfo_ty F); + template - static void preProcess(dependent_handler_t &CGH, - FunctorTy &&Func) { - preProcess(CGH, CGH.getQueue(), std::forward(Func)); + static void preProcess(handler &CGH, FunctorTy &Func) { + preProcess(CGH, type_erased_cgfo_ty{Func}); } template - static void postProcess(dependent_handler_t &CGH, - FunctorTy &&Func) { - // The "hacky" `handler`s manipulation mentioned above and implemented here - // is far from perfect. A better approach would be - // - // bool OrigNeedsEvent = CGH.needsEvent() - // assert(CGH.not_finalized/enqueued()); - // if (!InOrderQueue) - // CGH.setNeedsEvent() - // - // handler PostProcessHandler(Queue, OrigNeedsEvent) - // auto E = CGH.finalize(); // enqueue original or current last - // // post-process - // if (!InOrder) - // PostProcessHandler.depends_on(E) - // - // swap_impls(CGH, PostProcessHandler) - // return; // queue::submit finalizes PostProcessHandler and returns its - // // event if necessary. - // - // Still hackier than "real" `queue::submit` but at least somewhat sane. - // That, however hasn't been tried yet and we have an even hackier approach - // copied from what's been done in an old reductions implementation before - // eventless submission work has started. Not sure how feasible the approach - // above is at this moment. - - // This `finalize` is wrong (at least logically) if - // `assert(!CGH.eventNeeded())` - auto E = CGH.finalize(); - dependent_queue_t Queue = CGH.getQueue(); - bool InOrder = Queue.is_in_order(); - // Cannot use `CGH.eventNeeded()` alone as there might be subsequent - // `postProcess` calls and we cannot address them properly similarly to the - // `finalize` issue described above. `swap_impls` suggested above might be - // able to handle this scenario naturally. - handler AuxHandler(getSyclObjImpl(Queue), CGH.eventNeeded() || !InOrder); - if (!InOrder) - AuxHandler.depends_on(E); - AuxHandler.copyCodeLoc(CGH); - std::forward(Func)(AuxHandler); - CGH.MLastEvent = AuxHandler.finalize(); - return; + static void postProcess(handler &CGH, FunctorTy &Func) { + postProcess(CGH, type_erased_cgfo_ty{Func}); } }; } // namespace detail diff --git a/sycl/include/sycl/reduction.hpp b/sycl/include/sycl/reduction.hpp index d55ee7526b24d..2da2fb89e61d6 100644 --- a/sycl/include/sycl/reduction.hpp +++ b/sycl/include/sycl/reduction.hpp @@ -1066,7 +1066,7 @@ class reduction_impl_algo { std::shared_ptr Counter(malloc_device(1, q), Deleter); CGH.addReduction(Counter); - HandlerAccess::preProcess(CGH, q, + HandlerAccess::preProcess(CGH, [Counter = Counter.get()](handler &AuxHandler) { AuxHandler.memset(Counter, 0, sizeof(int)); }); diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index fe871a0d85566..8f55572622a70 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -2403,5 +2403,73 @@ void handler::copyCodeLoc(const handler &other) { queue handler::getQueue() { return createSyclObjFromImpl(impl->get_queue()); } +namespace detail { +__SYCL_EXPORT void HandlerAccess::preProcess(handler &CGH, + type_erased_cgfo_ty F) { + queue_impl &Q = CGH.impl->get_queue(); + bool EventNeeded = !Q.isInOrder(); +#ifdef __INTEL_PREVIEW_BREAKING_CHANGES + handler_impl HandlerImpl{Q, nullptr, EventNeeded}; + handler AuxHandler{HandlerImpl}; +#else + handler AuxHandler{Q.shared_from_this(), EventNeeded}; +#endif + AuxHandler.copyCodeLoc(CGH); + F(AuxHandler); + auto E = AuxHandler.finalize(); + assert(!CGH.MIsFinalized && + "Can't do pre-processing if the command has been enqueued already!"); + if (EventNeeded) + CGH.depends_on(E); +} +__SYCL_EXPORT void HandlerAccess::postProcess(handler &CGH, + type_erased_cgfo_ty F) { + // The "hacky" `handler`s manipulation mentioned near the declaration in + // `handler.hpp` and implemented here is far from perfect. A better approach + // would be + // + // bool OrigNeedsEvent = CGH.needsEvent() + // assert(CGH.not_finalized/enqueued()); + // if (!InOrderQueue) + // CGH.setNeedsEvent() + // + // handler PostProcessHandler(Queue, OrigNeedsEvent) + // auto E = CGH.finalize(); // enqueue original or current last + // // post-process + // if (!InOrder) + // PostProcessHandler.depends_on(E) + // + // swap_impls(CGH, PostProcessHandler) + // return; // queue::submit finalizes PostProcessHandler and returns its + // // event if necessary. + // + // Still hackier than "real" `queue::submit` but at least somewhat sane. + // That, however hasn't been tried yet and we have an even hackier approach + // copied from what's been done in an old reductions implementation before + // eventless submission work has started. Not sure how feasible the approach + // above is at this moment. + + // This `finalize` is wrong (at least logically) if + // `assert(!CGH.eventNeeded())` + auto E = CGH.finalize(); + queue_impl &Q = CGH.impl->get_queue(); + bool InOrder = Q.isInOrder(); + // Cannot use `CGH.eventNeeded()` alone as there might be subsequent + // `postProcess` calls and we cannot address them properly similarly to the + // `finalize` issue described above. `swap_impls` suggested above might be + // able to handle this scenario naturally. +#ifdef __INTEL_PREVIEW_BREAKING_CHANGES + handler_impl HandlerImpl{Q, nullptr, CGH.eventNeeded() || !InOrder}; + handler AuxHandler{HandlerImpl}; +#else + handler AuxHandler{Q.shared_from_this(), CGH.eventNeeded() || !InOrder}; +#endif + if (!InOrder) + AuxHandler.depends_on(E); + AuxHandler.copyCodeLoc(CGH); + F(AuxHandler); + CGH.MLastEvent = AuxHandler.finalize(); +} +} // namespace detail } // namespace _V1 } // namespace sycl diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index 3e3461f7503a4..d64837ee4e1ec 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3260,6 +3260,8 @@ _ZN4sycl3_V16detail12buffer_plainC2EmmRKNS0_13property_listESt10unique_ptrINS1_1 _ZN4sycl3_V16detail12compile_implERKNS0_13kernel_bundleILNS0_12bundle_stateE0EEERKSt6vectorINS0_6deviceESaIS8_EERKNS0_13property_listE _ZN4sycl3_V16detail12isOutOfRangeENS0_3vecIiLi4EEENS0_15addressing_modeENS0_5rangeILi3EEE _ZN4sycl3_V16detail12make_contextEmRKSt8functionIFvNS0_14exception_listEEENS0_7backendEbRKSt6vectorINS0_6deviceESaISA_EE +_ZN4sycl3_V16detail13HandlerAccess10preProcessERNS0_7handlerENS1_19type_erased_cgfo_tyE +_ZN4sycl3_V16detail13HandlerAccess11postProcessERNS0_7handlerENS1_19type_erased_cgfo_tyE _ZN4sycl3_V16detail13host_pipe_map3addEPKvPKc _ZN4sycl3_V16detail13lgamma_r_implENS1_9half_impl4halfEPi _ZN4sycl3_V16detail13lgamma_r_implEdPi diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index b2e79d3693bca..9ddafd82955a4 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -4332,6 +4332,8 @@ ?pitched_alloc_device@experimental@oneapi@ext@_V1@sycl@@YAPEAXPEA_KAEBUimage_descriptor@12345@AEBVqueue@45@@Z ?pitched_alloc_device@experimental@oneapi@ext@_V1@sycl@@YAPEAXPEA_K_K1IAEBVdevice@45@AEBVcontext@45@@Z ?pitched_alloc_device@experimental@oneapi@ext@_V1@sycl@@YAPEAXPEA_K_K1IAEBVqueue@45@@Z +?postProcess@HandlerAccess@detail@_V1@sycl@@SAXAEAVhandler@34@Vtype_erased_cgfo_ty@234@@Z +?preProcess@HandlerAccess@detail@_V1@sycl@@SAXAEAVhandler@34@Vtype_erased_cgfo_ty@234@@Z ?prefetch@handler@_V1@sycl@@QEAAXPEBX_K@Z ?prefetch@queue@_V1@sycl@@QEAA?AVevent@23@PEBX_KAEBUcode_location@detail@23@@Z ?prefetch@queue@_V1@sycl@@QEAA?AVevent@23@PEBX_KAEBV?$vector@Vevent@_V1@sycl@@V?$allocator@Vevent@_V1@sycl@@@std@@@std@@AEBUcode_location@detail@23@@Z