diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp
index 321f51e8863c7..6c44cbb43fa63 100644
--- a/sycl/include/sycl/handler.hpp
+++ b/sycl/include/sycl/handler.hpp
@@ -187,11 +187,12 @@ class type_erased_cgfo_ty {
 
 public:
   template <class T>
-  type_erased_cgfo_ty(T &f)
-      // NOTE: Even if `T` is a pointer to a function, `&f` is a pointer to a
+  type_erased_cgfo_ty(T &&f)
+      // NOTE: Even if `f` is a pointer to a function, `&f` is a pointer to a
       // pointer to a function and as such can be casted to `void *` (pointer to
       // a function cannot be casted).
-      : object(static_cast<const void *>(&f)), invoker_f(&invoker<T>::call) {}
+      : object(static_cast<const void *>(&f)),
+        invoker_f(&invoker<std::remove_reference_t<T>>::call) {}
   ~type_erased_cgfo_ty() = default;
 
   type_erased_cgfo_ty(const type_erased_cgfo_ty &) = delete;
@@ -3878,14 +3879,6 @@ class HandlerAccess {
     Handler.parallel_for_impl(Range, Props, Kernel);
   }
 
-  template <typename T, typename> struct dependent {
-    using type = T;
-  };
-  template <typename T>
-  using dependent_queue_t = typename dependent<queue, T>::type;
-  template <typename T>
-  using dependent_handler_t = typename dependent<handler, T>::type;
-
   // pre/postProcess are used only for reductions right now, but the
   // abstractions they provide aren't reduction-specific. The main problem they
   // solve is
@@ -3901,71 +3894,16 @@ class HandlerAccess {
   // inside control group function object (lambda above) so we resort to a
   // somewhat hacky way of creating multiple `handler`s and manual finalization
   // of them (instead of the one in `queue::submit`).
-  //
-  // Overloads with `queue &q` are provided in case the caller has it created
-  // already to avoid unnecessary reference count increments associated with
-  // `handler::getQueue()`.
-  template <class FunctorTy>
-  static void preProcess(handler &CGH, dependent_queue_t<FunctorTy> &q,
-                         FunctorTy Func) {
-    bool EventNeeded = !q.is_in_order();
-    handler AuxHandler(getSyclObjImpl(q), EventNeeded);
-    AuxHandler.copyCodeLoc(CGH);
-    std::forward<FunctorTy>(Func)(AuxHandler);
-    auto E = AuxHandler.finalize();
-    assert(!CGH.MIsFinalized &&
-           "Can't do pre-processing if the command has been enqueued already!");
-    if (EventNeeded)
-      CGH.depends_on(E);
-  }
+  __SYCL_EXPORT static void preProcess(handler &CGH, type_erased_cgfo_ty F);
+  __SYCL_EXPORT static void postProcess(handler &CGH, type_erased_cgfo_ty F);
+
   template <class FunctorTy>
-  static void preProcess(dependent_handler_t<FunctorTy> &CGH,
-                         FunctorTy &&Func) {
-    preProcess(CGH, CGH.getQueue(), std::forward<FunctorTy>(Func));
+  static void preProcess(handler &CGH, FunctorTy &Func) {
+    preProcess(CGH, type_erased_cgfo_ty{Func});
   }
   template <class FunctorTy>
-  static void postProcess(dependent_handler_t<FunctorTy> &CGH,
-                          FunctorTy &&Func) {
-    // The "hacky" `handler`s manipulation mentioned above and implemented here
-    // is far from perfect. A better approach would be
-    //
-    //    bool OrigNeedsEvent = CGH.needsEvent()
-    //    assert(CGH.not_finalized/enqueued());
-    //    if (!InOrderQueue)
-    //      CGH.setNeedsEvent()
-    //
-    //    handler PostProcessHandler(Queue, OrigNeedsEvent)
-    //    auto E = CGH.finalize(); // enqueue original or current last
-    //                             // post-process
-    //    if (!InOrder)
-    //      PostProcessHandler.depends_on(E)
-    //
-    //    swap_impls(CGH, PostProcessHandler)
-    //    return; // queue::submit finalizes PostProcessHandler and returns its
-    //            // event if necessary.
-    //
-    // Still hackier than "real" `queue::submit` but at least somewhat sane.
-    // That, however hasn't been tried yet and we have an even hackier approach
-    // copied from what's been done in an old reductions implementation before
-    // eventless submission work has started. Not sure how feasible the approach
-    // above is at this moment.
-
-    // This `finalize` is wrong (at least logically) if
-    //   `assert(!CGH.eventNeeded())`
-    auto E = CGH.finalize();
-    dependent_queue_t<FunctorTy> Queue = CGH.getQueue();
-    bool InOrder = Queue.is_in_order();
-    // Cannot use `CGH.eventNeeded()` alone as there might be subsequent
-    // `postProcess` calls and we cannot address them properly similarly to the
-    // `finalize` issue described above. `swap_impls` suggested above might be
-    // able to handle this scenario naturally.
-    handler AuxHandler(getSyclObjImpl(Queue), CGH.eventNeeded() || !InOrder);
-    if (!InOrder)
-      AuxHandler.depends_on(E);
-    AuxHandler.copyCodeLoc(CGH);
-    std::forward<FunctorTy>(Func)(AuxHandler);
-    CGH.MLastEvent = AuxHandler.finalize();
-    return;
+  static void postProcess(handler &CGH, FunctorTy &Func) {
+    postProcess(CGH, type_erased_cgfo_ty{Func});
   }
 };
 } // namespace detail
diff --git a/sycl/include/sycl/reduction.hpp b/sycl/include/sycl/reduction.hpp
index d55ee7526b24d..2da2fb89e61d6 100644
--- a/sycl/include/sycl/reduction.hpp
+++ b/sycl/include/sycl/reduction.hpp
@@ -1066,7 +1066,7 @@ class reduction_impl_algo {
     std::shared_ptr<int> Counter(malloc_device<int>(1, q), Deleter);
     CGH.addReduction(Counter);
 
-    HandlerAccess::preProcess(CGH, q,
+    HandlerAccess::preProcess(CGH,
                               [Counter = Counter.get()](handler &AuxHandler) {
                                 AuxHandler.memset(Counter, 0, sizeof(int));
                               });
diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp
index fe871a0d85566..8f55572622a70 100644
--- a/sycl/source/handler.cpp
+++ b/sycl/source/handler.cpp
@@ -2403,5 +2403,73 @@ void handler::copyCodeLoc(const handler &other) {
 queue handler::getQueue() {
   return createSyclObjFromImpl<queue>(impl->get_queue());
 }
+namespace detail {
+__SYCL_EXPORT void HandlerAccess::preProcess(handler &CGH,
+                                             type_erased_cgfo_ty F) {
+  queue_impl &Q = CGH.impl->get_queue();
+  bool EventNeeded = !Q.isInOrder();
+#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
+  handler_impl HandlerImpl{Q, nullptr, EventNeeded};
+  handler AuxHandler{HandlerImpl};
+#else
+  handler AuxHandler{Q.shared_from_this(), EventNeeded};
+#endif
+  AuxHandler.copyCodeLoc(CGH);
+  F(AuxHandler);
+  auto E = AuxHandler.finalize();
+  assert(!CGH.MIsFinalized &&
+         "Can't do pre-processing if the command has been enqueued already!");
+  if (EventNeeded)
+    CGH.depends_on(E);
+}
+__SYCL_EXPORT void HandlerAccess::postProcess(handler &CGH,
+                                              type_erased_cgfo_ty F) {
+  // The "hacky" `handler`s manipulation mentioned near the declaration in
+  // `handler.hpp` and implemented here is far from perfect. A better approach
+  // would be
+  //
+  //    bool OrigNeedsEvent = CGH.needsEvent()
+  //    assert(CGH.not_finalized/enqueued());
+  //    if (!InOrderQueue)
+  //      CGH.setNeedsEvent()
+  //
+  //    handler PostProcessHandler(Queue, OrigNeedsEvent)
+  //    auto E = CGH.finalize(); // enqueue original or current last
+  //                             // post-process
+  //    if (!InOrder)
+  //      PostProcessHandler.depends_on(E)
+  //
+  //    swap_impls(CGH, PostProcessHandler)
+  //    return; // queue::submit finalizes PostProcessHandler and returns its
+  //            // event if necessary.
+  //
+  // Still hackier than "real" `queue::submit` but at least somewhat sane.
+  // That, however hasn't been tried yet and we have an even hackier approach
+  // copied from what's been done in an old reductions implementation before
+  // eventless submission work has started. Not sure how feasible the approach
+  // above is at this moment.
+
+  // This `finalize` is wrong (at least logically) if
+  //   `assert(!CGH.eventNeeded())`
+  auto E = CGH.finalize();
+  queue_impl &Q = CGH.impl->get_queue();
+  bool InOrder = Q.isInOrder();
+  // Cannot use `CGH.eventNeeded()` alone as there might be subsequent
+  // `postProcess` calls and we cannot address them properly similarly to the
+  // `finalize` issue described above. `swap_impls` suggested above might be
+  // able to handle this scenario naturally.
+#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
+  handler_impl HandlerImpl{Q, nullptr, CGH.eventNeeded() || !InOrder};
+  handler AuxHandler{HandlerImpl};
+#else
+  handler AuxHandler{Q.shared_from_this(), CGH.eventNeeded() || !InOrder};
+#endif
+  if (!InOrder)
+    AuxHandler.depends_on(E);
+  AuxHandler.copyCodeLoc(CGH);
+  F(AuxHandler);
+  CGH.MLastEvent = AuxHandler.finalize();
+}
+} // namespace detail
 } // namespace _V1
 } // namespace sycl
diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump
index 3e3461f7503a4..d64837ee4e1ec 100644
--- a/sycl/test/abi/sycl_symbols_linux.dump
+++ b/sycl/test/abi/sycl_symbols_linux.dump
@@ -3260,6 +3260,8 @@ _ZN4sycl3_V16detail12buffer_plainC2EmmRKNS0_13property_listESt10unique_ptrINS1_1
 _ZN4sycl3_V16detail12compile_implERKNS0_13kernel_bundleILNS0_12bundle_stateE0EEERKSt6vectorINS0_6deviceESaIS8_EERKNS0_13property_listE
 _ZN4sycl3_V16detail12isOutOfRangeENS0_3vecIiLi4EEENS0_15addressing_modeENS0_5rangeILi3EEE
 _ZN4sycl3_V16detail12make_contextEmRKSt8functionIFvNS0_14exception_listEEENS0_7backendEbRKSt6vectorINS0_6deviceESaISA_EE
+_ZN4sycl3_V16detail13HandlerAccess10preProcessERNS0_7handlerENS1_19type_erased_cgfo_tyE
+_ZN4sycl3_V16detail13HandlerAccess11postProcessERNS0_7handlerENS1_19type_erased_cgfo_tyE
 _ZN4sycl3_V16detail13host_pipe_map3addEPKvPKc
 _ZN4sycl3_V16detail13lgamma_r_implENS1_9half_impl4halfEPi
 _ZN4sycl3_V16detail13lgamma_r_implEdPi
diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump
index b2e79d3693bca..9ddafd82955a4 100644
--- a/sycl/test/abi/sycl_symbols_windows.dump
+++ b/sycl/test/abi/sycl_symbols_windows.dump
@@ -4332,6 +4332,8 @@
 ?pitched_alloc_device@experimental@oneapi@ext@_V1@sycl@@YAPEAXPEA_KAEBUimage_descriptor@12345@AEBVqueue@45@@Z
 ?pitched_alloc_device@experimental@oneapi@ext@_V1@sycl@@YAPEAXPEA_K_K1IAEBVdevice@45@AEBVcontext@45@@Z
 ?pitched_alloc_device@experimental@oneapi@ext@_V1@sycl@@YAPEAXPEA_K_K1IAEBVqueue@45@@Z
+?postProcess@HandlerAccess@detail@_V1@sycl@@SAXAEAVhandler@34@Vtype_erased_cgfo_ty@234@@Z
+?preProcess@HandlerAccess@detail@_V1@sycl@@SAXAEAVhandler@34@Vtype_erased_cgfo_ty@234@@Z
 ?prefetch@handler@_V1@sycl@@QEAAXPEBX_K@Z
 ?prefetch@queue@_V1@sycl@@QEAA?AVevent@23@PEBX_KAEBUcode_location@detail@23@@Z
 ?prefetch@queue@_V1@sycl@@QEAA?AVevent@23@PEBX_KAEBV?$vector@Vevent@_V1@sycl@@V?$allocator@Vevent@_V1@sycl@@@std@@@std@@AEBUcode_location@detail@23@@Z