diff --git a/sycl/include/sycl/reduction.hpp b/sycl/include/sycl/reduction.hpp index 5def244a01ae0..ff955ce8b9eda 100644 --- a/sycl/include/sycl/reduction.hpp +++ b/sycl/include/sycl/reduction.hpp @@ -100,7 +100,8 @@ using IsReduOptForFastAtomicFetch = std::bool_constant; #else std::bool_constant<((is_sgenfloat_v && sizeof(T) == 4) || - is_sgeninteger_v)&&IsValidAtomicType::value && + is_sgeninteger_v) && + IsValidAtomicType::value && (IsPlus::value || IsMinimum::value || IsMaximum::value || @@ -138,11 +139,12 @@ using IsReduOptForFastReduce = #ifdef SYCL_REDUCTION_DETERMINISTIC std::bool_constant; #else - std::bool_constant<( - (is_sgeninteger_v && (sizeof(T) == 4 || sizeof(T) == 8)) || - is_sgenfloat_v)&&(IsPlus::value || - IsMinimum::value || - IsMaximum::value)>; + std::bool_constant<((is_sgeninteger_v && + (sizeof(T) == 4 || sizeof(T) == 8)) || + is_sgenfloat_v) && + (IsPlus::value || + IsMinimum::value || + IsMaximum::value)>; #endif // std::tuple seems to be a) too heavy and b) not copyable to device now @@ -835,6 +837,10 @@ using __sycl_init_mem_for = std::conditional_t, auto_name, reduction::InitMemKrn>; +__SYCL_EXPORT void +addCounterInit(handler &CGH, std::shared_ptr &Queue, + std::shared_ptr &Counter); + template class reduction_impl_algo { @@ -1075,8 +1081,7 @@ class reduction_impl_algo { std::shared_ptr Counter(malloc_device(1, q), Deleter); CGH.addReduction(Counter); - auto Event = q.memset(Counter.get(), 0, sizeof(int)); - CGH.depends_on(Event); + addCounterInit(CGH, CGH.MQueue, Counter); return Counter.get(); } diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 05c579f78a405..22fcafee0829c 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -27,6 +27,20 @@ namespace sycl { inline namespace _V1 { namespace detail { std::atomic queue_impl::MNextAvailableQueueID = 0; +thread_local bool NestedCallsDetector = false; +class NestedCallsTracker { +public: + NestedCallsTracker() { + if (NestedCallsDetector) + throw sycl::exception( + make_error_code(errc::invalid), + "Calls to sycl::queue::submit cannot be nested. Command group " + "function objects should use the sycl::handler API instead."); + NestedCallsDetector = true; + } + + ~NestedCallsTracker() { NestedCallsDetector = false; } +}; static std::vector getPIEvents(const std::vector &DepEvents) { @@ -330,6 +344,46 @@ void queue_impl::addSharedEvent(const event &Event) { MEventsShared.push_back(Event); } +event queue_impl::submit_impl(const std::function &CGF, + const std::shared_ptr &Self, + const std::shared_ptr &PrimaryQueue, + const std::shared_ptr &SecondaryQueue, + const detail::code_location &Loc, + const SubmitPostProcessF *PostProcess) { + handler Handler(Self, PrimaryQueue, SecondaryQueue, MHostQueue); + Handler.saveCodeLoc(Loc); + + { + NestedCallsTracker tracker; + CGF(Handler); + } + + // Scheduler will later omit events, that are not required to execute tasks. + // Host and interop tasks, however, are not submitted to low-level runtimes + // and require separate dependency management. + const CG::CGTYPE Type = Handler.getType(); + event Event = detail::createSyclObjFromImpl( + std::make_shared()); + + if (PostProcess) { + bool IsKernel = Type == CG::Kernel; + bool KernelUsesAssert = false; + + if (IsKernel) + // Kernel only uses assert if it's non interop one + KernelUsesAssert = !(Handler.MKernel && Handler.MKernel->isInterop()) && + ProgramManager::getInstance().kernelUsesAssert( + Handler.MKernelName.c_str()); + finalizeHandler(Handler, Event); + + (*PostProcess)(IsKernel, KernelUsesAssert, Event); + } else + finalizeHandler(Handler, Event); + + addEvent(Event); + return Event; +} + template event queue_impl::submitWithHandler(const std::shared_ptr &Self, const std::vector &DepEvents, @@ -362,6 +416,7 @@ event queue_impl::submitMemOpHelper(const std::shared_ptr &Self, if (MGraph.expired() && Scheduler::areEventsSafeForSchedulerBypass( ExpandedDepEvents, MContext)) { if (MSupportsDiscardingPiEvents) { + NestedCallsTracker tracker; MemOpFunc(MemOpArgs..., getPIEvents(ExpandedDepEvents), /*PiEvent*/ nullptr, /*EventImplPtr*/ nullptr); return createDiscardedEvent(); @@ -369,8 +424,11 @@ event queue_impl::submitMemOpHelper(const std::shared_ptr &Self, event ResEvent = prepareSYCLEventAssociatedWithQueue(Self); auto EventImpl = detail::getSyclObjImpl(ResEvent); - MemOpFunc(MemOpArgs..., getPIEvents(ExpandedDepEvents), - &EventImpl->getHandleRef(), EventImpl); + { + NestedCallsTracker tracker; + MemOpFunc(MemOpArgs..., getPIEvents(ExpandedDepEvents), + &EventImpl->getHandleRef(), EventImpl); + } if (MContext->is_host()) return MDiscardEvents ? createDiscardedEvent() : event(); diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index dcb965c8a52fa..82334e6467dfd 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -849,53 +849,7 @@ class queue_impl { const std::shared_ptr &PrimaryQueue, const std::shared_ptr &SecondaryQueue, const detail::code_location &Loc, - const SubmitPostProcessF *PostProcess) { - // Flag used to detect nested calls to submit and report an error. - thread_local static bool PreventSubmit = false; - - if (PreventSubmit) { - throw sycl::exception( - make_error_code(errc::invalid), - "Calls to sycl::queue::submit cannot be nested. Command group " - "function objects should use the sycl::handler API instead."); - } - - handler Handler(Self, PrimaryQueue, SecondaryQueue, MHostQueue); - Handler.saveCodeLoc(Loc); - PreventSubmit = true; - try { - CGF(Handler); - } catch (...) { - PreventSubmit = false; - throw; - } - PreventSubmit = false; - - // Scheduler will later omit events, that are not required to execute tasks. - // Host and interop tasks, however, are not submitted to low-level runtimes - // and require separate dependency management. - const CG::CGTYPE Type = Handler.getType(); - event Event = detail::createSyclObjFromImpl( - std::make_shared()); - - if (PostProcess) { - bool IsKernel = Type == CG::Kernel; - bool KernelUsesAssert = false; - - if (IsKernel) - // Kernel only uses assert if it's non interop one - KernelUsesAssert = !(Handler.MKernel && Handler.MKernel->isInterop()) && - ProgramManager::getInstance().kernelUsesAssert( - Handler.MKernelName.c_str()); - finalizeHandler(Handler, Event); - - (*PostProcess)(IsKernel, KernelUsesAssert, Event); - } else - finalizeHandler(Handler, Event); - - addEvent(Event); - return Event; - } + const SubmitPostProcessF *PostProcess); /// Helper function for submitting a memory operation with a handler. /// \param Self is a shared_ptr to this queue. diff --git a/sycl/source/detail/reduction.cpp b/sycl/source/detail/reduction.cpp index d2b858aabdf74..6fd170746d28d 100644 --- a/sycl/source/detail/reduction.cpp +++ b/sycl/source/detail/reduction.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include +#include #include #include @@ -165,6 +166,17 @@ __SYCL_EXPORT size_t reduGetPreferredWGSize(std::shared_ptr &Queue, return reduGetMaxWGSize(Queue, LocalMemBytesPerWorkItem); } +__SYCL_EXPORT void +addCounterInit(handler &CGH, std::shared_ptr &Queue, + std::shared_ptr &Counter) { + auto EventImpl = std::make_shared(Queue); + EventImpl->setContextImpl(detail::getSyclObjImpl(Queue->get_context())); + EventImpl->setStateIncomplete(); + MemoryManager::fill_usm(Counter.get(), Queue, sizeof(int), 0, {}, + &EventImpl->getHandleRef(), EventImpl); + CGH.depends_on(createSyclObjFromImpl(EventImpl)); +} + } // namespace detail } // namespace _V1 } // namespace sycl diff --git a/sycl/test-e2e/Basic/nested_queue_submit.cpp b/sycl/test-e2e/Basic/nested_queue_submit.cpp index da74c357a27a6..dc47c5c7a89ed 100644 --- a/sycl/test-e2e/Basic/nested_queue_submit.cpp +++ b/sycl/test-e2e/Basic/nested_queue_submit.cpp @@ -1,14 +1,22 @@ // RUN: %{build} -I . -o %t.out // RUN: %{run} %t.out -#include +#include #include +#include #include -void nestedSubmit() { +void checkExceptionFields(const sycl::exception &e) { + assert(e.code() == sycl::errc::invalid && "Invalid error code"); + assert(std::string(e.what()) == + "Calls to sycl::queue::submit cannot be nested. Command group " + "function objects should use the sycl::handler API instead." && + "Invalid e.what() string"); +} + +void nestedSubmitParallelFor(sycl::queue &q) { uint32_t n = 1024; std::vector array(n); - sycl::queue q{}; { sycl::buffer buf(array.data(), sycl::range<1>{n}); q.submit([&](sycl::handler &h) { @@ -19,16 +27,41 @@ void nestedSubmit() { } } -int main() { +void nestedSubmitMemset(sycl::queue &q) { + uint32_t n = 1024; + int *data = sycl::malloc_device(n, q); + try { + q.submit([&](sycl::handler &h) { q.memset(data, 0, n * sizeof(int)); }); + } catch (...) { + sycl::free(data, q); + throw; + } + sycl::free(data, q); +} + +template +void test(sycl::queue &Queue, CommandSubmitterT QueueSubmit) { + bool ExceptionHappened = false; try { - nestedSubmit(); + QueueSubmit(Queue); } catch (const sycl::exception &e) { - assert(e.code() == sycl::errc::invalid && "Invalid error code"); - assert(std::string(e.what()) == - "Calls to sycl::queue::submit cannot be nested. Command group " - "function objects should use the sycl::handler API instead." && - "Invalid e.what() string"); + checkExceptionFields(e); + ExceptionHappened = true; } - std::cout << "test passed" << std::endl; + assert(ExceptionHappened); + // Checks that queue is in a valid state: nested call tracker was cleaned up + // after exception and does not prevent from submission of new commands. + Queue.submit([&](sycl::handler &h) {}); + Queue.wait(); +} + +int main() { + sycl::queue q{}; + test(q, nestedSubmitParallelFor); + // All shortcut functions has a common part where nested call detection + // happens. Testing only one of them is enough. + if (q.get_device().get_info()) + test(q, nestedSubmitMemset); + return EXIT_SUCCESS; } diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index 488da0e6c272f..0edaaa25b4ba1 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3314,6 +3314,7 @@ _ZN4sycl3_V16detail13lgamma_r_implEfPi _ZN4sycl3_V16detail13make_platformEmNS0_7backendE _ZN4sycl3_V16detail13select_deviceERKSt8functionIFiRKNS0_6deviceEEE _ZN4sycl3_V16detail13select_deviceERKSt8functionIFiRKNS0_6deviceEEERKNS0_7contextE +_ZN4sycl3_V16detail14addCounterInitERNS0_7handlerERSt10shared_ptrINS1_10queue_implEERS4_IiE _ZN4sycl3_V16detail14getBorderColorENS0_19image_channel_orderE _ZN4sycl3_V16detail14tls_code_loc_t5queryEv _ZN4sycl3_V16detail14tls_code_loc_tC1ERKNS1_13code_locationE diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index 97d9b2c334d08..81b2807d7b024 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -453,6 +453,12 @@ ??$import_external_semaphore@Uexternal_semaphore_fd@experimental@oneapi@ext@_V1@sycl@@@experimental@oneapi@ext@_V1@sycl@@YA?AUinterop_semaphore_handle@01234@U?$external_semaphore_descriptor@Uexternal_semaphore_fd@experimental@oneapi@ext@_V1@sycl@@@01234@AEBVqueue@34@@Z ??$import_external_semaphore@Uresource_fd@experimental@oneapi@ext@_V1@sycl@@@experimental@oneapi@ext@_V1@sycl@@YA?AUinterop_semaphore_handle@01234@U?$external_semaphore_descriptor@Uresource_fd@experimental@oneapi@ext@_V1@sycl@@@01234@AEBVdevice@34@AEBVcontext@34@@Z ??$import_external_semaphore@Uresource_fd@experimental@oneapi@ext@_V1@sycl@@@experimental@oneapi@ext@_V1@sycl@@YA?AUinterop_semaphore_handle@01234@U?$external_semaphore_descriptor@Uresource_fd@experimental@oneapi@ext@_V1@sycl@@@01234@AEBVqueue@34@@Z +??$update_nd_range@$00@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$nd_range@$00@45@@Z +??$update_nd_range@$01@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$nd_range@$01@45@@Z +??$update_nd_range@$02@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$nd_range@$02@45@@Z +??$update_range@$00@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$range@$00@45@@Z +??$update_range@$01@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$range@$01@45@@Z +??$update_range@$02@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$range@$02@45@@Z ??0AccessorBaseHost@detail@_V1@sycl@@IEAA@AEBV?$shared_ptr@VAccessorImplHost@detail@_V1@sycl@@@std@@@Z ??0AccessorBaseHost@detail@_V1@sycl@@QEAA@$$QEAV0123@@Z ??0AccessorBaseHost@detail@_V1@sycl@@QEAA@AEBV0123@@Z @@ -3932,6 +3938,7 @@ ?add@device_global_map@detail@_V1@sycl@@YAXPEBXPEBD@Z ?add@host_pipe_map@detail@_V1@sycl@@YAXPEBXPEBD@Z ?add@modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@QEAA?AVnode@34567@AEBVproperty_list@67@@Z +?addCounterInit@detail@_V1@sycl@@YAXAEAVhandler@23@AEAV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@AEAV?$shared_ptr@H@6@@Z ?addGraphLeafDependencies@modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@IEAAXVnode@34567@@Z ?addHostAccessorAndWait@detail@_V1@sycl@@YAXPEAVAccessorImplHost@123@@Z ?addHostSampledImageAccessorAndWait@detail@_V1@sycl@@YAXPEAVSampledImageAccessorImplHost@123@@Z @@ -4355,12 +4362,6 @@ ?get_mip_level_mem_handle@image_mem@experimental@oneapi@ext@_V1@sycl@@QEBA?AUimage_mem_handle@23456@I@Z ?get_name@kernel_id@_V1@sycl@@QEBAPEBDXZ ?get_node_from_event@node@experimental@oneapi@ext@_V1@sycl@@SA?AV123456@Vevent@56@@Z -??$update_nd_range@$00@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$nd_range@$00@45@@Z -??$update_nd_range@$01@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$nd_range@$01@45@@Z -??$update_nd_range@$02@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$nd_range@$02@45@@Z -??$update_range@$00@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$range@$00@45@@Z -??$update_range@$01@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$range@$01@45@@Z -??$update_range@$02@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$range@$02@45@@Z ?get_nodes@modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@QEBA?AV?$vector@Vnode@experimental@oneapi@ext@_V1@sycl@@V?$allocator@Vnode@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ ?get_num_channels@image_mem@experimental@oneapi@ext@_V1@sycl@@QEBAIXZ ?get_pipe_name@pipe_base@experimental@intel@ext@_V1@sycl@@KA?AV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@PEBX@Z