diff --git a/sycl/include/sycl/detail/cg.hpp b/sycl/include/sycl/detail/cg.hpp index 44594a68f3515..47b5a50067db7 100644 --- a/sycl/include/sycl/detail/cg.hpp +++ b/sycl/include/sycl/detail/cg.hpp @@ -497,6 +497,17 @@ class CGCopyFromDeviceGlobal : public CG { size_t getOffset() { return MOffset; } }; +/// "Execute command-buffer" command group class. +class CGExecCommandBuffer : public CG { +public: + sycl::detail::pi::PiExtCommandBuffer MCommandBuffer; + + CGExecCommandBuffer(sycl::detail::pi::PiExtCommandBuffer CommandBuffer, + CG::StorageInitHelper CGData) + : CG(CGTYPE::ExecCommandBuffer, std::move(CGData)), + MCommandBuffer(CommandBuffer) {} +}; + } // namespace detail } // __SYCL_INLINE_VER_NAMESPACE(_V1) } // namespace sycl diff --git a/sycl/include/sycl/detail/pi.h b/sycl/include/sycl/detail/pi.h index ad3df395a37e9..a06788427d2c6 100644 --- a/sycl/include/sycl/detail/pi.h +++ b/sycl/include/sycl/detail/pi.h @@ -2284,7 +2284,7 @@ __SYCL_EXPORT pi_result piextCommandBufferMemBufferCopyRect( /// \param sync_point The sync_point associated with this memory operation. 
__SYCL_EXPORT pi_result piextCommandBufferMemBufferRead( pi_ext_command_buffer command_buffer, pi_mem buffer, size_t offset, - size_t size, void *dst, pi_uint32 num_events_in_wait_list, + size_t size, void *dst, pi_uint32 num_sync_points_in_wait_list, const pi_ext_sync_point *sync_point_wait_list, pi_ext_sync_point *sync_point); diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp index 3f5ab5a17b690..aecd21f28c060 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp @@ -221,8 +221,6 @@ class __SYCL_EXPORT executable_command_graph { int MTag; std::shared_ptr impl; - - friend class modifiable_command_graph; }; } // namespace detail @@ -248,7 +246,10 @@ class command_graph : public detail::modifiable_command_graph { template <> class command_graph : public detail::executable_command_graph { -private: + +protected: + friend command_graph + detail::modifiable_command_graph::finalize(const sycl::property_list &) const; using detail::executable_command_graph::executable_command_graph; }; diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 615776f7c8238..10d20c557c68a 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -1542,6 +1542,12 @@ class __SYCL_EXPORT handler { setType(detail::CG::CodeplayHostTask); } + /// @brief Get the command graph if any associated with this handler. It can + /// come from either the associated queue or from being set explicitly through + /// the appropriate constructor. 
+ std::shared_ptr + getCommandGraph() const; + public: handler(const handler &) = delete; handler(handler &&) = delete; diff --git a/sycl/include/sycl/info/ext_oneapi_device_traits.def b/sycl/include/sycl/info/ext_oneapi_device_traits.def index 528b8609e5c8a..eb0db0ffd6bd5 100644 --- a/sycl/include/sycl/info/ext_oneapi_device_traits.def +++ b/sycl/include/sycl/info/ext_oneapi_device_traits.def @@ -9,6 +9,10 @@ __SYCL_PARAM_TRAITS_TEMPLATE_SPEC(ext::oneapi::experimental,device, max_work_gro __SYCL_PARAM_TRAITS_SPEC(ext::oneapi::experimental, device, architecture, ext::oneapi::experimental::architecture, PI_EXT_ONEAPI_DEVICE_INFO_IP_VERSION) +__SYCL_PARAM_TRAITS_SPEC( + ext::oneapi::experimental, device, graph_support, + ext::oneapi::experimental::info::graph_support_level, + 0 /* No PI device code needed */) #ifdef __SYCL_PARAM_TRAITS_TEMPLATE_SPEC_NEEDS_UNDEF #undef __SYCL_PARAM_TRAITS_TEMPLATE_SPEC #undef __SYCL_PARAM_TRAITS_TEMPLATE_SPEC_NEEDS_UNDEF diff --git a/sycl/include/sycl/info/info_desc.hpp b/sycl/include/sycl/info/info_desc.hpp index 8f194aea3c849..90040767f9589 100644 --- a/sycl/include/sycl/info/info_desc.hpp +++ b/sycl/include/sycl/info/info_desc.hpp @@ -185,9 +185,14 @@ template struct compatibility_param_traits {}; } /*namespace info */ \ } /*namespace Namespace */ -namespace ext::oneapi::experimental::info::device { +namespace ext::oneapi::experimental::info { + +enum class graph_support_level { unsupported = 0, native, emulated }; + +namespace device { template struct max_work_groups; -} // namespace ext::oneapi::experimental::info::device +} // namespace device +} // namespace ext::oneapi::experimental::info #include #include #include diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 8c85ead06eba7..bb9065f352e58 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -76,6 +76,16 @@ static event submitAssertCapture(queue &, event &, queue *, #endif } // namespace detail +namespace ext { +namespace 
oneapi { +namespace experimental { +// State of a queue with regards to graph recording, +// returned by info::queue::state +enum class queue_state { executing, recording }; +} // namespace experimental +} // namespace oneapi +} // namespace ext + /// Encapsulates a single SYCL queue which schedules kernels on a SYCL device. /// /// A SYCL queue can be used to submit command groups to be executed by the SYCL @@ -283,6 +293,9 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// \return SYCL device this queue was constructed with. device get_device() const; + /// \return State the queue is currently in. + ext::oneapi::experimental::queue_state ext_oneapi_get_state() const; + /// \return true if this queue is a SYCL host queue. __SYCL2020_DEPRECATED( "is_host() is deprecated as the host device is no longer supported.") diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index b94d527297a8b..86b3473027b40 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -1118,11 +1118,11 @@ pi_result piextCommandBufferMemBufferCopyRect( pi_result piextCommandBufferMemBufferRead( pi_ext_command_buffer CommandBuffer, pi_mem Buffer, size_t Offset, - size_t Size, void *Dst, pi_uint32 NumEventsInWaitList, + size_t Size, void *Dst, pi_uint32 NumSyncPointsInWaitList, const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { - return pi2ur::piextCommandBufferMemBufferRead(CommandBuffer, Buffer, Offset, - Size, Dst, NumEventsInWaitList, - SyncPointWaitList, SyncPoint); + return pi2ur::piextCommandBufferMemBufferRead( + CommandBuffer, Buffer, Offset, Size, Dst, NumSyncPointsInWaitList, + SyncPointWaitList, SyncPoint); } pi_result piextCommandBufferMemBufferReadRect( @@ -1130,21 +1130,21 @@ pi_result piextCommandBufferMemBufferReadRect( pi_buff_rect_offset BufferOffset, pi_buff_rect_offset HostOffset, pi_buff_rect_region Region, size_t BufferRowPitch, size_t 
BufferSlicePitch, size_t HostRowPitch, size_t HostSlicePitch, void *Ptr, - pi_uint32 NumEventsInWaitList, const pi_ext_sync_point *SyncPointWaitList, - pi_ext_sync_point *SyncPoint) { + pi_uint32 NumSyncPointsInWaitList, + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { return pi2ur::piextCommandBufferMemBufferReadRect( CommandBuffer, Buffer, BufferOffset, HostOffset, Region, BufferRowPitch, - BufferSlicePitch, HostRowPitch, HostSlicePitch, Ptr, NumEventsInWaitList, - SyncPointWaitList, SyncPoint); + BufferSlicePitch, HostRowPitch, HostSlicePitch, Ptr, + NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); } pi_result piextCommandBufferMemBufferWrite( pi_ext_command_buffer CommandBuffer, pi_mem Buffer, size_t Offset, - size_t Size, const void *Ptr, pi_uint32 NumEventsInWaitList, + size_t Size, const void *Ptr, pi_uint32 NumSyncPointsInWaitList, const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { - return pi2ur::piextCommandBufferMemBufferWrite(CommandBuffer, Buffer, Offset, - Size, Ptr, NumEventsInWaitList, - SyncPointWaitList, SyncPoint); + return pi2ur::piextCommandBufferMemBufferWrite( + CommandBuffer, Buffer, Offset, Size, Ptr, NumSyncPointsInWaitList, + SyncPointWaitList, SyncPoint); } pi_result piextCommandBufferMemBufferWriteRect( @@ -1152,12 +1152,12 @@ pi_result piextCommandBufferMemBufferWriteRect( pi_buff_rect_offset BufferOffset, pi_buff_rect_offset HostOffset, pi_buff_rect_region Region, size_t BufferRowPitch, size_t BufferSlicePitch, size_t HostRowPitch, size_t HostSlicePitch, const void *Ptr, - pi_uint32 NumEventsInWaitList, const pi_ext_sync_point *SyncPointWaitList, - pi_ext_sync_point *SyncPoint) { + pi_uint32 NumSyncPointsInWaitList, + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { return pi2ur::piextCommandBufferMemBufferWriteRect( CommandBuffer, Buffer, BufferOffset, HostOffset, Region, BufferRowPitch, - BufferSlicePitch, HostRowPitch, HostSlicePitch, Ptr, 
NumEventsInWaitList, - SyncPointWaitList, SyncPoint); + BufferSlicePitch, HostRowPitch, HostSlicePitch, Ptr, + NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); } pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer CommandBuffer, diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp index 7dd6903ffc1ef..d4543f589afe7 100644 --- a/sycl/plugins/opencl/pi_opencl.cpp +++ b/sycl/plugins/opencl/pi_opencl.cpp @@ -2726,6 +2726,12 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piextCommandBufferMemBufferCopy, piextCommandBufferMemBufferCopy) _PI_CL(piextCommandBufferMemBufferCopyRect, piextCommandBufferMemBufferCopyRect) + _PI_CL(piextCommandBufferMemBufferRead, piextCommandBufferMemBufferRead) + _PI_CL(piextCommandBufferMemBufferReadRect, + piextCommandBufferMemBufferReadRect) + _PI_CL(piextCommandBufferMemBufferWrite, piextCommandBufferMemBufferWrite) + _PI_CL(piextCommandBufferMemBufferWriteRect, + piextCommandBufferMemBufferWriteRect) _PI_CL(piextEnqueueCommandBuffer, piextEnqueueCommandBuffer) _PI_CL(piextKernelSetArgMemObj, piextKernelSetArgMemObj) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/command_buffer.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/command_buffer.cpp index 34740fee92001..11aa79e5e57ae 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/command_buffer.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/command_buffer.cpp @@ -535,9 +535,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMembufferCopyExp( uint32_t NumSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, ur_exp_command_buffer_sync_point_t *SyncPoint) { - (void)SrcOffset; - (void)DstOffset; - auto SrcBuffer = ur_cast(SrcMem); auto DstBuffer = ur_cast(DstMem); @@ -553,8 +550,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMembufferCopyExp( CommandBuffer->Device)); return enqueueCommandBufferMemCopyHelper( - 
UR_COMMAND_MEM_BUFFER_COPY, CommandBuffer, ZeHandleDst, ZeHandleSrc, Size, - NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); + UR_COMMAND_MEM_BUFFER_COPY, CommandBuffer, ZeHandleDst + DstOffset, + ZeHandleSrc + SrcOffset, Size, NumSyncPointsInWaitList, SyncPointWaitList, + SyncPoint); } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMembufferCopyRectExp( diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/device.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/device.cpp index 18b1e3b4a3ee1..661ec243fdbf6 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/device.cpp @@ -186,6 +186,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( (Device->ZeDeviceProperties->deviceId & 0xff0) == 0xbd0) SupportedExtensions += ("cl_intel_bfloat16_conversions "); + // Return supported for the UR command-buffer experimental feature + SupportedExtensions += ("ur_exp_command_buffer "); + return ReturnValue(SupportedExtensions.c_str()); } case UR_DEVICE_INFO_NAME: diff --git a/sycl/source/detail/device_info.hpp b/sycl/source/detail/device_info.hpp index 8ff4294b5febc..9f2fbce2da483 100644 --- a/sycl/source/detail/device_info.hpp +++ b/sycl/source/detail/device_info.hpp @@ -884,6 +884,35 @@ struct get_device_info_impl< } }; +// Specialization for graph extension support +template <> +struct get_device_info_impl< + ext::oneapi::experimental::info::graph_support_level, + ext::oneapi::experimental::info::device::graph_support> { + static ext::oneapi::experimental::info::graph_support_level + get(const DeviceImplPtr &Dev) { + size_t ResultSize = 0; + Dev->getPlugin()->call( + Dev->getHandleRef(), PI_DEVICE_INFO_EXTENSIONS, 0, nullptr, + &ResultSize); + if (ResultSize == 0) + return ext::oneapi::experimental::info::graph_support_level::unsupported; + + std::unique_ptr Result(new char[ResultSize]); + Dev->getPlugin()->call( + Dev->getHandleRef(), 
PI_DEVICE_INFO_EXTENSIONS, ResultSize, + Result.get(), nullptr); + + std::string_view ExtensionsString(Result.get()); + bool CmdBufferSupport = + ExtensionsString.find("ur_exp_command_buffer") != std::string::npos; + return CmdBufferSupport + ? ext::oneapi::experimental::info::graph_support_level::native + : ext::oneapi::experimental::info::graph_support_level:: + unsupported; + } +}; + template typename Param::return_type get_device_info(const DeviceImplPtr &Dev) { static_assert(is_device_info_desc::value, @@ -1778,6 +1807,13 @@ inline uint32_t get_device_info_host< PI_ERROR_INVALID_DEVICE); } +template <> +inline ext::oneapi::experimental::info::graph_support_level +get_device_info_host() { + // No support for graphs on the host device. + return ext::oneapi::experimental::info::graph_support_level::unsupported; +} + } // namespace detail } // __SYCL_INLINE_VER_NAMESPACE(_V1) } // namespace sycl diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp index bd19c53c5c161..22e0cfb083889 100644 --- a/sycl/source/detail/event_impl.cpp +++ b/sycl/source/detail/event_impl.cpp @@ -226,6 +226,12 @@ void event_impl::wait(std::shared_ptr Self) { throw sycl::exception(make_error_code(errc::invalid), "wait method cannot be used for a discarded event."); + if (MGraph.lock()) { + throw sycl::exception(make_error_code(errc::invalid), + "wait method cannot be used for an event associated " + "with a command graph."); + } + #ifdef XPTI_ENABLE_INSTRUMENTATION void *TelemetryEvent = nullptr; uint64_t IId; diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp index f5aad5cf53d2e..660f6bdf1bb9d 100644 --- a/sycl/source/detail/event_impl.hpp +++ b/sycl/source/detail/event_impl.hpp @@ -23,6 +23,9 @@ namespace sycl { __SYCL_INLINE_VER_NAMESPACE(_V1) { +namespace ext::oneapi::experimental::detail { +class graph_impl; +} class context; namespace detail { class plugin; @@ -256,6 +259,25 @@ class event_impl { return MContext; } + // 
Sets a sync point which is used when this event represents an enqueue to a + // Command Bufferr. + void setSyncPoint(sycl::detail::pi::PiExtSyncPoint SyncPoint) { + MSyncPoint = SyncPoint; + } + + // Get the sync point associated with this event. + sycl::detail::pi::PiExtSyncPoint getSyncPoint() const { return MSyncPoint; } + + void setCommandGraph( + std::shared_ptr Graph) { + MGraph = Graph; + } + + std::shared_ptr + getCommandGraph() const { + return MGraph.lock(); + } + protected: // When instrumentation is enabled emits trace event for event wait begin and // returns the telemetry event generated for the wait @@ -302,6 +324,15 @@ class event_impl { std::mutex MMutex; std::condition_variable cv; + /// Store the command graph associated with this event, if any. + /// This event is also be stored in the graph so a weak_ptr is used. + std::weak_ptr MGraph; + + // If this event represents a submission to a + // sycl::detail::pi::PiExtCommandBuffer the sync point for that submission is + // stored here. + sycl::detail::pi::PiExtSyncPoint MSyncPoint; + friend std::vector getOrWaitEvents(std::vector DepEvents, std::shared_ptr Context); diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index 10a762a9ddccc..a6eb5d7eaba6f 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -15,6 +15,10 @@ #include #include +// Developer switch to use emulation mode on all backends, even those that +// report native support, this is useful for debugging. +#define FORCE_EMULATION_MODE 0 + namespace sycl { __SYCL_INLINE_VER_NAMESPACE(_V1) { @@ -44,24 +48,24 @@ void connectToExitNodes( } } -/// Recursive check if a graph node or its successors contains a given kernel -/// argument. -/// @param[in] Arg The kernel argument to check for. +/// Recursive check if a graph node or its successors contains a given +/// requirement. +/// @param[in] Req The requirement to check for. 
/// @param[in] CurrentNode The current graph node being checked. /// @param[in,out] Deps The unique list of dependencies which have been -/// identified for this arg. +/// identified for this requirement. /// @return True if a dependency was added in this node or any of its /// successors. -bool checkForArg(const sycl::detail::ArgDesc &Arg, - const std::shared_ptr &CurrentNode, - std::set> &Deps) { +bool checkForRequirement(sycl::detail::AccessorImplHost *Req, + const std::shared_ptr &CurrentNode, + std::set> &Deps) { bool SuccessorAddedDep = false; for (auto &Successor : CurrentNode->MSuccessors) { - SuccessorAddedDep |= checkForArg(Arg, Successor, Deps); + SuccessorAddedDep |= checkForRequirement(Req, Successor, Deps); } if (!CurrentNode->isEmpty() && Deps.find(CurrentNode) == Deps.end() && - CurrentNode->hasArg(Arg) && !SuccessorAddedDep) { + CurrentNode->hasRequirement(Req) && !SuccessorAddedDep) { Deps.insert(CurrentNode); return true; } @@ -142,48 +146,60 @@ graph_impl::add(const std::shared_ptr &Impl, if (Handler.MSubgraphNode) { return Handler.MSubgraphNode; } - if (Handler.MCGType == sycl::detail::CG::None) { - return this->add(Dep); - } return this->add(Handler.MCGType, std::move(Handler.MGraphNodeCG), Dep); } +std::shared_ptr +graph_impl::add(const std::vector Events) { + + std::vector> Deps; + + // Add any nodes specified by event dependencies into the dependency list + for (auto Dep : Events) { + if (auto NodeImpl = MEventsMap.find(Dep); NodeImpl != MEventsMap.end()) { + Deps.push_back(NodeImpl->second); + } else { + throw sycl::exception(sycl::make_error_code(errc::invalid), + "Event dependency from handler::depends_on does " + "not correspond to a node within the graph"); + } + } + + return this->add(Deps); +} + std::shared_ptr graph_impl::add(sycl::detail::CG::CGTYPE CGType, std::unique_ptr CommandGroup, const std::vector> &Dep) { // Copy deps so we can modify them auto Deps = Dep; - if (CGType == sycl::detail::CG::Kernel) { - // A unique set of 
dependencies obtained by checking kernel arguments - // for accessors - std::set> UniqueDeps; - const auto &Args = - static_cast(CommandGroup.get())->MArgs; - for (auto &Arg : Args) { - if (Arg.MType != sycl::detail::kernel_param_kind_t::kind_accessor) { - continue; - } - // Look through the graph for nodes which share this argument - for (auto NodePtr : MRoots) { - checkForArg(Arg, NodePtr, UniqueDeps); - } - } - // Add any deps determined from accessor arguments into the dependency list - Deps.insert(Deps.end(), UniqueDeps.begin(), UniqueDeps.end()); + // A unique set of dependencies obtained by checking requirements and events + std::set> UniqueDeps; + const auto &Requirements = CommandGroup->getRequirements(); + for (auto &Req : Requirements) { + // Look through the graph for nodes which share this requirement + for (auto NodePtr : MRoots) { + checkForRequirement(Req, NodePtr, UniqueDeps); + } } // Add any nodes specified by event dependencies into the dependency list for (auto Dep : CommandGroup->getEvents()) { if (auto NodeImpl = MEventsMap.find(Dep); NodeImpl != MEventsMap.end()) { - Deps.push_back(NodeImpl->second); + if (UniqueDeps.find(NodeImpl->second) == UniqueDeps.end()) { + UniqueDeps.insert(NodeImpl->second); + } } else { throw sycl::exception(sycl::make_error_code(errc::invalid), "Event dependency from handler::depends_on does " "not correspond to a node within the graph"); } } + // Add any deps determined from requirements and events into the dependency + // list + Deps.insert(Deps.end(), UniqueDeps.begin(), UniqueDeps.end()); const std::shared_ptr &NodeImpl = std::make_shared(CGType, std::move(CommandGroup)); @@ -210,11 +226,148 @@ bool graph_impl::clearQueues() { return AnyQueuesCleared; } -exec_graph_impl::~exec_graph_impl() { MSchedule.clear(); } +// Check if nodes are empty and if so loop back through predecessors until we +// find the real dependency. 
+void exec_graph_impl::findRealDeps( + std::vector &Deps, + std::shared_ptr CurrentNode) { + if (CurrentNode->isEmpty()) { + for (auto &N : CurrentNode->MPredecessors) { + auto NodeImpl = N.lock(); + findRealDeps(Deps, NodeImpl); + } + } else { + // Verify that the sync point has actually been set for this node. + auto SyncPoint = MPiSyncPoints.find(CurrentNode); + assert(SyncPoint != MPiSyncPoints.end() && + "No sync point has been set for node dependency."); + // Check if the dependency has already been added. + if (std::find(Deps.begin(), Deps.end(), SyncPoint->second) == Deps.end()) { + Deps.push_back(SyncPoint->second); + } + } +} + +sycl::detail::pi::PiExtSyncPoint exec_graph_impl::enqueueNodeDirect( + sycl::context Ctx, sycl::detail::DeviceImplPtr DeviceImpl, + sycl::detail::pi::PiExtCommandBuffer CommandBuffer, + std::shared_ptr Node) { + std::vector Deps; + for (auto &N : Node->MPredecessors) { + findRealDeps(Deps, N.lock()); + } + sycl::detail::pi::PiExtSyncPoint NewSyncPoint; + pi_int32 Res = sycl::detail::enqueueImpCommandBufferKernel( + Ctx, DeviceImpl, CommandBuffer, + *static_cast((Node->MCommandGroup.get())), + Deps, &NewSyncPoint, nullptr); + + if (Res != pi_result::PI_SUCCESS) { + throw sycl::exception(errc::invalid, + "Failed to add kernel to PI command-buffer"); + } + + return NewSyncPoint; +} + +sycl::detail::pi::PiExtSyncPoint exec_graph_impl::enqueueNode( + sycl::context Ctx, std::shared_ptr DeviceImpl, + sycl::detail::pi::PiExtCommandBuffer CommandBuffer, + std::shared_ptr Node) { + + // Queue which will be used for allocation operations for accessors. 
+ auto AllocaQueue = std::make_shared( + DeviceImpl, sycl::detail::getSyclObjImpl(Ctx), sycl::async_handler{}, + sycl::property_list{}); + + std::vector Deps; + for (auto &N : Node->MPredecessors) { + findRealDeps(Deps, N.lock()); + } + + sycl::detail::EventImplPtr Event = + sycl::detail::Scheduler::getInstance().addCG( + Node->getCGCopy(), AllocaQueue, CommandBuffer, Deps); + + return Event->getSyncPoint(); +} +void exec_graph_impl::createCommandBuffers(sycl::device Device) { + // TODO we only have a single command-buffer per graph here, but + // this will need to be multiple command-buffers for non-trivial graphs + sycl::detail::pi::PiExtCommandBuffer OutCommandBuffer; + sycl::detail::pi::PiExtCommandBufferDesc Desc{}; + auto ContextImpl = sycl::detail::getSyclObjImpl(MContext); + const sycl::detail::PluginPtr &Plugin = ContextImpl->getPlugin(); + auto DeviceImpl = sycl::detail::getSyclObjImpl(Device); + pi_result Res = + Plugin->call_nocheck( + ContextImpl->getHandleRef(), DeviceImpl->getHandleRef(), &Desc, + &OutCommandBuffer); + if (Res != pi_result::PI_SUCCESS) { + throw sycl::exception(errc::invalid, "Failed to create PI command-buffer"); + } + + MPiCommandBuffers[Device] = OutCommandBuffer; + + // TODO extract kernel bundle logic from enqueueImpKernel + for (auto Node : MSchedule) { + sycl::detail::CG::CGTYPE type = Node->MCGType; + // If the node is a kernel with no special requirements we can enqueue it + // directly. 
+ if (type == sycl::detail::CG::Kernel && + Node->MCommandGroup->getRequirements().size() + + static_cast( + Node->MCommandGroup.get()) + ->MStreams.size() == + 0) { + MPiSyncPoints[Node] = + enqueueNodeDirect(MContext, DeviceImpl, OutCommandBuffer, Node); + } else { + MPiSyncPoints[Node] = + enqueueNode(MContext, DeviceImpl, OutCommandBuffer, Node); + } + + // Append Node requirements to overall graph requirements + MRequirements.insert(MRequirements.end(), + Node->MCommandGroup->getRequirements().begin(), + Node->MCommandGroup->getRequirements().end()); + } + + Res = + Plugin->call_nocheck( + OutCommandBuffer); + if (Res != pi_result::PI_SUCCESS) { + throw sycl::exception(errc::invalid, + "Failed to finalize PI command-buffer"); + } +} + +exec_graph_impl::~exec_graph_impl() { + // clear all recording queue if not done before (no call to end_recording) + MGraphImpl->clearQueues(); + + const sycl::detail::PluginPtr &Plugin = + sycl::detail::getSyclObjImpl(MContext)->getPlugin(); + MSchedule.clear(); + // We need to wait on all command buffer executions before we can release + // them. 
+ for (auto &Event : MExecutionEvents) { + Event->wait(Event); + } -sycl::event exec_graph_impl::enqueue( - const std::shared_ptr &Queue) { - std::vector RawEvents; + for (auto Iter : MPiCommandBuffers) { + if (auto CmdBuf = Iter.second; CmdBuf) { + pi_result Res = Plugin->call_nocheck< + sycl::detail::PiApiKind::piextCommandBufferRelease>(CmdBuf); + (void)Res; + assert(Res == pi_result::PI_SUCCESS); + } + } +} + +sycl::event +exec_graph_impl::enqueue(const std::shared_ptr &Queue, + sycl::detail::CG::StorageInitHelper CGData) { auto CreateNewEvent([&]() { auto NewEvent = std::make_shared(Queue); NewEvent->setContextImpl(Queue->getContextImplPtr()); @@ -222,9 +375,38 @@ sycl::event exec_graph_impl::enqueue( return NewEvent; }); + auto CommandBuffer = MPiCommandBuffers[Queue->get_device()]; sycl::detail::EventImplPtr NewEvent; - { + if (CommandBuffer) { + NewEvent = CreateNewEvent(); + sycl::detail::pi::PiEvent *OutEvent = &NewEvent->getHandleRef(); + // Merge requirements from the nodes into requirements (if any) from the + // handler. 
+ CGData.MRequirements.insert(CGData.MRequirements.end(), + MRequirements.begin(), MRequirements.end()); + // If we have no requirements or dependent events for the command buffer, + // enqueue it directly + if (CGData.MRequirements.empty() && CGData.MEvents.empty()) { + pi_result Res = + Queue->getPlugin() + ->call_nocheck< + sycl::detail::PiApiKind::piextEnqueueCommandBuffer>( + CommandBuffer, Queue->getHandleRef(), 0, nullptr, OutEvent); + if (Res != pi_result::PI_SUCCESS) { + throw sycl::exception( + errc::event, + "Failed to enqueue event for command buffer submission"); + } + } else { + std::unique_ptr CommandGroup = + std::make_unique( + CommandBuffer, std::move(CGData)); + + NewEvent = sycl::detail::Scheduler::getInstance().addCG( + std::move(CommandGroup), Queue); + } + } else { std::vector> ScheduledEvents; for (auto &NodeImpl : MSchedule) { std::vector RawEvents; @@ -271,6 +453,9 @@ sycl::event exec_graph_impl::enqueue( NewEvent->getPreparedDepsEvents() = ScheduledEvents; } + // Keep track of this execution event so we can make sure it's completed in + // the destructor. 
+ MExecutionEvents.push_back(NewEvent); sycl::event QueueEvent = sycl::detail::createSyclObjFromImpl(NewEvent); return QueueEvent; @@ -322,6 +507,18 @@ modifiable_command_graph::finalize(const sycl::property_list &) const { bool modifiable_command_graph::begin_recording(queue &RecordingQueue) { auto QueueImpl = sycl::detail::getSyclObjImpl(RecordingQueue); + + if (QueueImpl->get_context() != impl->getContext()) { + throw sycl::exception(sycl::make_error_code(errc::invalid), + "begin_recording called for a queue whose context " + "differs from the graph context."); + } + if (QueueImpl->get_device() != impl->getDevice()) { + throw sycl::exception(sycl::make_error_code(errc::invalid), + "begin_recording called for a queue whose device " + "differs from the graph device."); + } + if (QueueImpl->getCommandGraph() == nullptr) { QueueImpl->setCommandGraph(impl); impl->addQueue(QueueImpl); @@ -332,7 +529,6 @@ bool modifiable_command_graph::begin_recording(queue &RecordingQueue) { "begin_recording called for a queue which is already " "recording to a different graph."); } - // Queue was already recording to this graph. return false; } @@ -384,6 +580,24 @@ executable_command_graph::executable_command_graph( void executable_command_graph::finalizeImpl() { // Create PI command-buffers for each device in the finalized context impl->schedule(); + + auto Context = impl->getContext(); + for (auto Device : Context.get_devices()) { + bool CmdBufSupport = + Device.get_info< + ext::oneapi::experimental::info::device::graph_support>() == + info::graph_support_level::native; + +#if FORCE_EMULATION_MODE + // Above query should still succeed in emulation mode, but ignore the + // result and use emulation. 
+ CmdBufSupport = false; +#endif + + if (CmdBufSupport) { + impl->createCommandBuffers(Device); + } + } } void executable_command_graph::update( diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index ae6c5ecdfaa46..51c59db9a71c9 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -13,6 +13,7 @@ #include #include +#include #include #include @@ -42,9 +43,6 @@ class node_impl { /// Command group object which stores all args etc needed to enqueue the node std::unique_ptr MCommandGroup; - /// True if an empty node, false otherwise. - bool MIsEmpty = false; - /// Add successor to the node. /// @param Node Node to add as a successor. /// @param Prev Predecessor to \p node being added as successor. @@ -64,7 +62,7 @@ class node_impl { } /// Construct an empty node. - node_impl() : MIsEmpty(true) {} + node_impl() {} /// Construct a node representing a command-group. /// @param CGType Type of the command-group. @@ -90,23 +88,14 @@ class node_impl { Schedule.push_front(NodeImpl); } - /// Checks if this node has an argument. - /// @param Arg Argument to lookup. - /// @return True if \p Arg is used in node, false otherwise. - bool hasArg(const sycl::detail::ArgDesc &Arg) { - // TODO: Handle types other than exec kernel - assert(MCGType == sycl::detail::CG::Kernel); - const auto &Args = - static_cast(MCommandGroup.get())->MArgs; - for (auto &NodeArg : Args) { - if (Arg.MType == NodeArg.MType && Arg.MSize == NodeArg.MSize) { - // Args are actually void** so we need to dereference them to compare - // actual values - void *IncomingPtr = *static_cast(Arg.MPtr); - void *ArgPtr = *static_cast(NodeArg.MPtr); - if (IncomingPtr == ArgPtr) { - return true; - } + /// Checks if this node has a given requirement. + /// @param Requirement Requirement to lookup. + /// @return True if \p Requirement is present in node, false otherwise. 
+ bool hasRequirement(sycl::detail::AccessorImplHost *IncomingReq) { + for (sycl::detail::AccessorImplHost *CurrentReq : + MCommandGroup->getRequirements()) { + if (IncomingReq->MSYCLMemObj == CurrentReq->MSYCLMemObj) { + return true; } } return false; @@ -114,7 +103,7 @@ class node_impl { /// Query if this is an empty node. /// @return True if this is an empty node, false otherwise. - bool isEmpty() const { return MIsEmpty; } + bool isEmpty() const { return MCGType == sycl::detail::CG::None; } /// Get a deep copy of this node's command group /// @return A unique ptr to the new command group object. @@ -190,7 +179,7 @@ class graph_impl { /// @param SyclDevice Device to create nodes with. graph_impl(const sycl::context &SyclContext, const sycl::device &SyclDevice) : MContext(SyclContext), MDevice(SyclDevice), MRecordingQueues(), - MEventsMap() {} + MEventsMap(), MInorderQueueMap() {} /// Insert node into list of root nodes. /// @param Root Node to add to list of root nodes. @@ -228,6 +217,12 @@ class graph_impl { std::shared_ptr add(const std::vector> &Dep = {}); + /// Create an empty node in the graph. + /// @param Events List of events associated to this node. + /// @return Created node in the graph. + std::shared_ptr + add(const std::vector Events); + /// Add a queue to the set of queues which are currently recording to this /// graph. /// @param RecordingQueue Queue to add to set. @@ -276,6 +271,7 @@ class graph_impl { } /// Adds sub-graph nodes from an executable graph to this graph. + /// @param NodeList List of nodes from sub-graph in schedule order. /// @return An empty node is used to schedule dependencies on this sub-graph. std::shared_ptr addSubgraphNodes(const std::list> &NodeList); @@ -284,9 +280,35 @@ class graph_impl { /// @return Context associated with graph. sycl::context getContext() const { return MContext; } + /// Query for the device tied to this graph. + /// @return Device associated with graph. 
+ sycl::device getDevice() const { return MDevice; } + /// List of root nodes. std::set> MRoots; + /// Find the last node added to this graph from an in-order queue. + /// @param Queue In-order queue to find the last node added to the graph from. + /// @return Last node in this graph added from \p Queue recording, or empty + /// shared pointer if none. + std::shared_ptr + getLastInorderNode(std::shared_ptr Queue) { + std::weak_ptr QueueWeakPtr(Queue); + if (0 == MInorderQueueMap.count(QueueWeakPtr)) { + return {}; + } + return MInorderQueueMap[QueueWeakPtr]; + } + + /// Track the last node added to this graph from an in-order queue. + /// @param Queue In-order queue to register \p Node for. + /// @param Node Last node that was added to this graph from \p Queue. + void setLastInorderNode(std::shared_ptr Queue, + std::shared_ptr Node) { + std::weak_ptr QueueWeakPtr(Queue); + MInorderQueueMap[QueueWeakPtr] = Node; + } + private: /// Context associated with this graph. sycl::context MContext; @@ -299,6 +321,12 @@ class graph_impl { std::unordered_map, std::shared_ptr> MEventsMap; + /// Map for every in-order queue thats recorded a node to the graph, what + /// the last node added was. We can use this to create new edges on the last + /// node if any more nodes are added to the graph from the queue. + std::map, std::shared_ptr, + std::owner_less>> + MInorderQueueMap; }; /// Class representing the implementation of command_graph. @@ -309,7 +337,9 @@ class exec_graph_impl { /// @param GraphImpl Modifiable graph implementation to create with. exec_graph_impl(sycl::context Context, const std::shared_ptr &GraphImpl) - : MSchedule(), MGraphImpl(GraphImpl), MContext(Context) {} + : MSchedule(), MGraphImpl(GraphImpl), MPiCommandBuffers(), + MPiSyncPoints(), MContext(Context), MRequirements(), + MExecutionEvents() {} /// Destructor. /// @@ -322,8 +352,15 @@ class exec_graph_impl { /// Called by handler::ext_oneapi_command_graph() to schedule graph for /// execution. 
/// @param Queue Command-queue to schedule execution on. + /// @param CGData Command-group data provided by the sycl::handler /// @return Event associated with the execution of the graph. - sycl::event enqueue(const std::shared_ptr &Queue); + sycl::event enqueue(const std::shared_ptr &Queue, + sycl::detail::CG::StorageInitHelper CGData); + + /// Turns the internal graph representation into UR command-buffers for a + /// device. + /// @param Device Device to create backend command-buffers for. + void createCommandBuffers(sycl::device Device); /// Query for the context tied to this graph. /// @return Context associated with graph. @@ -336,16 +373,56 @@ class exec_graph_impl { } private: + /// Create a command-group for the node and add it to command-buffer by going + /// through the scheduler. + /// @param Ctx Context to use. + /// @param DeviceImpl Device associated with the enqueue. + /// @param CommandBuffer Command-buffer to add node to as a command. + /// @param Node The node being enqueued. + /// @return PI sync point created for this node in the command-buffer. + sycl::detail::pi::PiExtSyncPoint + enqueueNode(sycl::context Ctx, sycl::detail::DeviceImplPtr DeviceImpl, + sycl::detail::pi::PiExtCommandBuffer CommandBuffer, + std::shared_ptr Node); + + /// Enqueue a node directly to the command-buffer without going through the + /// scheduler. + /// @param Ctx Context to use. + /// @param DeviceImpl Device associated with the enqueue. + /// @param CommandBuffer Command-buffer to add node to as a command. + /// @param Node The node being enqueued. + /// @return PI sync point created for this node in the command-buffer. + sycl::detail::pi::PiExtSyncPoint + enqueueNodeDirect(sycl::context Ctx, sycl::detail::DeviceImplPtr DeviceImpl, + sycl::detail::pi::PiExtCommandBuffer CommandBuffer, + std::shared_ptr Node); + + /// Iterates back through predecessors to find the real dependency. + /// @param[out] Deps Found dependencies. 
+ /// @param[in] CurrentNode Node to find dependencies for. + void findRealDeps(std::vector &Deps, + std::shared_ptr CurrentNode); + /// Execution schedule of nodes in the graph. std::list> MSchedule; /// Pointer to the modifiable graph impl associated with this executable /// graph. std::shared_ptr MGraphImpl; + /// Map of devices to command buffers. + std::unordered_map + MPiCommandBuffers; + /// Map of nodes in the exec graph to the sync point representing their + /// execution in the command graph. + std::unordered_map, + sycl::detail::pi::PiExtSyncPoint> + MPiSyncPoints; /// Context associated with this executable graph. sycl::context MContext; /// List of requirements for enqueueing this command graph, accumulated from /// all nodes enqueued to the graph. std::vector MRequirements; + /// List of all execution events returned from command buffer enqueue calls. + std::vector MExecutionEvents; }; } // namespace detail diff --git a/sycl/source/detail/memory_manager.cpp b/sycl/source/detail/memory_manager.cpp index 0920f509456d0..b3c76220c9e1b 100644 --- a/sycl/source/detail/memory_manager.cpp +++ b/sycl/source/detail/memory_manager.cpp @@ -1234,6 +1234,204 @@ void MemoryManager::copy_from_device_global( DepEvents, OutEvent); } +// Command buffer methods +void MemoryManager::ext_oneapi_copyD2D_cmd_buffer( + sycl::detail::ContextImplPtr Context, + sycl::detail::pi::PiExtCommandBuffer CommandBuffer, SYCLMemObjI *SYCLMemObj, + void *SrcMem, unsigned int DimSrc, sycl::range<3> SrcSize, + sycl::range<3> SrcAccessRange, sycl::id<3> SrcOffset, + unsigned int SrcElemSize, void *DstMem, unsigned int DimDst, + sycl::range<3> DstSize, sycl::range<3> DstAccessRange, + sycl::id<3> DstOffset, unsigned int DstElemSize, + std::vector Deps, + sycl::detail::pi::PiExtSyncPoint *OutSyncPoint) { + assert(SYCLMemObj && "The SYCLMemObj is nullptr"); + (void)DstAccessRange; + + const PluginPtr &Plugin = Context->getPlugin(); + + detail::SYCLMemObjI::MemObjType MemType = 
SYCLMemObj->getType(); + TermPositions SrcPos, DstPos; + prepTermPositions(SrcPos, DimSrc, MemType); + prepTermPositions(DstPos, DimDst, MemType); + + size_t DstXOffBytes = DstOffset[DstPos.XTerm] * DstElemSize; + size_t SrcXOffBytes = SrcOffset[SrcPos.XTerm] * SrcElemSize; + size_t SrcAccessRangeWidthBytes = SrcAccessRange[SrcPos.XTerm] * SrcElemSize; + size_t DstSzWidthBytes = DstSize[DstPos.XTerm] * DstElemSize; + size_t SrcSzWidthBytes = SrcSize[SrcPos.XTerm] * SrcElemSize; + + if (MemType != detail::SYCLMemObjI::MemObjType::Buffer) { + throw sycl::exception(sycl::make_error_code(sycl::errc::invalid), + "Images are not supported in Graphs"); + } + + if (1 == DimDst && 1 == DimSrc) { + Plugin->call( + CommandBuffer, sycl::detail::pi::cast(SrcMem), + sycl::detail::pi::cast(DstMem), SrcXOffBytes, + DstXOffBytes, SrcAccessRangeWidthBytes, Deps.size(), Deps.data(), + OutSyncPoint); + } else { + // passing 0 for pitches not allowed. Because clEnqueueCopyBufferRect will + // calculate both src and dest pitch using region[0], which is not correct + // if src and dest are not the same size. + size_t SrcRowPitch = SrcSzWidthBytes; + size_t SrcSlicePitch = (DimSrc <= 1) + ? SrcSzWidthBytes + : SrcSzWidthBytes * SrcSize[SrcPos.YTerm]; + size_t DstRowPitch = DstSzWidthBytes; + size_t DstSlicePitch = (DimDst <= 1) + ? 
DstSzWidthBytes + : DstSzWidthBytes * DstSize[DstPos.YTerm]; + + pi_buff_rect_offset_struct SrcOrigin{SrcXOffBytes, SrcOffset[SrcPos.YTerm], + SrcOffset[SrcPos.ZTerm]}; + pi_buff_rect_offset_struct DstOrigin{DstXOffBytes, DstOffset[DstPos.YTerm], + DstOffset[DstPos.ZTerm]}; + pi_buff_rect_region_struct Region{SrcAccessRangeWidthBytes, + SrcAccessRange[SrcPos.YTerm], + SrcAccessRange[SrcPos.ZTerm]}; + + Plugin->call( + CommandBuffer, sycl::detail::pi::cast(SrcMem), + sycl::detail::pi::cast(DstMem), &SrcOrigin, + &DstOrigin, &Region, SrcRowPitch, SrcSlicePitch, DstRowPitch, + DstSlicePitch, Deps.size(), Deps.data(), OutSyncPoint); + } +} + +void MemoryManager::ext_oneapi_copyD2H_cmd_buffer( + sycl::detail::ContextImplPtr Context, + sycl::detail::pi::PiExtCommandBuffer CommandBuffer, SYCLMemObjI *SYCLMemObj, + void *SrcMem, unsigned int DimSrc, sycl::range<3> SrcSize, + sycl::range<3> SrcAccessRange, sycl::id<3> SrcOffset, + unsigned int SrcElemSize, char *DstMem, unsigned int DimDst, + sycl::range<3> DstSize, sycl::id<3> DstOffset, unsigned int DstElemSize, + std::vector Deps, + sycl::detail::pi::PiExtSyncPoint *OutSyncPoint) { + assert(SYCLMemObj && "The SYCLMemObj is nullptr"); + + const PluginPtr &Plugin = Context->getPlugin(); + + detail::SYCLMemObjI::MemObjType MemType = SYCLMemObj->getType(); + TermPositions SrcPos, DstPos; + prepTermPositions(SrcPos, DimSrc, MemType); + prepTermPositions(DstPos, DimDst, MemType); + + size_t DstXOffBytes = DstOffset[DstPos.XTerm] * DstElemSize; + size_t SrcXOffBytes = SrcOffset[SrcPos.XTerm] * SrcElemSize; + size_t SrcAccessRangeWidthBytes = SrcAccessRange[SrcPos.XTerm] * SrcElemSize; + size_t DstSzWidthBytes = DstSize[DstPos.XTerm] * DstElemSize; + size_t SrcSzWidthBytes = SrcSize[SrcPos.XTerm] * SrcElemSize; + + if (MemType != detail::SYCLMemObjI::MemObjType::Buffer) { + throw sycl::exception(sycl::make_error_code(sycl::errc::invalid), + "Images are not supported in Graphs"); + } + + if (1 == DimDst && 1 == DimSrc) { + 
Plugin->call( + CommandBuffer, sycl::detail::pi::cast(SrcMem), + SrcXOffBytes, SrcAccessRangeWidthBytes, DstMem + DstXOffBytes, + Deps.size(), Deps.data(), OutSyncPoint); + } else { + size_t BufferRowPitch = (1 == DimSrc) ? 0 : SrcSzWidthBytes; + size_t BufferSlicePitch = + (3 == DimSrc) ? SrcSzWidthBytes * SrcSize[SrcPos.YTerm] : 0; + size_t HostRowPitch = (1 == DimDst) ? 0 : DstSzWidthBytes; + size_t HostSlicePitch = + (3 == DimDst) ? DstSzWidthBytes * DstSize[DstPos.YTerm] : 0; + + pi_buff_rect_offset_struct BufferOffset{ + SrcXOffBytes, SrcOffset[SrcPos.YTerm], SrcOffset[SrcPos.ZTerm]}; + pi_buff_rect_offset_struct HostOffset{DstXOffBytes, DstOffset[DstPos.YTerm], + DstOffset[DstPos.ZTerm]}; + pi_buff_rect_region_struct RectRegion{SrcAccessRangeWidthBytes, + SrcAccessRange[SrcPos.YTerm], + SrcAccessRange[SrcPos.ZTerm]}; + + Plugin->call( + CommandBuffer, sycl::detail::pi::cast(SrcMem), + &BufferOffset, &HostOffset, &RectRegion, BufferRowPitch, + BufferSlicePitch, HostRowPitch, HostSlicePitch, DstMem, Deps.size(), + Deps.data(), OutSyncPoint); + } +} + +void MemoryManager::ext_oneapi_copyH2D_cmd_buffer( + sycl::detail::ContextImplPtr Context, + sycl::detail::pi::PiExtCommandBuffer CommandBuffer, SYCLMemObjI *SYCLMemObj, + char *SrcMem, unsigned int DimSrc, sycl::range<3> SrcSize, + sycl::id<3> SrcOffset, unsigned int SrcElemSize, void *DstMem, + unsigned int DimDst, sycl::range<3> DstSize, sycl::range<3> DstAccessRange, + sycl::id<3> DstOffset, unsigned int DstElemSize, + std::vector Deps, + sycl::detail::pi::PiExtSyncPoint *OutSyncPoint) { + assert(SYCLMemObj && "The SYCLMemObj is nullptr"); + + const PluginPtr &Plugin = Context->getPlugin(); + + detail::SYCLMemObjI::MemObjType MemType = SYCLMemObj->getType(); + TermPositions SrcPos, DstPos; + prepTermPositions(SrcPos, DimSrc, MemType); + prepTermPositions(DstPos, DimDst, MemType); + + size_t DstXOffBytes = DstOffset[DstPos.XTerm] * DstElemSize; + size_t SrcXOffBytes = SrcOffset[SrcPos.XTerm] * SrcElemSize; + 
size_t DstAccessRangeWidthBytes = DstAccessRange[DstPos.XTerm] * DstElemSize; + size_t DstSzWidthBytes = DstSize[DstPos.XTerm] * DstElemSize; + size_t SrcSzWidthBytes = SrcSize[SrcPos.XTerm] * SrcElemSize; + + if (MemType != detail::SYCLMemObjI::MemObjType::Buffer) { + throw sycl::exception(sycl::make_error_code(sycl::errc::invalid), + "Images are not supported in Graphs"); + } + + if (1 == DimDst && 1 == DimSrc) { + Plugin->call( + CommandBuffer, sycl::detail::pi::cast(DstMem), + DstXOffBytes, DstAccessRangeWidthBytes, SrcMem + SrcXOffBytes, + Deps.size(), Deps.data(), OutSyncPoint); + } else { + size_t BufferRowPitch = (1 == DimDst) ? 0 : DstSzWidthBytes; + size_t BufferSlicePitch = + (3 == DimDst) ? DstSzWidthBytes * DstSize[DstPos.YTerm] : 0; + size_t HostRowPitch = (1 == DimSrc) ? 0 : SrcSzWidthBytes; + size_t HostSlicePitch = + (3 == DimSrc) ? SrcSzWidthBytes * SrcSize[SrcPos.YTerm] : 0; + + pi_buff_rect_offset_struct BufferOffset{ + DstXOffBytes, DstOffset[DstPos.YTerm], DstOffset[DstPos.ZTerm]}; + pi_buff_rect_offset_struct HostOffset{SrcXOffBytes, SrcOffset[SrcPos.YTerm], + SrcOffset[SrcPos.ZTerm]}; + pi_buff_rect_region_struct RectRegion{DstAccessRangeWidthBytes, + DstAccessRange[DstPos.YTerm], + DstAccessRange[DstPos.ZTerm]}; + + Plugin->call( + CommandBuffer, sycl::detail::pi::cast(DstMem), + &BufferOffset, &HostOffset, &RectRegion, BufferRowPitch, + BufferSlicePitch, HostRowPitch, HostSlicePitch, SrcMem, Deps.size(), + Deps.data(), OutSyncPoint); + } +} + +void MemoryManager::ext_oneapi_copy_usm_cmd_buffer( + ContextImplPtr Context, const void *SrcMem, + sycl::detail::pi::PiExtCommandBuffer CommandBuffer, size_t Len, + void *DstMem, std::vector Deps, + sycl::detail::pi::PiExtSyncPoint *OutSyncPoint) { + if (!SrcMem || !DstMem) + throw runtime_error("NULL pointer argument in memory copy operation.", + PI_ERROR_INVALID_VALUE); + + const PluginPtr &Plugin = Context->getPlugin(); + Plugin->call( + CommandBuffer, DstMem, SrcMem, Len, Deps.size(), 
Deps.data(), + OutSyncPoint); +} + } // namespace detail } // __SYCL_INLINE_VER_NAMESPACE(_V1) } // namespace sycl diff --git a/sycl/source/detail/memory_manager.hpp b/sycl/source/detail/memory_manager.hpp index 3365d941fffda..87cacdd830336 100644 --- a/sycl/source/detail/memory_manager.hpp +++ b/sycl/source/detail/memory_manager.hpp @@ -186,6 +186,47 @@ class __SYCL_EXPORT MemoryManager { size_t NumBytes, size_t Offset, void *DstMem, const std::vector &DepEvents, sycl::detail::pi::PiEvent *OutEvent); + + // Command buffer extension methods + static void ext_oneapi_copyD2D_cmd_buffer( + sycl::detail::ContextImplPtr Context, + sycl::detail::pi::PiExtCommandBuffer CommandBuffer, + SYCLMemObjI *SYCLMemObj, void *SrcMem, unsigned int DimSrc, + sycl::range<3> SrcSize, sycl::range<3> SrcAccessRange, + sycl::id<3> SrcOffset, unsigned int SrcElemSize, void *DstMem, + unsigned int DimDst, sycl::range<3> DstSize, + sycl::range<3> DstAccessRange, sycl::id<3> DstOffset, + unsigned int DstElemSize, + std::vector Deps, + sycl::detail::pi::PiExtSyncPoint *OutSyncPoint); + + static void ext_oneapi_copyD2H_cmd_buffer( + sycl::detail::ContextImplPtr Context, + sycl::detail::pi::PiExtCommandBuffer CommandBuffer, + SYCLMemObjI *SYCLMemObj, void *SrcMem, unsigned int DimSrc, + sycl::range<3> SrcSize, sycl::range<3> SrcAccessRange, + sycl::id<3> SrcOffset, unsigned int SrcElemSize, char *DstMem, + unsigned int DimDst, sycl::range<3> DstSize, sycl::id<3> DstOffset, + unsigned int DstElemSize, + std::vector Deps, + sycl::detail::pi::PiExtSyncPoint *OutSyncPoint); + + static void ext_oneapi_copyH2D_cmd_buffer( + sycl::detail::ContextImplPtr Context, + sycl::detail::pi::PiExtCommandBuffer CommandBuffer, + SYCLMemObjI *SYCLMemObj, char *SrcMem, unsigned int DimSrc, + sycl::range<3> SrcSize, sycl::id<3> SrcOffset, unsigned int SrcElemSize, + void *DstMem, unsigned int DimDst, sycl::range<3> DstSize, + sycl::range<3> DstAccessRange, sycl::id<3> DstOffset, + unsigned int DstElemSize, + 
std::vector Deps, + sycl::detail::pi::PiExtSyncPoint *OutSyncPoint); + + static void ext_oneapi_copy_usm_cmd_buffer( + ContextImplPtr Context, const void *SrcMem, + sycl::detail::pi::PiExtCommandBuffer CommandBuffer, size_t Len, + void *DstMem, std::vector Deps, + sycl::detail::pi::PiExtSyncPoint *OutSyncPoint); }; } // namespace detail } // __SYCL_INLINE_VER_NAMESPACE(_V1) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index b0e1f23285eac..9be486bba066b 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -145,6 +145,16 @@ event queue_impl::memcpy(const std::shared_ptr &Self, // Emit a begin/end scope for this call PrepareNotify.scopedNotify((uint16_t)xpti::trace_point_type_t::task_begin); #endif + // If we have a command graph set we need to capture the copy through normal + // queue submission rather than execute the copy directly. + if (MGraph) { + return submit( + [&](handler &CGH) { + CGH.depends_on(DepEvents); + CGH.memcpy(Dest, Src, Count); + }, + Self, {}); + } if (MHasDiscardEventsSupport) { MemoryManager::copy_usm(Src, Self, Count, Dest, getOrWaitEvents(DepEvents, MContext), nullptr); @@ -466,6 +476,12 @@ void queue_impl::wait(const detail::code_location &CodeLoc) { TelemetryEvent = instrumentationProlog(CodeLoc, Name, StreamID, IId); #endif + if (MGraph) { + throw sycl::exception(make_error_code(errc::invalid), + "wait cannot be called for a queue which is " + "recording to a command graph."); + } + std::vector> WeakEvents; std::vector SharedEvents; { diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index fb9c7e455dc46..a30a46bde31e0 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -32,6 +32,8 @@ #include #include +#include "detail/graph_impl.hpp" + #include #ifdef XPTI_ENABLE_INSTRUMENTATION diff --git a/sycl/source/detail/reduction.cpp b/sycl/source/detail/reduction.cpp index 5134f9b51996c..47ba843243cda 
100644 --- a/sycl/source/detail/reduction.cpp +++ b/sycl/source/detail/reduction.cpp @@ -52,6 +52,16 @@ __SYCL_EXPORT size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize, // with the given queue. __SYCL_EXPORT uint32_t reduGetMaxNumConcurrentWorkGroups( std::shared_ptr Queue) { + // TODO: Graphs extension explicit API uses a handler with no queue attached, + // so return some value here. In the future we should have access to the + // device so can remove this. + // + // The 8 value was chosen as the hardcoded value as it is the returned + // value for sycl::info::device::max_compute_units on + // Intel HD Graphics devices used as a L0 backend during development. + if (Queue == nullptr) { + return 8; + } device Dev = Queue->get_device(); uint32_t NumThreads = Dev.get_info(); // TODO: The heuristics here require additional tuning for various devices @@ -104,6 +114,16 @@ reduGetMaxWGSize(std::shared_ptr Queue, __SYCL_EXPORT size_t reduGetPreferredWGSize(std::shared_ptr &Queue, size_t LocalMemBytesPerWorkItem) { + // TODO: Graphs extension explicit API uses a handler with a null queue to + // process CGFs, in future we should have access to the device so we can + // correctly calculate this. + // + // The 32 value was chosen as the hardcoded value as it is the returned + // value for SYCL_REDUCTION_PREFERRED_WORKGROUP_SIZE on + // Intel HD Graphics devices used as a L0 backend during development. 
+ if (Queue == nullptr) { + return 32; + } device Dev = Queue->get_device(); // The maximum WGSize returned by CPU devices is very large and does not diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index a24b7d9cb8be3..3b40edd1961bc 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -94,7 +94,7 @@ static std::string deviceToString(device Device) { return "UNKNOWN"; } -static void applyFuncOnFilteredArgs( +void applyFuncOnFilteredArgs( const KernelArgMask *EliminatedArgMask, std::vector &Args, std::function Func) { if (!EliminatedArgMask) { @@ -482,12 +482,15 @@ void Command::waitForEvents(QueueImplPtr Queue, /// It is safe to bind MPreparedDepsEvents and MPreparedHostDepsEvents /// references to event_impl class members because Command /// should not outlive the event connected to it. -Command::Command(CommandType Type, QueueImplPtr Queue) +Command::Command( + CommandType Type, QueueImplPtr Queue, + sycl::detail::pi::PiExtCommandBuffer CommandBuffer, + const std::vector &SyncPoints) : MQueue(std::move(Queue)), MEvent(std::make_shared(MQueue)), MPreparedDepsEvents(MEvent->getPreparedDepsEvents()), - MPreparedHostDepsEvents(MEvent->getPreparedHostDepsEvents()), - MType(Type) { + MPreparedHostDepsEvents(MEvent->getPreparedHostDepsEvents()), MType(Type), + MCommandBuffer(CommandBuffer), MSyncPointDeps(SyncPoints) { MWorkerQueue = MQueue; MEvent->setWorkerQueue(MWorkerQueue); MEvent->setSubmittedQueue(MWorkerQueue); @@ -1848,9 +1851,12 @@ static std::string cgTypeToString(detail::CG::CGTYPE Type) { } } -ExecCGCommand::ExecCGCommand(std::unique_ptr CommandGroup, - QueueImplPtr Queue) - : Command(CommandType::RUN_CG, std::move(Queue)), +ExecCGCommand::ExecCGCommand( + std::unique_ptr CommandGroup, QueueImplPtr Queue, + sycl::detail::pi::PiExtCommandBuffer CommandBuffer, + const std::vector &Dependencies) + : Command(CommandType::RUN_CG, std::move(Queue), 
CommandBuffer, + Dependencies), MCommandGroup(std::move(CommandGroup)) { if (MCommandGroup->getType() == detail::CG::CodeplayHostTask) { MEvent->setSubmittedQueue( @@ -2176,7 +2182,7 @@ static void adjustNDRangePerKernel(NDRDescT &NDR, // Initially we keep the order of NDRDescT as it provided by the user, this // simplifies overall handling and do the reverse only when // the kernel is enqueued. -static void ReverseRangeDimensionsForKernel(NDRDescT &NDR) { +void ReverseRangeDimensionsForKernel(NDRDescT &NDR) { if (NDR.Dims > 1) { std::swap(NDR.GlobalSize[0], NDR.GlobalSize[NDR.Dims - 1]); std::swap(NDR.LocalSize[0], NDR.LocalSize[NDR.Dims - 1]); @@ -2196,6 +2202,89 @@ pi_mem_obj_access AccessModeToPi(access::mode AccessorMode) { } } +void SetArgBasedOnType( + const PluginPtr &Plugin, sycl::detail::pi::PiKernel Kernel, + const std::shared_ptr &DeviceImageImpl, + const std::function &getMemAllocationFunc, + const sycl::context &Context, bool IsHost, detail::ArgDesc &Arg, + size_t NextTrueIndex) { + switch (Arg.MType) { + case kernel_param_kind_t::kind_stream: + break; + case kernel_param_kind_t::kind_accessor: { + Requirement *Req = (Requirement *)(Arg.MPtr); + assert(getMemAllocationFunc != nullptr && + "We should have caught this earlier."); + + sycl::detail::pi::PiMem MemArg = + (sycl::detail::pi::PiMem)getMemAllocationFunc(Req); + if (Context.get_backend() == backend::opencl) { + // clSetKernelArg (corresponding to piKernelSetArg) returns an error + // when MemArg is null, which is the case when zero-sized buffers are + // handled. Below assignment provides later call to clSetKernelArg with + // acceptable arguments. 
+ if (!MemArg) + MemArg = sycl::detail::pi::PiMem(); + + Plugin->call( + Kernel, NextTrueIndex, sizeof(sycl::detail::pi::PiMem), &MemArg); + } else { + pi_mem_obj_property MemObjData{}; + MemObjData.mem_access = AccessModeToPi(Req->MAccessMode); + MemObjData.type = PI_KERNEL_ARG_MEM_OBJ_ACCESS; + Plugin->call(Kernel, NextTrueIndex, + &MemObjData, &MemArg); + } + break; + } + case kernel_param_kind_t::kind_std_layout: { + Plugin->call(Kernel, NextTrueIndex, Arg.MSize, + Arg.MPtr); + break; + } + case kernel_param_kind_t::kind_sampler: { + sampler *SamplerPtr = (sampler *)Arg.MPtr; + sycl::detail::pi::PiSampler Sampler = + detail::getSyclObjImpl(*SamplerPtr)->getOrCreateSampler(Context); + Plugin->call(Kernel, NextTrueIndex, + &Sampler); + break; + } + case kernel_param_kind_t::kind_pointer: { + Plugin->call(Kernel, NextTrueIndex, + Arg.MSize, Arg.MPtr); + break; + } + case kernel_param_kind_t::kind_specialization_constants_buffer: { + if (IsHost) { + throw sycl::exception( + sycl::make_error_code(sycl::errc::feature_not_supported), + "SYCL2020 specialization constants are not yet supported on host " + "device " + + codeToString(PI_ERROR_INVALID_OPERATION)); + } + assert(DeviceImageImpl != nullptr); + sycl::detail::pi::PiMem SpecConstsBuffer = + DeviceImageImpl->get_spec_const_buffer_ref(); + // Avoid taking an address of nullptr + sycl::detail::pi::PiMem *SpecConstsBufferArg = + SpecConstsBuffer ? 
&SpecConstsBuffer : nullptr; + + pi_mem_obj_property MemObjData{}; + MemObjData.mem_access = PI_ACCESS_READ_ONLY; + MemObjData.type = PI_KERNEL_ARG_MEM_OBJ_ACCESS; + Plugin->call( + Kernel, NextTrueIndex, &MemObjData, SpecConstsBufferArg); + break; + } + case kernel_param_kind_t::kind_invalid: + throw sycl::exception(sycl::make_error_code(sycl::errc::runtime), + "Invalid kernel param kind " + + codeToString(PI_ERROR_INVALID_VALUE)); + break; + } +} + static pi_result SetKernelParamsAndLaunch( const QueueImplPtr &Queue, std::vector &Args, const std::shared_ptr &DeviceImageImpl, @@ -2207,82 +2296,9 @@ static pi_result SetKernelParamsAndLaunch( auto setFunc = [&Plugin, Kernel, &DeviceImageImpl, &getMemAllocationFunc, &Queue](detail::ArgDesc &Arg, size_t NextTrueIndex) { - switch (Arg.MType) { - case kernel_param_kind_t::kind_stream: - break; - case kernel_param_kind_t::kind_accessor: { - Requirement *Req = (Requirement *)(Arg.MPtr); - assert(getMemAllocationFunc != nullptr && - "We should have caught this earlier."); - - sycl::detail::pi::PiMem MemArg = - (sycl::detail::pi::PiMem)getMemAllocationFunc(Req); - if (Queue->getDeviceImplPtr()->getBackend() == backend::opencl) { - // clSetKernelArg (corresponding to piKernelSetArg) returns an error - // when MemArg is null, which is the case when zero-sized buffers are - // handled. Below assignment provides later call to clSetKernelArg with - // acceptable arguments. 
- if (!MemArg) - MemArg = sycl::detail::pi::PiMem(); - - Plugin->call( - Kernel, NextTrueIndex, sizeof(sycl::detail::pi::PiMem), &MemArg); - } else { - pi_mem_obj_property MemObjData{}; - MemObjData.mem_access = AccessModeToPi(Req->MAccessMode); - MemObjData.type = PI_KERNEL_ARG_MEM_OBJ_ACCESS; - Plugin->call(Kernel, NextTrueIndex, - &MemObjData, &MemArg); - } - break; - } - case kernel_param_kind_t::kind_std_layout: { - Plugin->call(Kernel, NextTrueIndex, Arg.MSize, - Arg.MPtr); - break; - } - case kernel_param_kind_t::kind_sampler: { - sampler *SamplerPtr = (sampler *)Arg.MPtr; - sycl::detail::pi::PiSampler Sampler = - detail::getSyclObjImpl(*SamplerPtr) - ->getOrCreateSampler(Queue->get_context()); - Plugin->call(Kernel, NextTrueIndex, - &Sampler); - break; - } - case kernel_param_kind_t::kind_pointer: { - Plugin->call(Kernel, NextTrueIndex, - Arg.MSize, Arg.MPtr); - break; - } - case kernel_param_kind_t::kind_specialization_constants_buffer: { - if (Queue->is_host()) { - throw sycl::exception( - sycl::make_error_code(sycl::errc::feature_not_supported), - "SYCL2020 specialization constants are not yet supported on host " - "device " + - codeToString(PI_ERROR_INVALID_OPERATION)); - } - assert(DeviceImageImpl != nullptr); - sycl::detail::pi::PiMem SpecConstsBuffer = - DeviceImageImpl->get_spec_const_buffer_ref(); - // Avoid taking an address of nullptr - sycl::detail::pi::PiMem *SpecConstsBufferArg = - SpecConstsBuffer ? 
&SpecConstsBuffer : nullptr; - - pi_mem_obj_property MemObjData{}; - MemObjData.mem_access = PI_ACCESS_READ_ONLY; - MemObjData.type = PI_KERNEL_ARG_MEM_OBJ_ACCESS; - Plugin->call( - Kernel, NextTrueIndex, &MemObjData, SpecConstsBufferArg); - break; - } - case kernel_param_kind_t::kind_invalid: - throw sycl::exception(sycl::make_error_code(sycl::errc::runtime), - "Invalid kernel param kind " + - codeToString(PI_ERROR_INVALID_VALUE)); - break; - } + SetArgBasedOnType(Plugin, Kernel, DeviceImageImpl, getMemAllocationFunc, + Queue->get_context(), Queue->is_host(), Arg, + NextTrueIndex); }; applyFuncOnFilteredArgs(EliminatedArgMask, Args, setFunc); @@ -2344,6 +2360,82 @@ void DispatchNativeKernel(void *Blob) { delete NDRDesc; } +pi_int32 enqueueImpCommandBufferKernel( + context Ctx, DeviceImplPtr DeviceImpl, + sycl::detail::pi::PiExtCommandBuffer CommandBuffer, + const CGExecKernel &CommandGroup, + std::vector &SyncPoints, + sycl::detail::pi::PiExtSyncPoint *OutSyncPoint, + const std::function &getMemAllocationFunc) { + auto ContextImpl = sycl::detail::getSyclObjImpl(Ctx); + const sycl::detail::PluginPtr &Plugin = ContextImpl->getPlugin(); + pi_kernel PiKernel = nullptr; + std::mutex *KernelMutex = nullptr; + pi_program PiProgram = nullptr; + + auto Kernel = CommandGroup.MSyclKernel; + const KernelArgMask *EliminatedArgMask; + if (Kernel != nullptr) { + PiKernel = Kernel->getHandleRef(); + } else { + std::tie(PiKernel, KernelMutex, EliminatedArgMask, PiProgram) = + sycl::detail::ProgramManager::getInstance().getOrCreateKernel( + ContextImpl, DeviceImpl, CommandGroup.MKernelName, nullptr); + } + + auto SetFunc = [&Plugin, &PiKernel, &Ctx, &getMemAllocationFunc]( + sycl::detail::ArgDesc &Arg, size_t NextTrueIndex) { + sycl::detail::SetArgBasedOnType( + Plugin, PiKernel, + nullptr /* TODO: Handle spec constants and pass device image here */ + , + getMemAllocationFunc, Ctx, false, Arg, NextTrueIndex); + }; + // Copy args for modification + auto Args = CommandGroup.MArgs; + 
sycl::detail::applyFuncOnFilteredArgs(EliminatedArgMask, Args, SetFunc); + + // Remember this information before the range dimensions are reversed + const bool HasLocalSize = (CommandGroup.MNDRDesc.LocalSize[0] != 0); + + // Copy NDRDesc for modification + auto NDRDesc = CommandGroup.MNDRDesc; + // Reverse kernel dims + sycl::detail::ReverseRangeDimensionsForKernel(NDRDesc); + + size_t RequiredWGSize[3] = {0, 0, 0}; + size_t *LocalSize = nullptr; + + if (HasLocalSize) + LocalSize = &NDRDesc.LocalSize[0]; + else { + Plugin->call( + PiKernel, DeviceImpl->getHandleRef(), + PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE, sizeof(RequiredWGSize), + RequiredWGSize, + /* param_value_size_ret = */ nullptr); + + const bool EnforcedLocalSize = + (RequiredWGSize[0] != 0 || RequiredWGSize[1] != 0 || + RequiredWGSize[2] != 0); + if (EnforcedLocalSize) + LocalSize = RequiredWGSize; + } + + pi_result Res = Plugin->call_nocheck< + sycl::detail::PiApiKind::piextCommandBufferNDRangeKernel>( + CommandBuffer, PiKernel, NDRDesc.Dims, &NDRDesc.GlobalOffset[0], + &NDRDesc.GlobalSize[0], LocalSize, SyncPoints.size(), + SyncPoints.size() ? SyncPoints.data() : nullptr, OutSyncPoint); + + if (Res != pi_result::PI_SUCCESS) { + throw sycl::exception(errc::invalid, + "Failed to add kernel to PI command-buffer"); + } + + return Res; +} + pi_int32 enqueueImpKernel( const QueueImplPtr &Queue, NDRDescT &NDRDesc, std::vector &Args, const std::shared_ptr &KernelBundleImplPtr, @@ -2512,7 +2604,113 @@ enqueueReadWriteHostPipe(const QueueImplPtr &Queue, const std::string &PipeName, return Error; } +pi_int32 ExecCGCommand::enqueueImpCommandBuffer() { + std::vector EventImpls = MPreparedDepsEvents; + auto RawEvents = getPiEvents(EventImpls); + flushCrossQueueDeps(EventImpls, getWorkerQueue()); + + sycl::detail::pi::PiEvent *Event = + (MQueue->has_discard_events_support() && + MCommandGroup->getRequirements().size() == 0) + ? 
nullptr + : &MEvent->getHandleRef(); + sycl::detail::pi::PiExtSyncPoint OutSyncPoint; + switch (MCommandGroup->getType()) { + case CG::CGTYPE::Kernel: { + CGExecKernel *ExecKernel = (CGExecKernel *)MCommandGroup.get(); + + auto getMemAllocationFunc = [this](Requirement *Req) { + AllocaCommandBase *AllocaCmd = getAllocaForReq(Req); + return AllocaCmd->getMemAllocation(); + }; + + if (!Event) { + // Kernel only uses assert if it's non interop one + bool KernelUsesAssert = + !(ExecKernel->MSyclKernel && ExecKernel->MSyclKernel->isInterop()) && + ProgramManager::getInstance().kernelUsesAssert( + ExecKernel->MKernelName); + if (KernelUsesAssert) { + Event = &MEvent->getHandleRef(); + } + } + auto result = enqueueImpCommandBufferKernel( + MQueue->get_context(), MQueue->getDeviceImplPtr(), MCommandBuffer, + *ExecKernel, MSyncPointDeps, &OutSyncPoint, getMemAllocationFunc); + MEvent->setSyncPoint(OutSyncPoint); + return result; + } + case CG::CGTYPE::CopyUSM: { + CGCopyUSM *Copy = (CGCopyUSM *)MCommandGroup.get(); + MemoryManager::ext_oneapi_copy_usm_cmd_buffer( + MQueue->getContextImplPtr(), Copy->getSrc(), MCommandBuffer, + Copy->getLength(), Copy->getDst(), MSyncPointDeps, &OutSyncPoint); + MEvent->setSyncPoint(OutSyncPoint); + return PI_SUCCESS; + } + case CG::CGTYPE::CopyAccToAcc: { + CGCopy *Copy = (CGCopy *)MCommandGroup.get(); + Requirement *ReqSrc = (Requirement *)(Copy->getSrc()); + Requirement *ReqDst = (Requirement *)(Copy->getDst()); + + AllocaCommandBase *AllocaCmdSrc = getAllocaForReq(ReqSrc); + AllocaCommandBase *AllocaCmdDst = getAllocaForReq(ReqDst); + + MemoryManager::ext_oneapi_copyD2D_cmd_buffer( + MQueue->getContextImplPtr(), MCommandBuffer, + AllocaCmdSrc->getSYCLMemObj(), AllocaCmdSrc->getMemAllocation(), + ReqSrc->MDims, ReqSrc->MMemoryRange, ReqSrc->MAccessRange, + ReqSrc->MOffset, ReqSrc->MElemSize, AllocaCmdDst->getMemAllocation(), + ReqDst->MDims, ReqDst->MMemoryRange, ReqDst->MAccessRange, + ReqDst->MOffset, ReqDst->MElemSize, 
std::move(MSyncPointDeps), + &OutSyncPoint); + MEvent->setSyncPoint(OutSyncPoint); + return PI_SUCCESS; + } + case CG::CGTYPE::CopyAccToPtr: { + CGCopy *Copy = (CGCopy *)MCommandGroup.get(); + Requirement *Req = (Requirement *)Copy->getSrc(); + AllocaCommandBase *AllocaCmd = getAllocaForReq(Req); + + MemoryManager::ext_oneapi_copyD2H_cmd_buffer( + MQueue->getContextImplPtr(), MCommandBuffer, AllocaCmd->getSYCLMemObj(), + AllocaCmd->getMemAllocation(), Req->MDims, Req->MMemoryRange, + Req->MAccessRange, Req->MOffset, Req->MElemSize, (char *)Copy->getDst(), + Req->MDims, Req->MAccessRange, + /*DstOffset=*/{0, 0, 0}, Req->MElemSize, std::move(MSyncPointDeps), + &OutSyncPoint); + + return PI_SUCCESS; + } + case CG::CGTYPE::CopyPtrToAcc: { + CGCopy *Copy = (CGCopy *)MCommandGroup.get(); + Requirement *Req = (Requirement *)(Copy->getDst()); + AllocaCommandBase *AllocaCmd = getAllocaForReq(Req); + + MemoryManager::ext_oneapi_copyH2D_cmd_buffer( + MQueue->getContextImplPtr(), MCommandBuffer, AllocaCmd->getSYCLMemObj(), + (char *)Copy->getSrc(), Req->MDims, Req->MAccessRange, + /*SrcOffset*/ {0, 0, 0}, Req->MElemSize, AllocaCmd->getMemAllocation(), + Req->MDims, Req->MMemoryRange, Req->MAccessRange, Req->MOffset, + Req->MElemSize, std::move(MSyncPointDeps), &OutSyncPoint); + + return PI_SUCCESS; + } + default: + throw runtime_error("CG type not implemented for command buffers.", + PI_ERROR_INVALID_OPERATION); + } +} + pi_int32 ExecCGCommand::enqueueImp() { + if (MCommandBuffer) { + return enqueueImpCommandBuffer(); + } else { + return enqueueImpQueue(); + } +} + +pi_int32 ExecCGCommand::enqueueImpQueue() { if (getCG().getType() != CG::CGTYPE::CodeplayHostTask) waitForPreparedHostEvents(); std::vector EventImpls = MPreparedDepsEvents; @@ -2831,7 +3029,13 @@ pi_int32 ExecCGCommand::enqueueImp() { typeSize, RawEvents, Event, read); } case CG::CGTYPE::ExecCommandBuffer: { - throw runtime_error("CG type not implemented.", PI_ERROR_INVALID_OPERATION); + CGExecCommandBuffer 
*CmdBufferCG = + static_cast(MCommandGroup.get()); + return MQueue->getPlugin() + ->call_nocheck( + CmdBufferCG->MCommandBuffer, MQueue->getHandleRef(), + RawEvents.size(), RawEvents.empty() ? nullptr : &RawEvents[0], + Event); } case CG::CGTYPE::None: throw sycl::exception(sycl::make_error_code(sycl::errc::runtime), @@ -2842,7 +3046,8 @@ pi_int32 ExecCGCommand::enqueueImp() { } bool ExecCGCommand::producesPiEvent() const { - return MCommandGroup->getType() != CG::CGTYPE::CodeplayHostTask; + return !MCommandBuffer && + MCommandGroup->getType() != CG::CGTYPE::CodeplayHostTask; } bool ExecCGCommand::supportsPostEnqueueCleanup() const { diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index f23f7ac18e12b..d444dfb6dc5df 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -109,10 +109,13 @@ class Command { UPDATE_REQUIREMENT, EMPTY_TASK, HOST_TASK, - FUSION + FUSION, + EXEC_CMD_BUFFER, }; - Command(CommandType Type, QueueImplPtr Queue); + Command(CommandType Type, QueueImplPtr Queue, + sycl::detail::pi::PiExtCommandBuffer CommandBuffer = nullptr, + const std::vector &SyncPoints = {}); /// \param NewDep dependency to be added /// \param ToCleanUp container for commands that can be cleaned up. @@ -383,6 +386,18 @@ class Command { /// intersect with command enqueue. std::vector MBlockedUsers; std::mutex MBlockedUsersMutex; + +protected: + /// Gets the command buffer (if any) associated with this command. + sycl::detail::pi::PiExtCommandBuffer getCommandBuffer() const { + return MCommandBuffer; + } + + /// CommandBuffer which will be used to submit to instead of the queue, if + /// set. + sycl::detail::pi::PiExtCommandBuffer MCommandBuffer; + /// List of sync points for submissions to a command buffer. + std::vector MSyncPointDeps; }; /// The empty command does nothing during enqueue. The task can be used to @@ -619,7 +634,10 @@ class KernelFusionCommand; /// operation. 
class ExecCGCommand : public Command { public: - ExecCGCommand(std::unique_ptr CommandGroup, QueueImplPtr Queue); + ExecCGCommand( + std::unique_ptr CommandGroup, QueueImplPtr Queue, + sycl::detail::pi::PiExtCommandBuffer CommandBuffer = nullptr, + const std::vector &Dependencies = {}); std::vector> getAuxiliaryResources() const; @@ -649,6 +667,8 @@ class ExecCGCommand : public Command { private: pi_int32 enqueueImp() final; + pi_int32 enqueueImpCommandBuffer(); + pi_int32 enqueueImpQueue(); AllocaCommandBase *getAllocaForReq(Requirement *Req); @@ -721,6 +741,31 @@ class KernelFusionCommand : public Command { FusionStatus MStatus; }; +// Enqueues a given kernel to a PiExtCommandBuffer +pi_int32 enqueueImpCommandBufferKernel( + context Ctx, DeviceImplPtr DeviceImpl, + sycl::detail::pi::PiExtCommandBuffer CommandBuffer, + const CGExecKernel &CommandGroup, + std::vector &SyncPoints, + sycl::detail::pi::PiExtSyncPoint *OutSyncPoint, + const std::function &getMemAllocationFunc); + +// Sets arguments for a given kernel and device based on the argument type. +// Refactored from SetKernelParamsAndLaunch to allow it to be used in the graphs +// extension. 
+void SetArgBasedOnType( + const detail::plugin &Plugin, sycl::detail::pi::PiKernel Kernel, + const std::shared_ptr &DeviceImageImpl, + const std::function &getMemAllocationFunc, + const sycl::context &Context, bool IsHost, detail::ArgDesc &Arg, + size_t NextTrueIndex); + +void applyFuncOnFilteredArgs( + const KernelArgMask *EliminatedArgMask, std::vector &Args, + std::function Func); + +void ReverseRangeDimensionsForKernel(NDRDescT &NDR); + } // namespace detail } // __SYCL_INLINE_VER_NAMESPACE(_V1) } // namespace sycl diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index aee61a1dcb7b8..3e4dc1d862195 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -925,14 +925,17 @@ static void combineAccessModesOfReqs(std::vector &Reqs) { } } -Scheduler::GraphBuildResult -Scheduler::GraphBuilder::addCG(std::unique_ptr CommandGroup, - const QueueImplPtr &Queue, - std::vector &ToEnqueue) { +Scheduler::GraphBuildResult Scheduler::GraphBuilder::addCG( + std::unique_ptr CommandGroup, const QueueImplPtr &Queue, + std::vector &ToEnqueue, + sycl::detail::pi::PiExtCommandBuffer CommandBuffer, + const std::vector &Dependencies) { std::vector &Reqs = CommandGroup->getRequirements(); std::vector &Events = CommandGroup->getEvents(); - auto NewCmd = std::make_unique(std::move(CommandGroup), Queue); + auto NewCmd = std::make_unique( + std::move(CommandGroup), Queue, CommandBuffer, std::move(Dependencies)); + if (!NewCmd) throw runtime_error("Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY); diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index ee75f8767f7ae..17a4936c0a398 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -87,8 +87,10 @@ void Scheduler::waitForRecordToFinish(MemObjRecord *Record, } } -EventImplPtr Scheduler::addCG(std::unique_ptr 
CommandGroup, - const QueueImplPtr &Queue) { +EventImplPtr Scheduler::addCG( + std::unique_ptr CommandGroup, const QueueImplPtr &Queue, + sycl::detail::pi::PiExtCommandBuffer CommandBuffer, + const std::vector &Dependencies) { EventImplPtr NewEvent = nullptr; const CG::CGTYPE Type = CommandGroup->getType(); std::vector AuxiliaryCmds; @@ -132,7 +134,9 @@ EventImplPtr Scheduler::addCG(std::unique_ptr CommandGroup, } default: auto Result = MGraphBuilder.addCG(std::move(CommandGroup), - std::move(Queue), AuxiliaryCmds); + std::move(Queue), AuxiliaryCmds, + CommandBuffer, std::move(Dependencies)); + NewCmd = Result.NewCmd; NewEvent = Result.NewEvent; ShouldEnqueue = Result.ShouldEnqueue; diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp index 2d12fc147783d..de16bb389c754 100644 --- a/sycl/source/detail/scheduler/scheduler.hpp +++ b/sycl/source/detail/scheduler/scheduler.hpp @@ -367,9 +367,16 @@ class Scheduler { /// It's called by SYCL's queue.submit. /// /// \param CommandGroup is a unique_ptr to a command group to be added. + /// \param Queue Queue that is registering the command-group. + /// \param CommandBuffer Optional command buffer to enqueue to instead of + /// directly to the queue. + /// \param Dependencies Optional list of dependency + /// sync points when enqueuing to a command buffer. /// \return an event object to wait on for command group completion. - EventImplPtr addCG(std::unique_ptr CommandGroup, - const QueueImplPtr &Queue); + EventImplPtr + addCG(std::unique_ptr CommandGroup, const QueueImplPtr &Queue, + sycl::detail::pi::PiExtCommandBuffer CommandBuffer = nullptr, + const std::vector &Dependencies = {}); /// Registers a command group, that copies most recent memory to the memory /// pointed by the requirement. @@ -526,13 +533,19 @@ class Scheduler { /// Registers \ref CG "command group" and adds it to the dependency graph. 
/// /// \sa queue::submit, Scheduler::addCG + /// \param CommandBuffer Optional command buffer to enqueue to instead of + /// directly to the queue. + /// \param Dependencies Optional list of dependency + /// sync points when enqueuing to a command buffer. /// /// \return a command that represents command group execution and a bool /// indicating whether this command should be enqueued to the graph /// processor right away or not. - GraphBuildResult addCG(std::unique_ptr CommandGroup, - const QueueImplPtr &Queue, - std::vector &ToEnqueue); + GraphBuildResult addCG( + std::unique_ptr CommandGroup, const QueueImplPtr &Queue, + std::vector &ToEnqueue, + sycl::detail::pi::PiExtCommandBuffer CommandBuffer = nullptr, + const std::vector &Dependencies = {}); /// Registers a \ref CG "command group" that updates host memory to the /// latest state. diff --git a/sycl/source/feature_test.hpp.in b/sycl/source/feature_test.hpp.in index 6752728771626..dee98b0e8e2a3 100644 --- a/sycl/source/feature_test.hpp.in +++ b/sycl/source/feature_test.hpp.in @@ -81,6 +81,7 @@ __SYCL_INLINE_VER_NAMESPACE(_V1) { #define SYCL_EXT_CODEPLAY_KERNEL_FUSION 1 #endif #define SYCL_EXT_INTEL_CACHE_CONFIG 1 +#define SYCL_EXT_ONEAPI_GRAPH 1 #define SYCL_EXT_CODEPLAY_MAX_REGISTERS_PER_WORK_GROUP_QUERY 1 #define SYCL_EXT_ONEAPI_DEVICE_GLOBAL 1 #define SYCL_EXT_INTEL_QUEUE_IMMEDIATE_COMMAND_LIST 1 diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 8ab96f11aeddc..603efada0cec7 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -367,7 +367,8 @@ event handler::finalize() { // If we have a subgraph node we don't want to actually execute this command // graph submission. 
if (!MSubgraphNode) { - event GraphCompletionEvent = MExecGraph->enqueue(MQueue); + event GraphCompletionEvent = + MExecGraph->enqueue(MQueue, std::move(CGData)); MLastEvent = GraphCompletionEvent; return MLastEvent; } @@ -376,9 +377,19 @@ event handler::finalize() { if (detail::pi::trace(detail::pi::TraceLevel::PI_TRACE_ALL)) { std::cout << "WARNING: An empty command group is submitted." << std::endl; } - detail::EventImplPtr Event = std::make_shared(); - MLastEvent = detail::createSyclObjFromImpl(Event); - return MLastEvent; + + // Empty nodes are handled by Graph like standard nodes + // For Standard mode (non-graph), + // empty nodes are not sent to the scheduler to save time + if (MGraph || MQueue->getCommandGraph()) { + CommandGroup.reset( + new detail::CG(detail::CG::None, std::move(CGData), MCodeLoc)); + } else { + detail::EventImplPtr Event = std::make_shared(); + MLastEvent = detail::createSyclObjFromImpl(Event); + return MLastEvent; + } + break; } if (!MSubgraphNode && !CommandGroup) @@ -399,15 +410,34 @@ event handler::finalize() { // it to the graph to create a node, rather than submit it to the scheduler. if (auto GraphImpl = MQueue->getCommandGraph(); GraphImpl) { auto EventImpl = std::make_shared(); - - // Extract relevant data from the handler and pass to graph to create a - // new node representing this command group. std::shared_ptr NodeImpl = - GraphImpl->add(MCGType, std::move(CommandGroup)); + nullptr; + + // Create a new node in the graph representing this command-group + if (MQueue->isInOrder()) { + // In-order queues create implicit linear dependencies between nodes. + // Find the last node added to the graph from this queue, so our new + // node can set it as a predecessor. + auto DependentNode = GraphImpl->getLastInorderNode(MQueue); + + NodeImpl = DependentNode + ? 
GraphImpl->add(MCGType, std::move(CommandGroup), + {DependentNode}) + : GraphImpl->add(MCGType, std::move(CommandGroup)); + + // If we are recording an in-order queue remember the new node, so it + // can be used as a dependency for any more nodes recorded from this + // queue. + GraphImpl->setLastInorderNode(MQueue, NodeImpl); + } else { + NodeImpl = GraphImpl->add(MCGType, std::move(CommandGroup)); + } // Associate an event with this new node and return the event. GraphImpl->addEventForNode(EventImpl, NodeImpl); + EventImpl->setCommandGraph(GraphImpl); + return detail::createSyclObjFromImpl(EventImpl); } @@ -842,18 +872,25 @@ void handler::depends_on(event Event) { throw sycl::exception(make_error_code(errc::invalid), "Queue operation cannot depend on discarded event."); } + if (auto Graph = getCommandGraph(); Graph) { + auto EventGraph = EventImpl->getCommandGraph(); + if (EventGraph == nullptr) { + throw sycl::exception( + make_error_code(errc::invalid), + "Graph nodes cannot depend on events from outside the graph."); + } + if (EventGraph != Graph) { + throw sycl::exception( + make_error_code(errc::invalid), + "Graph nodes cannot depend on events from another graph."); + } + } CGData.MEvents.push_back(EventImpl); } void handler::depends_on(const std::vector &Events) { for (const event &Event : Events) { - auto EventImpl = detail::getSyclObjImpl(Event); - if (EventImpl->isDiscarded()) { - throw sycl::exception( - make_error_code(errc::invalid), - "Queue operation cannot depend on discarded event."); - } - CGData.MEvents.push_back(EventImpl); + depends_on(Event); } } @@ -1020,8 +1057,15 @@ void handler::ext_oneapi_graph( // Store the node representing the subgraph in the handler so that we can // return it to the user later. MSubgraphNode = ParentGraph->addSubgraphNodes(GraphImpl->getSchedule()); + + // If we are recording an in-order queue remember the subgraph node, so it + // can be used as a dependency for any more nodes recorded from this queue. 
+ if (MQueue && MQueue->isInOrder()) { + ParentGraph->setLastInorderNode(MQueue, MSubgraphNode); + } // Associate an event with the subgraph node. auto SubgraphEvent = std::make_shared(); + SubgraphEvent->setCommandGraph(ParentGraph); ParentGraph->addEventForNode(SubgraphEvent, MSubgraphNode); } else { // Set the exec graph for execution during finalize. @@ -1029,5 +1073,13 @@ void handler::ext_oneapi_graph( } } +std::shared_ptr +handler::getCommandGraph() const { + if (MGraph) { + return MGraph; + } + return MQueue->getCommandGraph(); +} + } // __SYCL_INLINE_VER_NAMESPACE(_V1) } // namespace sycl diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index ed2c2c402b006..7b69e5bfe3001 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -76,6 +76,12 @@ context queue::get_context() const { return impl->get_context(); } device queue::get_device() const { return impl->get_device(); } +ext::oneapi::experimental::queue_state queue::ext_oneapi_get_state() const { + return impl->getCommandGraph() + ? 
ext::oneapi::experimental::queue_state::recording + : ext::oneapi::experimental::queue_state::executing; +} + bool queue::is_host() const { bool IsHost = impl->is_host(); assert(!IsHost && "queue::is_host should not be called in implementation."); diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index da37d95440a93..87974d7a256c4 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3851,6 +3851,10 @@ _ZN4sycl3_V16detail13MemoryManager20allocateMemSubBufferESt10shared_ptrINS1_12co _ZN4sycl3_V16detail13MemoryManager21copy_to_device_globalEPKvbSt10shared_ptrINS1_10queue_implEEmmS4_RKSt6vectorIP9_pi_eventSaISA_EEPSA_ _ZN4sycl3_V16detail13MemoryManager23copy_from_device_globalEPKvbSt10shared_ptrINS1_10queue_implEEmmPvRKSt6vectorIP9_pi_eventSaISB_EEPSB_ _ZN4sycl3_V16detail13MemoryManager24allocateInteropMemObjectESt10shared_ptrINS1_12context_implEEPvRKS3_INS1_10event_implEERKS5_RKNS0_13property_listERP9_pi_event +_ZN4sycl3_V16detail13MemoryManager29ext_oneapi_copyD2H_cmd_bufferESt10shared_ptrINS1_12context_implEEP22_pi_ext_command_bufferPNS1_11SYCLMemObjIEPvjNS0_5rangeILi3EEESC_NS0_2idILi3EEEjPcjSC_SE_jSt6vectorIjSaIjEEPj +_ZN4sycl3_V16detail13MemoryManager29ext_oneapi_copyH2D_cmd_bufferESt10shared_ptrINS1_12context_implEEP22_pi_ext_command_bufferPNS1_11SYCLMemObjIEPcjNS0_5rangeILi3EEENS0_2idILi3EEEjPvjSC_SC_SE_jSt6vectorIjSaIjEEPj +_ZN4sycl3_V16detail13MemoryManager29ext_oneapi_copyD2D_cmd_bufferESt10shared_ptrINS1_12context_implEEP22_pi_ext_command_bufferPNS1_11SYCLMemObjIEPvjNS0_5rangeILi3EEESC_NS0_2idILi3EEEjSA_jSC_SC_SE_jSt6vectorIjSaIjEEPj +_ZN4sycl3_V16detail13MemoryManager30ext_oneapi_copy_usm_cmd_bufferESt10shared_ptrINS1_12context_implEEPKvP22_pi_ext_command_buffermPvSt6vectorIjSaIjEEPj _ZN4sycl3_V16detail13MemoryManager3mapEPNS1_11SYCLMemObjIEPvSt10shared_ptrINS1_10queue_implEENS0_6access4modeEjNS0_5rangeILi3EEESC_NS0_2idILi3EEEjSt6vectorIP9_pi_eventSaISH_EERSH_ 
_ZN4sycl3_V16detail13MemoryManager4copyEPNS1_11SYCLMemObjIEPvSt10shared_ptrINS1_10queue_implEEjNS0_5rangeILi3EEESA_NS0_2idILi3EEEjS5_S8_jSA_SA_SC_jSt6vectorIP9_pi_eventSaISF_EERSF_ _ZN4sycl3_V16detail13MemoryManager4fillEPNS1_11SYCLMemObjIEPvSt10shared_ptrINS1_10queue_implEEmPKcjNS0_5rangeILi3EEESC_NS0_2idILi3EEEjSt6vectorIP9_pi_eventSaISH_EERSH_ @@ -4155,6 +4159,7 @@ _ZNK4sycl3_V15queue12has_propertyINS0_8property5queue16enable_profilingEEEbv _ZNK4sycl3_V15queue12has_propertyINS0_8property5queue4cuda18use_default_streamEEEbv _ZNK4sycl3_V15queue12has_propertyINS0_8property5queue8in_orderEEEbv _ZNK4sycl3_V15queue16ext_oneapi_emptyEv +_ZNK4sycl3_V15queue20ext_oneapi_get_stateEv _ZNK4sycl3_V15queue28ext_codeplay_supports_fusionEv _ZNK4sycl3_V15queue3getEv _ZNK4sycl3_V15queue7is_hostEv @@ -4184,6 +4189,7 @@ _ZNK4sycl3_V16detail11device_impl8get_infoINS0_3ext5intel4info6device25gpu_eu_co _ZNK4sycl3_V16detail11device_impl8get_infoINS0_3ext5intel4info6device25max_compute_queue_indicesEEENT_11return_typeEv _ZNK4sycl3_V16detail11device_impl8get_infoINS0_3ext5intel4info6device4uuidEEENT_11return_typeEv _ZNK4sycl3_V16detail11device_impl8get_infoINS0_3ext5intel4info6device9device_idEEENT_11return_typeEv +_ZNK4sycl3_V16detail11device_impl8get_infoINS0_3ext6oneapi12experimental4info6device13graph_supportEEENT_11return_typeEv _ZNK4sycl3_V16detail11device_impl8get_infoINS0_3ext6oneapi12experimental4info6device12architectureEEENT_11return_typeEv _ZNK4sycl3_V16detail11device_impl8get_infoINS0_3ext6oneapi12experimental4info6device15max_work_groupsILi1EEEEENT_11return_typeEv _ZNK4sycl3_V16detail11device_impl8get_infoINS0_3ext6oneapi12experimental4info6device15max_work_groupsILi2EEEEENT_11return_typeEv @@ -4356,6 +4362,7 @@ _ZNK4sycl3_V16device8get_infoINS0_3ext5intel4info6device25gpu_eu_count_per_subsl _ZNK4sycl3_V16device8get_infoINS0_3ext5intel4info6device25max_compute_queue_indicesEEENS0_6detail19is_device_info_descIT_E11return_typeEv 
_ZNK4sycl3_V16device8get_infoINS0_3ext5intel4info6device4uuidEEENS0_6detail19is_device_info_descIT_E11return_typeEv _ZNK4sycl3_V16device8get_infoINS0_3ext5intel4info6device9device_idEEENS0_6detail19is_device_info_descIT_E11return_typeEv +_ZNK4sycl3_V16device8get_infoINS0_3ext6oneapi12experimental4info6device13graph_supportEEENS0_6detail19is_device_info_descIT_E11return_typeEv _ZNK4sycl3_V16device8get_infoINS0_3ext6oneapi12experimental4info6device12architectureEEENS0_6detail19is_device_info_descIT_E11return_typeEv _ZNK4sycl3_V16device8get_infoINS0_3ext6oneapi12experimental4info6device15max_work_groupsILi1EEEEENS0_6detail19is_device_info_descIT_E11return_typeEv _ZNK4sycl3_V16device8get_infoINS0_3ext6oneapi12experimental4info6device15max_work_groupsILi2EEEEENS0_6detail19is_device_info_descIT_E11return_typeEv @@ -4583,6 +4590,7 @@ _ZNK4sycl3_V17context8get_infoINS0_4info7context32atomic_memory_scope_capabiliti _ZNK4sycl3_V17context8get_infoINS0_4info7context7devicesEEENS0_6detail20is_context_info_descIT_E11return_typeEv _ZNK4sycl3_V17context8get_infoINS0_4info7context8platformEEENS0_6detail20is_context_info_descIT_E11return_typeEv _ZNK4sycl3_V17context9getNativeEv +_ZNK4sycl3_V17handler15getCommandGraphEv _ZNK4sycl3_V17handler17getContextImplPtrEv _ZNK4sycl3_V17handler27isStateExplicitKernelBundleEv _ZNK4sycl3_V17handler30getOrInsertHandlerKernelBundleEb diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index ce0094598eb4f..0729e804a003b 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -99,6 +99,8 @@ ??$get_info@Ugpu_slices@device@info@intel@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBAIXZ ??$get_info@Ugpu_subslices_per_slice@device@info@intel@ext@_V1@sycl@@@device@_V1@sycl@@QEBAIXZ ??$get_info@Ugpu_subslices_per_slice@device@info@intel@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBAIXZ 
+??$get_info@Ugraph_support@device@info@experimental@oneapi@ext@_V1@sycl@@@device@_V1@sycl@@QEBA?AW4graph_support_level@info@experimental@oneapi@ext@12@XZ +??$get_info@Ugraph_support@device@info@experimental@oneapi@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBA?AW4graph_support_level@info@experimental@oneapi@ext@23@XZ ??$get_info@Uhalf_fp_config@device@info@_V1@sycl@@@device@_V1@sycl@@QEBA?AV?$vector@W4fp_config@info@_V1@sycl@@V?$allocator@W4fp_config@info@_V1@sycl@@@std@@@std@@XZ ??$get_info@Uhost_unified_memory@device@info@_V1@sycl@@@device@_V1@sycl@@QEBA_NXZ ??$get_info@Uimage2d_max_height@device@info@_V1@sycl@@@device@_V1@sycl@@QEBA_KXZ @@ -943,11 +945,16 @@ ?ext_oneapi_barrier@handler@_V1@sycl@@QEAAXAEBV?$vector@Vevent@_V1@sycl@@V?$allocator@Vevent@_V1@sycl@@@std@@@std@@@Z ?ext_oneapi_barrier@handler@_V1@sycl@@QEAAXXZ ?ext_oneapi_can_access_peer@device@_V1@sycl@@QEAA_NAEBV123@W4peer_access@oneapi@ext@23@@Z +?ext_oneapi_copyD2D_cmd_buffer@MemoryManager@detail@_V1@sycl@@SAXV?$shared_ptr@Vcontext_impl@detail@_V1@sycl@@@std@@PEAU_pi_ext_command_buffer@@PEAVSYCLMemObjI@234@PEAXIV?$range@$02@34@4V?$id@$02@34@I3I445IV?$vector@IV?$allocator@I@std@@@6@PEAI@Z +?ext_oneapi_copyD2H_cmd_buffer@MemoryManager@detail@_V1@sycl@@SAXV?$shared_ptr@Vcontext_impl@detail@_V1@sycl@@@std@@PEAU_pi_ext_command_buffer@@PEAVSYCLMemObjI@234@PEAXIV?$range@$02@34@4V?$id@$02@34@IPEADI45IV?$vector@IV?$allocator@I@std@@@6@PEAI@Z +?ext_oneapi_copyH2D_cmd_buffer@MemoryManager@detail@_V1@sycl@@SAXV?$shared_ptr@Vcontext_impl@detail@_V1@sycl@@@std@@PEAU_pi_ext_command_buffer@@PEAVSYCLMemObjI@234@PEADIV?$range@$02@34@V?$id@$02@34@IPEAXI445IV?$vector@IV?$allocator@I@std@@@6@PEAI@Z +?ext_oneapi_copy_usm_cmd_buffer@MemoryManager@detail@_V1@sycl@@SAXV?$shared_ptr@Vcontext_impl@detail@_V1@sycl@@@std@@PEBXPEAU_pi_ext_command_buffer@@_KPEAXV?$vector@IV?$allocator@I@std@@@6@PEAI@Z ?ext_oneapi_disable_peer_access@device@_V1@sycl@@QEAAXAEBV123@@Z ?ext_oneapi_empty@queue@_V1@sycl@@QEBA_NXZ 
?ext_oneapi_enable_peer_access@device@_V1@sycl@@QEAAXAEBV123@@Z ?ext_oneapi_fill2d_impl@handler@_V1@sycl@@AEAAXPEAX_KPEBX111@Z ?ext_oneapi_get_default_context@platform@_V1@sycl@@QEBA?AVcontext@23@XZ +?ext_oneapi_get_state@queue@_V1@sycl@@QEBA?AW4queue_state@experimental@oneapi@ext@23@XZ ?ext_oneapi_graph@handler@_V1@sycl@@QEAAXV?$command_graph@$00@experimental@oneapi@ext@23@@Z ?ext_oneapi_graph@queue@_V1@sycl@@QEAA?AVevent@23@V?$command_graph@$00@experimental@oneapi@ext@23@AEBUcode_location@detail@23@@Z ?ext_oneapi_graph@queue@_V1@sycl@@QEAA?AVevent@23@V?$command_graph@$00@experimental@oneapi@ext@23@AEBV?$vector@Vevent@_V1@sycl@@V?$allocator@Vevent@_V1@sycl@@@std@@@std@@AEBUcode_location@detail@23@@Z @@ -1006,6 +1013,7 @@ ?getChannelType@UnsampledImageAccessorBaseHost@detail@_V1@sycl@@QEBA?AW4image_channel_type@34@XZ ?getChannelType@image_impl@detail@_V1@sycl@@QEBA?AW4image_channel_type@34@XZ ?getChannelType@image_plain@detail@_V1@sycl@@IEBA?AW4image_channel_type@34@XZ +?getCommandGraph@handler@_V1@sycl@@AEBA?AV?$shared_ptr@Vgraph_impl@detail@experimental@oneapi@ext@_V1@sycl@@@std@@XZ ?getContextImplPtr@handler@_V1@sycl@@AEBAAEBV?$shared_ptr@Vcontext_impl@detail@_V1@sycl@@@std@@XZ ?getCurrentDSODir@OSUtil@detail@_V1@sycl@@SA?AV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@XZ ?getDeviceFromHandler@detail@_V1@sycl@@YA?AVdevice@23@AEAVhandler@23@@Z diff --git a/sycl/unittests/Extensions/CommandGraph.cpp b/sycl/unittests/Extensions/CommandGraph.cpp index bb876f14484ed..1193439f196cc 100644 --- a/sycl/unittests/Extensions/CommandGraph.cpp +++ b/sycl/unittests/Extensions/CommandGraph.cpp @@ -40,54 +40,121 @@ class CommandGraphTest : public ::testing::Test { TEST_F(CommandGraphTest, AddNode) { auto GraphImpl = sycl::detail::getSyclObjImpl(Graph); - ASSERT_TRUE(GraphImpl->MRoots.size() == 0); + ASSERT_TRUE(GraphImpl->MRoots.empty()); - auto Node1 = Graph.add([&](sycl::handler &cgh) {}); - - ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node1) != nullptr); - 
ASSERT_TRUE(GraphImpl->MRoots.size() == 1); - ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node1)->MSuccessors.size() == 0); + auto Node1 = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task([]() {}); }); + ASSERT_NE(sycl::detail::getSyclObjImpl(Node1), nullptr); + ASSERT_FALSE(sycl::detail::getSyclObjImpl(Node1)->isEmpty()); + ASSERT_EQ(GraphImpl->MRoots.size(), 1lu); + ASSERT_EQ(*GraphImpl->MRoots.begin(), sycl::detail::getSyclObjImpl(Node1)); + ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node1)->MSuccessors.empty()); + ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node1)->MPredecessors.empty()); // Add a node which depends on the first - auto Node2 = Graph.add([&](sycl::handler &cgh) {}, - {experimental::property::node::depends_on(Node1)}); - ASSERT_TRUE(GraphImpl->MRoots.size() == 1); - ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node1)->MSuccessors.size() == 1); - ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node1)->MSuccessors.front() == - sycl::detail::getSyclObjImpl(Node2)); + auto Node2Deps = experimental::property::node::depends_on(Node1); + ASSERT_EQ(sycl::detail::getSyclObjImpl(Node2Deps.get_dependencies().front()), + sycl::detail::getSyclObjImpl(Node1)); + auto Node2 = Graph.add([&](sycl::handler &cgh) {}, {Node2Deps}); + ASSERT_NE(sycl::detail::getSyclObjImpl(Node2), nullptr); + ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node2)->isEmpty()); + ASSERT_EQ(GraphImpl->MRoots.size(), 1lu); + ASSERT_EQ(sycl::detail::getSyclObjImpl(Node1)->MSuccessors.size(), 1lu); + ASSERT_EQ(sycl::detail::getSyclObjImpl(Node1)->MSuccessors.front(), + sycl::detail::getSyclObjImpl(Node2)); + ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node1)->MPredecessors.empty()); + ASSERT_EQ(sycl::detail::getSyclObjImpl(Node2)->MPredecessors.size(), 1lu); // Add a third node which depends on both auto Node3 = Graph.add([&](sycl::handler &cgh) {}, {experimental::property::node::depends_on(Node1, Node2)}); - ASSERT_TRUE(GraphImpl->MRoots.size() == 1); - 
ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node1)->MSuccessors.size() == 2); - ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node2)->MSuccessors.size() == 1); + ASSERT_NE(sycl::detail::getSyclObjImpl(Node3), nullptr); + ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node3)->isEmpty()); + ASSERT_EQ(GraphImpl->MRoots.size(), 1lu); + ASSERT_EQ(sycl::detail::getSyclObjImpl(Node1)->MSuccessors.size(), 2lu); + ASSERT_EQ(sycl::detail::getSyclObjImpl(Node2)->MSuccessors.size(), 1lu); + ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node1)->MPredecessors.empty()); + ASSERT_EQ(sycl::detail::getSyclObjImpl(Node2)->MPredecessors.size(), 1lu); + ASSERT_EQ(sycl::detail::getSyclObjImpl(Node3)->MPredecessors.size(), 2lu); // Add a fourth node without any dependencies on the others auto Node4 = Graph.add([&](sycl::handler &cgh) {}); - ASSERT_TRUE(GraphImpl->MRoots.size() == 2); - ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node1)->MSuccessors.size() == 2); - ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node2)->MSuccessors.size() == 1); - ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node3)->MSuccessors.size() == 0); + ASSERT_NE(sycl::detail::getSyclObjImpl(Node4), nullptr); + ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node4)->isEmpty()); + ASSERT_EQ(GraphImpl->MRoots.size(), 2lu); + ASSERT_EQ(sycl::detail::getSyclObjImpl(Node1)->MSuccessors.size(), 2lu); + ASSERT_EQ(sycl::detail::getSyclObjImpl(Node2)->MSuccessors.size(), 1lu); + ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node3)->MSuccessors.empty()); + ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node1)->MPredecessors.empty()); + ASSERT_EQ(sycl::detail::getSyclObjImpl(Node2)->MPredecessors.size(), 1lu); + ASSERT_EQ(sycl::detail::getSyclObjImpl(Node3)->MPredecessors.size(), 2lu); + ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node4)->MPredecessors.empty()); +} + +TEST_F(CommandGraphTest, Finalize) { + auto GraphImpl = sycl::detail::getSyclObjImpl(Graph); + + sycl::buffer Buf(1); + auto Node1 = Graph.add([&](sycl::handler &cgh) { + sycl::accessor A(Buf, cgh, 
sycl::write_only, sycl::no_init); + cgh.single_task([=]() { A[0] = 1; }); + }); + + // Add independent node + auto Node2 = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task([]() {}); }); + + // Add a node that depends on Node1 due to the accessor + auto Node3 = Graph.add([&](sycl::handler &cgh) { + sycl::accessor A(Buf, cgh, sycl::write_only, sycl::no_init); + cgh.single_task([=]() { A[0] = 3; }); + }); + + // Guarantee order of independent nodes 1 and 2 + Graph.make_edge(Node2, Node1); + + auto GraphExec = Graph.finalize(); + auto GraphExecImpl = sycl::detail::getSyclObjImpl(GraphExec); + + // The final schedule should contain three nodes in order: 2->1->3 + auto Schedule = GraphExecImpl->getSchedule(); + ASSERT_EQ(Schedule.size(), 3ul); + auto ScheduleIt = Schedule.begin(); + ASSERT_EQ(*ScheduleIt, sycl::detail::getSyclObjImpl(Node2)); + ScheduleIt++; + ASSERT_EQ(*ScheduleIt, sycl::detail::getSyclObjImpl(Node1)); + ScheduleIt++; + ASSERT_EQ(*ScheduleIt, sycl::detail::getSyclObjImpl(Node3)); + ASSERT_EQ(Queue.get_context(), GraphExecImpl->getContext()); } TEST_F(CommandGraphTest, MakeEdge) { auto GraphImpl = sycl::detail::getSyclObjImpl(Graph); - auto Node1 = Graph.add([&](sycl::handler &cgh) {}); + // Add two independent nodes + auto Node1 = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task([]() {}); }); auto Node2 = Graph.add([&](sycl::handler &cgh) {}); - ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node1)->MSuccessors.size() == 0); - ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node2)->MPredecessors.size() == 0); + ASSERT_EQ(GraphImpl->MRoots.size(), 2ul); + ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node1)->MSuccessors.empty()); + ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node1)->MPredecessors.empty()); + ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node2)->MSuccessors.empty()); + ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node2)->MPredecessors.empty()); + // Connect nodes and verify order Graph.make_edge(Node1, Node2); - - 
ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node1)->MSuccessors.size() == 1); - ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node2)->MPredecessors.size() == 1); + ASSERT_EQ(GraphImpl->MRoots.size(), 1ul); + ASSERT_EQ(sycl::detail::getSyclObjImpl(Node1)->MSuccessors.size(), 1lu); + ASSERT_EQ(sycl::detail::getSyclObjImpl(Node1)->MSuccessors.front(), + sycl::detail::getSyclObjImpl(Node2)); + ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node1)->MPredecessors.empty()); + ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node2)->MSuccessors.empty()); + ASSERT_EQ(sycl::detail::getSyclObjImpl(Node2)->MPredecessors.size(), 1lu); } TEST_F(CommandGraphTest, BeginEndRecording) { - sycl::queue Queue2{Dev}; + sycl::queue Queue2{Queue.get_context(), Dev}; // Test throwing behaviour // Check we can repeatedly begin recording on the same queues @@ -157,3 +224,371 @@ TEST_F(CommandGraphTest, BeginEndRecording) { // Vector end should still return true as Queue will have state changed ASSERT_TRUE(Graph.end_recording({Queue, Queue2})); } + +TEST_F(CommandGraphTest, GetCGCopy) { + auto Node1 = Graph.add([&](sycl::handler &cgh) {}); + auto Node2 = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task([]() {}); }, + {experimental::property::node::depends_on(Node1)}); + + // Get copy of CG of Node2 and check equality + auto Node2Imp = sycl::detail::getSyclObjImpl(Node2); + auto Node2CGCopy = Node2Imp->getCGCopy(); + ASSERT_EQ(Node2CGCopy->getType(), Node2Imp->MCGType); + ASSERT_EQ(Node2CGCopy->getType(), sycl::detail::CG::Kernel); + ASSERT_EQ(Node2CGCopy->getType(), Node2Imp->MCommandGroup->getType()); + ASSERT_EQ(Node2CGCopy->getAccStorage(), + Node2Imp->MCommandGroup->getAccStorage()); + ASSERT_EQ(Node2CGCopy->getArgsStorage(), + Node2Imp->MCommandGroup->getArgsStorage()); + ASSERT_EQ(Node2CGCopy->getEvents(), Node2Imp->MCommandGroup->getEvents()); + ASSERT_EQ(Node2CGCopy->getRequirements(), + Node2Imp->MCommandGroup->getRequirements()); + ASSERT_EQ(Node2CGCopy->getSharedPtrStorage(), + 
Node2Imp->MCommandGroup->getSharedPtrStorage());
+}
+TEST_F(CommandGraphTest, SubGraph) {
+  // Add sub-graph with two nodes
+  auto Node1Graph = Graph.add(
+      [&](sycl::handler &cgh) { cgh.single_task([]() {}); });
+  auto Node2Graph = Graph.add(
+      [&](sycl::handler &cgh) { cgh.single_task([]() {}); },
+      {experimental::property::node::depends_on(Node1Graph)});
+  auto GraphExec = Graph.finalize();
+
+  // Add node to main graph followed by sub-graph and another node
+  experimental::command_graph MainGraph(Queue.get_context(), Dev);
+  auto Node1MainGraph = MainGraph.add(
+      [&](sycl::handler &cgh) { cgh.single_task([]() {}); });
+  auto Node2MainGraph =
+      MainGraph.add([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); },
+                    {experimental::property::node::depends_on(Node1MainGraph)});
+  auto Node3MainGraph = MainGraph.add(
+      [&](sycl::handler &cgh) { cgh.single_task([]() {}); },
+      {experimental::property::node::depends_on(Node2MainGraph)});
+
+  // Assert the structure of the main graph after splicing in the sub-graph:
+  // the sub-graph node itself is empty, and the sub-graph's nodes are wired
+  // in as successors of Node1MainGraph.
+  ASSERT_NE(sycl::detail::getSyclObjImpl(Node2MainGraph), nullptr);
+  ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node2MainGraph)->isEmpty());
+  ASSERT_EQ(sycl::detail::getSyclObjImpl(MainGraph)->MRoots.size(), 1lu);
+  ASSERT_EQ(sycl::detail::getSyclObjImpl(Node1MainGraph)->MSuccessors.size(),
+            1lu);
+  ASSERT_EQ(sycl::detail::getSyclObjImpl(Node1MainGraph)->MSuccessors.front(),
+            sycl::detail::getSyclObjImpl(Node1Graph));
+  ASSERT_EQ(sycl::detail::getSyclObjImpl(Node2MainGraph)->MSuccessors.size(),
+            1lu);
+  ASSERT_EQ(sycl::detail::getSyclObjImpl(Node1MainGraph)->MPredecessors.size(),
+            0lu);
+  ASSERT_EQ(sycl::detail::getSyclObjImpl(Node2MainGraph)->MPredecessors.size(),
+            1lu);
+
+  // Finalize main graph and check schedule
+  auto MainGraphExec = MainGraph.finalize();
+  auto MainGraphExecImpl = sycl::detail::getSyclObjImpl(MainGraphExec);
+  auto Schedule = MainGraphExecImpl->getSchedule();
+  auto ScheduleIt = Schedule.begin();
+  ASSERT_EQ(Schedule.size(), 4ul);
+  ASSERT_EQ(*ScheduleIt, sycl::detail::getSyclObjImpl(Node1MainGraph));
+  ScheduleIt++;
+  ASSERT_EQ(*ScheduleIt, sycl::detail::getSyclObjImpl(Node1Graph));
+  ScheduleIt++;
+  ASSERT_EQ(*ScheduleIt, sycl::detail::getSyclObjImpl(Node2Graph));
+  ScheduleIt++;
+  ASSERT_EQ(*ScheduleIt, sycl::detail::getSyclObjImpl(Node3MainGraph));
+  ASSERT_EQ(Queue.get_context(), MainGraphExecImpl->getContext());
+}
+
+TEST_F(CommandGraphTest, RecordSubGraph) {
+  // Record sub-graph with two nodes
+  Graph.begin_recording(Queue);
+  auto Node1Graph = Queue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task([]() {}); });
+  auto Node2Graph = Queue.submit([&](sycl::handler &cgh) {
+    cgh.depends_on(Node1Graph);
+    cgh.single_task([]() {});
+  });
+  Graph.end_recording(Queue);
+  auto GraphExec = Graph.finalize();
+
+  // Add node to main graph followed by sub-graph and another node
+  experimental::command_graph MainGraph(Queue.get_context(), Dev);
+  MainGraph.begin_recording(Queue);
+  auto Node1MainGraph = Queue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task([]() {}); });
+  auto Node2MainGraph = Queue.submit([&](handler &cgh) {
+    cgh.depends_on(Node1MainGraph);
+    cgh.ext_oneapi_graph(GraphExec);
+  });
+  auto Node3MainGraph = Queue.submit([&](sycl::handler &cgh) {
+    cgh.depends_on(Node2MainGraph);
+    cgh.single_task([]() {});
+  });
+  MainGraph.end_recording(Queue);
+
+  // Finalize main graph and check schedule
+  auto MainGraphExec = MainGraph.finalize();
+  auto MainGraphExecImpl = sycl::detail::getSyclObjImpl(MainGraphExec);
+  auto Schedule = MainGraphExecImpl->getSchedule();
+  auto ScheduleIt = Schedule.begin();
+  ASSERT_EQ(Schedule.size(), 4ul);
+
+  // The first and fourth nodes should have events associated with MainGraph
+  // but not Graph. The second and third nodes were added as a sub-graph and
+  // should have events associated with Graph but not MainGraph.
+  ASSERT_ANY_THROW(
+      sycl::detail::getSyclObjImpl(Graph)->getEventForNode(*ScheduleIt));
+  ASSERT_EQ(
+      sycl::detail::getSyclObjImpl(MainGraph)->getEventForNode(*ScheduleIt),
+      sycl::detail::getSyclObjImpl(Node1MainGraph));
+
+  ScheduleIt++;
+  ASSERT_ANY_THROW(
+      sycl::detail::getSyclObjImpl(MainGraph)->getEventForNode(*ScheduleIt));
+  ASSERT_EQ(sycl::detail::getSyclObjImpl(Graph)->getEventForNode(*ScheduleIt),
+            sycl::detail::getSyclObjImpl(Node1Graph));
+
+  ScheduleIt++;
+  ASSERT_ANY_THROW(
+      sycl::detail::getSyclObjImpl(MainGraph)->getEventForNode(*ScheduleIt));
+  ASSERT_EQ(sycl::detail::getSyclObjImpl(Graph)->getEventForNode(*ScheduleIt),
+            sycl::detail::getSyclObjImpl(Node2Graph));
+
+  ScheduleIt++;
+  ASSERT_ANY_THROW(
+      sycl::detail::getSyclObjImpl(Graph)->getEventForNode(*ScheduleIt));
+  ASSERT_EQ(
+      sycl::detail::getSyclObjImpl(MainGraph)->getEventForNode(*ScheduleIt),
+      sycl::detail::getSyclObjImpl(Node3MainGraph));
+  ASSERT_EQ(Queue.get_context(), MainGraphExecImpl->getContext());
+}
+
+TEST_F(CommandGraphTest, InOrderQueue) {
+  sycl::property_list Properties{sycl::property::queue::in_order()};
+  sycl::queue InOrderQueue{Dev, Properties};
+  experimental::command_graph
+      InOrderGraph{InOrderQueue.get_context(), InOrderQueue.get_device()};
+
+  // Record in-order queue with three nodes
+  InOrderGraph.begin_recording(InOrderQueue);
+  auto Node1Graph = InOrderQueue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task([]() {}); });
+
+  auto PtrNode1 =
+      sycl::detail::getSyclObjImpl(InOrderGraph)
+          ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue));
+  ASSERT_NE(PtrNode1, nullptr);
+  ASSERT_TRUE(PtrNode1->MPredecessors.empty());
+
+  auto Node2Graph = InOrderQueue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task([]() {}); });
+
+  auto PtrNode2 =
+      sycl::detail::getSyclObjImpl(InOrderGraph)
+          ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue));
+  ASSERT_NE(PtrNode2, nullptr);
+  ASSERT_NE(PtrNode2, PtrNode1);
+  ASSERT_EQ(PtrNode1->MSuccessors.size(), 1lu);
+  ASSERT_EQ(PtrNode1->MSuccessors.front(), PtrNode2);
+  ASSERT_EQ(PtrNode2->MPredecessors.size(), 1lu);
+  ASSERT_EQ(PtrNode2->MPredecessors.front().lock(), PtrNode1);
+
+  auto Node3Graph = InOrderQueue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task([]() {}); });
+
+  auto PtrNode3 =
+      sycl::detail::getSyclObjImpl(InOrderGraph)
+          ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue));
+  ASSERT_NE(PtrNode3, nullptr);
+  ASSERT_NE(PtrNode3, PtrNode2);
+  ASSERT_EQ(PtrNode2->MSuccessors.size(), 1lu);
+  ASSERT_EQ(PtrNode2->MSuccessors.front(), PtrNode3);
+  ASSERT_EQ(PtrNode3->MPredecessors.size(), 1lu);
+  ASSERT_EQ(PtrNode3->MPredecessors.front().lock(), PtrNode2);
+
+  InOrderGraph.end_recording(InOrderQueue);
+
+  // Finalize the recorded graph and check the schedule
+  auto GraphExec = InOrderGraph.finalize();
+  auto GraphExecImpl = sycl::detail::getSyclObjImpl(GraphExec);
+  auto Schedule = GraphExecImpl->getSchedule();
+  auto ScheduleIt = Schedule.begin();
+  ASSERT_EQ(Schedule.size(), 3ul);
+  ASSERT_EQ(*ScheduleIt, PtrNode1);
+  ScheduleIt++;
+  ASSERT_EQ(*ScheduleIt, PtrNode2);
+  ScheduleIt++;
+  ASSERT_EQ(*ScheduleIt, PtrNode3);
+  ASSERT_EQ(InOrderQueue.get_context(), GraphExecImpl->getContext());
+}
+
+TEST_F(CommandGraphTest, InOrderQueueWithEmpty) {
+  sycl::property_list Properties{sycl::property::queue::in_order()};
+  sycl::queue InOrderQueue{Dev, Properties};
+  experimental::command_graph
+      InOrderGraph{InOrderQueue.get_context(), InOrderQueue.get_device()};
+
+  // Record in-order queue with a regular node then empty node then a regular
+  // node
+  InOrderGraph.begin_recording(InOrderQueue);
+  auto Node1Graph = InOrderQueue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task([]() {}); });
+
+  auto PtrNode1 =
+      sycl::detail::getSyclObjImpl(InOrderGraph)
+          ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue));
+  ASSERT_NE(PtrNode1, nullptr);
+  ASSERT_TRUE(PtrNode1->MPredecessors.empty());
+
+  auto Node2Graph = InOrderQueue.submit([&](sycl::handler &cgh) {});
+
+  auto PtrNode2 =
+      sycl::detail::getSyclObjImpl(InOrderGraph)
+          ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue));
+  ASSERT_NE(PtrNode2, nullptr);
+  ASSERT_NE(PtrNode2, PtrNode1);
+  ASSERT_EQ(PtrNode1->MSuccessors.size(), 1lu);
+  ASSERT_EQ(PtrNode1->MSuccessors.front(), PtrNode2);
+  ASSERT_EQ(PtrNode2->MPredecessors.size(), 1lu);
+  ASSERT_EQ(PtrNode2->MPredecessors.front().lock(), PtrNode1);
+
+  auto Node3Graph = InOrderQueue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task([]() {}); });
+
+  auto PtrNode3 =
+      sycl::detail::getSyclObjImpl(InOrderGraph)
+          ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue));
+  ASSERT_NE(PtrNode3, nullptr);
+  ASSERT_NE(PtrNode3, PtrNode2);
+  ASSERT_EQ(PtrNode2->MSuccessors.size(), 1lu);
+  ASSERT_EQ(PtrNode2->MSuccessors.front(), PtrNode3);
+  ASSERT_EQ(PtrNode3->MPredecessors.size(), 1lu);
+  ASSERT_EQ(PtrNode3->MPredecessors.front().lock(), PtrNode2);
+
+  InOrderGraph.end_recording(InOrderQueue);
+
+  // Finalize the recorded graph and check the schedule
+  // Note that empty nodes are not scheduled
+  auto GraphExec = InOrderGraph.finalize();
+  auto GraphExecImpl = sycl::detail::getSyclObjImpl(GraphExec);
+  auto Schedule = GraphExecImpl->getSchedule();
+  auto ScheduleIt = Schedule.begin();
+  ASSERT_EQ(Schedule.size(), 2ul);
+  ASSERT_EQ(*ScheduleIt, PtrNode1);
+  ScheduleIt++;
+  ASSERT_EQ(*ScheduleIt, PtrNode3);
+  ASSERT_EQ(InOrderQueue.get_context(), GraphExecImpl->getContext());
+}
+
+TEST_F(CommandGraphTest, InOrderQueueWithEmptyFirst) {
+  sycl::property_list Properties{sycl::property::queue::in_order()};
+  sycl::queue InOrderQueue{Dev, Properties};
+  experimental::command_graph
+      InOrderGraph{InOrderQueue.get_context(), InOrderQueue.get_device()};
+
+  // Record in-order queue with an empty node then two regular nodes
+  InOrderGraph.begin_recording(InOrderQueue);
+  auto Node1Graph = InOrderQueue.submit([&](sycl::handler &cgh) {});
+
+  auto PtrNode1 =
+      sycl::detail::getSyclObjImpl(InOrderGraph)
+          ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue));
+  ASSERT_NE(PtrNode1, nullptr);
+  ASSERT_TRUE(PtrNode1->MPredecessors.empty());
+
+  auto Node2Graph = InOrderQueue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task([]() {}); });
+
+  auto PtrNode2 =
+      sycl::detail::getSyclObjImpl(InOrderGraph)
+          ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue));
+  ASSERT_NE(PtrNode2, nullptr);
+  ASSERT_NE(PtrNode2, PtrNode1);
+  ASSERT_EQ(PtrNode1->MSuccessors.size(), 1lu);
+  ASSERT_EQ(PtrNode1->MSuccessors.front(), PtrNode2);
+  ASSERT_EQ(PtrNode2->MPredecessors.size(), 1lu);
+  ASSERT_EQ(PtrNode2->MPredecessors.front().lock(), PtrNode1);
+
+  auto Node3Graph = InOrderQueue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task([]() {}); });
+
+  auto PtrNode3 =
+      sycl::detail::getSyclObjImpl(InOrderGraph)
+          ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue));
+  ASSERT_NE(PtrNode3, nullptr);
+  ASSERT_NE(PtrNode3, PtrNode2);
+  ASSERT_EQ(PtrNode2->MSuccessors.size(), 1lu);
+  ASSERT_EQ(PtrNode2->MSuccessors.front(), PtrNode3);
+  ASSERT_EQ(PtrNode3->MPredecessors.size(), 1lu);
+  ASSERT_EQ(PtrNode3->MPredecessors.front().lock(), PtrNode2);
+
+  InOrderGraph.end_recording(InOrderQueue);
+
+  // Finalize the recorded graph and check the schedule
+  // Note that empty nodes are not scheduled
+  auto GraphExec = InOrderGraph.finalize();
+  auto GraphExecImpl = sycl::detail::getSyclObjImpl(GraphExec);
+  auto Schedule = GraphExecImpl->getSchedule();
+  auto ScheduleIt = Schedule.begin();
+  ASSERT_EQ(Schedule.size(), 2ul);
+  ASSERT_EQ(*ScheduleIt, PtrNode2);
+  ScheduleIt++;
+  ASSERT_EQ(*ScheduleIt, PtrNode3);
+  ASSERT_EQ(InOrderQueue.get_context(), GraphExecImpl->getContext());
+}
+
+TEST_F(CommandGraphTest, InOrderQueueWithEmptyLast) {
+  sycl::property_list Properties{sycl::property::queue::in_order()};
+  sycl::queue InOrderQueue{Dev, Properties};
+  experimental::command_graph
+      InOrderGraph{InOrderQueue.get_context(), InOrderQueue.get_device()};
+
+  // Record in-order queue with two regular nodes then an empty node
+  InOrderGraph.begin_recording(InOrderQueue);
+  auto Node1Graph = InOrderQueue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task([]() {}); });
+
+  auto PtrNode1 =
+      sycl::detail::getSyclObjImpl(InOrderGraph)
+          ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue));
+  ASSERT_NE(PtrNode1, nullptr);
+  ASSERT_TRUE(PtrNode1->MPredecessors.empty());
+
+  auto Node2Graph = InOrderQueue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task([]() {}); });
+
+  auto PtrNode2 =
+      sycl::detail::getSyclObjImpl(InOrderGraph)
+          ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue));
+  ASSERT_NE(PtrNode2, nullptr);
+  ASSERT_NE(PtrNode2, PtrNode1);
+  ASSERT_EQ(PtrNode1->MSuccessors.size(), 1lu);
+  ASSERT_EQ(PtrNode1->MSuccessors.front(), PtrNode2);
+  ASSERT_EQ(PtrNode2->MPredecessors.size(), 1lu);
+  ASSERT_EQ(PtrNode2->MPredecessors.front().lock(), PtrNode1);
+
+  auto Node3Graph = InOrderQueue.submit([&](sycl::handler &cgh) {});
+
+  auto PtrNode3 =
+      sycl::detail::getSyclObjImpl(InOrderGraph)
+          ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue));
+  ASSERT_NE(PtrNode3, nullptr);
+  ASSERT_NE(PtrNode3, PtrNode2);
+  ASSERT_EQ(PtrNode2->MSuccessors.size(), 1lu);
+  ASSERT_EQ(PtrNode2->MSuccessors.front(), PtrNode3);
+  ASSERT_EQ(PtrNode3->MPredecessors.size(), 1lu);
+  ASSERT_EQ(PtrNode3->MPredecessors.front().lock(), PtrNode2);
+
+  InOrderGraph.end_recording(InOrderQueue);
+
+  // Finalize the recorded graph and check the schedule
+  // Note that empty nodes are not scheduled
+  auto GraphExec = InOrderGraph.finalize();
+  auto GraphExecImpl = sycl::detail::getSyclObjImpl(GraphExec);
+  auto Schedule = GraphExecImpl->getSchedule();
+  auto ScheduleIt = Schedule.begin();
+  ASSERT_EQ(Schedule.size(), 2ul);
+  ASSERT_EQ(*ScheduleIt, PtrNode1);
+  ScheduleIt++;
+  ASSERT_EQ(*ScheduleIt, PtrNode2);
+  ASSERT_EQ(InOrderQueue.get_context(), GraphExecImpl->getContext());
+}