diff --git a/sycl/include/sycl/detail/pi.h b/sycl/include/sycl/detail/pi.h index 4451c36038d11..7ec89c02eba0c 100644 --- a/sycl/include/sycl/detail/pi.h +++ b/sycl/include/sycl/detail/pi.h @@ -52,11 +52,10 @@ // 10.13 Added new PI_EXT_ONEAPI_QUEUE_DISCARD_EVENTS queue property. // 10.14 Add PI_EXT_INTEL_DEVICE_INFO_FREE_MEMORY as an extension for // piDeviceGetInfo. -// 10.15 Add new PI_EXT_ONEAPI_QUEUE_LAZY_EXECUTION queue property -// 10.16 Add command-buffer extension methods +// 10.15 Add command-buffer extension methods #define _PI_H_VERSION_MAJOR 10 -#define _PI_H_VERSION_MINOR 16 +#define _PI_H_VERSION_MINOR 15 #define _PI_STRING_HELPER(a) #a #define _PI_CONCAT(a, b) _PI_STRING_HELPER(a.b) @@ -572,14 +571,6 @@ constexpr pi_queue_properties PI_QUEUE_PROFILING_ENABLE = (1 << 1); constexpr pi_queue_properties PI_QUEUE_ON_DEVICE = (1 << 2); constexpr pi_queue_properties PI_QUEUE_ON_DEVICE_DEFAULT = (1 << 3); constexpr pi_queue_properties PI_EXT_ONEAPI_QUEUE_DISCARD_EVENTS = (1 << 4); -// In a lazy queue, enqueued commands are not submitted for execution -// immediately, instead they are submitted for execution once the queue is -// flushed. -// -// This is to enable prototyping of the SYCL_EXT_ONEAPI_GRAPH extension, -// before a native command-list interface in PI can be designed and -// implemented. -constexpr pi_queue_properties PI_EXT_ONEAPI_QUEUE_LAZY_EXECUTION = (1 << 5); using pi_result = _pi_result; using pi_platform_info = _pi_platform_info; diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp index f3fef118d3aa4..a9cc6798b73f8 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp @@ -27,6 +27,7 @@ namespace experimental { namespace detail { struct node_impl; struct graph_impl; +class exec_graph_impl; } // namespace detail @@ -125,18 +126,20 @@ template <> class __SYCL_EXPORT command_graph { public: command_graph() = delete; - command_graph(const std::shared_ptr &g, - const sycl::context &ctx) - : MTag(rand()), MCtx(ctx), impl(g) {} + command_graph(const std::shared_ptr &Graph, + const sycl::context &Ctx); private: template friend decltype(Obj::impl) sycl::detail::getSyclObjImpl(const Obj &SyclObject); + // Creates a backend representation of the graph in impl + void finalize_impl(); + int MTag; const sycl::context &MCtx; - std::shared_ptr impl; + std::shared_ptr impl; }; } // namespace experimental } // namespace oneapi diff --git a/sycl/include/sycl/properties/queue_properties.hpp b/sycl/include/sycl/properties/queue_properties.hpp index 643f16fbe7cb8..2457b4c561dcf 100644 --- a/sycl/include/sycl/properties/queue_properties.hpp +++ b/sycl/include/sycl/properties/queue_properties.hpp @@ -28,8 +28,6 @@ namespace property { namespace queue { class discard_events : public ::sycl::detail::DataLessProperty<::sycl::detail::DiscardEvents> {}; -class lazy_execution - : public ::sycl::detail::DataLessProperty<::sycl::detail::LazyExecution> {}; } // namespace queue } // namespace property @@ -67,9 +65,6 @@ template <> struct is_property : std::true_type {}; template <> -struct is_property - : std::true_type {}; -template <> struct is_property : std::true_type { }; template <> @@ -85,9 +80,6 @@ template <> struct is_property_of : std::true_type {}; template <> -struct is_property_of - : std::true_type {}; -template <> struct is_property_of : std::true_type {}; template <> diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 2ea4db7ab2c8c..91e7f07771984 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -1307,49 +1307,6 @@ pi_result resetCommandLists(pi_queue Queue) { pi_result _pi_context::getAvailableCommandList( pi_queue Queue, pi_command_list_ptr_t &CommandList, bool UseCopyEngine, bool AllowBatching, ze_command_queue_handle_t *ForcedCmdQueue) { - -#if SYCL_EXT_ONEAPI_GRAPH - // This is a hack. TODO: Proper CommandList allocation per Executable Graph. - if( Queue->Properties & PI_EXT_ONEAPI_QUEUE_LAZY_EXECUTION ) { - // TODO: Create new Command List. - if(Queue->LazyCommandListMap.empty()) { - const bool UseCopyEngine = false; - // Adding createCommandList() to LazyCommandListMap - ze_fence_handle_t ZeFence; - ZeStruct ZeFenceDesc; - ze_command_list_handle_t ZeCommandList; - - uint32_t QueueGroupOrdinal; - auto &QGroup = Queue->getQueueGroup(UseCopyEngine); - auto &ZeCommandQueue = - // ForcedCmdQueue ? *ForcedCmdQueue : - QGroup.getZeQueue(&QueueGroupOrdinal); - // if (ForcedCmdQueue) - // QueueGroupOrdinal = QGroup.getCmdQueueOrdinal(ZeCommandQueue); - - ZeStruct ZeCommandListDesc; - ZeCommandListDesc.commandQueueGroupOrdinal = QueueGroupOrdinal; - - ZE_CALL(zeCommandListCreate, - (Queue->Context->ZeContext, Queue->Device->ZeDevice, - &ZeCommandListDesc, &ZeCommandList)); - - ZE_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence)); - std::tie(CommandList, std::ignore) = Queue->LazyCommandListMap.insert( - std::pair( - ZeCommandList, - {ZeFence, false, ZeCommandQueue, QueueGroupOrdinal})); - - Queue->insertActiveBarriers(CommandList, UseCopyEngine); - // - CommandList->second.ZeFenceInUse = true; - } else { - CommandList = Queue->LazyCommandListMap.begin(); - } - return PI_SUCCESS; - } -#endif - // Immediate commandlists have been pre-allocated and are always available. if (Queue->Device->useImmediateCommandLists()) { CommandList = Queue->getQueueGroup(UseCopyEngine).getImmCmdList(); @@ -1588,13 +1545,6 @@ void _pi_queue::CaptureIndirectAccesses() { pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList, bool IsBlocking, bool OKToBatchCommand) { - // When executing a Graph, defer execution if this is a command - // which could be batched (i.e. likely a kernel submission) -#if SYCL_EXT_ONEAPI_GRAPH - if (this->Properties & PI_EXT_ONEAPI_QUEUE_LAZY_EXECUTION && OKToBatchCommand) - return PI_SUCCESS; -#endif - bool UseCopyEngine = CommandList->second.isCopy(this); // If the current LastCommandEvent is the nullptr, then it means @@ -3560,8 +3510,7 @@ pi_result piQueueCreate(pi_context Context, pi_device Device, PI_ASSERT(!(Properties & ~(PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | PI_QUEUE_PROFILING_ENABLE | PI_QUEUE_ON_DEVICE | PI_QUEUE_ON_DEVICE_DEFAULT | - PI_EXT_ONEAPI_QUEUE_DISCARD_EVENTS | - PI_EXT_ONEAPI_QUEUE_LAZY_EXECUTION)), + PI_EXT_ONEAPI_QUEUE_DISCARD_EVENTS)), PI_ERROR_INVALID_VALUE); PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); @@ -3835,18 +3784,6 @@ pi_result piQueueFinish(pi_queue Queue) { // Flushing cross-queue dependencies is covered by createAndRetainPiZeEventList, // so this can be left as a no-op. pi_result piQueueFlush(pi_queue Queue) { -#if SYCL_EXT_ONEAPI_GRAPH - if( Queue->Properties & PI_EXT_ONEAPI_QUEUE_LAZY_EXECUTION ) { - - pi_command_list_ptr_t CommandList{}; - // TODO: - CommandList = Queue->LazyCommandListMap.begin(); - - Queue->executeCommandList(CommandList, false, false); - } -#else - (void)Queue; -#endif return PI_SUCCESS; } diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index 28dfc16223448..6c14f72907206 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include +#include #include #include #include @@ -20,48 +21,30 @@ namespace oneapi { namespace experimental { namespace detail { -void graph_impl::exec(const std::shared_ptr &Queue) { +void exec_graph_impl::schedule() { if (MSchedule.empty()) { - for (auto Node : MRoots) { - Node->topology_sort(MSchedule); + for (auto Node : MGraphImpl->MRoots) { + Node->topology_sort(Node, MSchedule); } } - for (auto Node : MSchedule) - Node->exec(Queue); } -void graph_impl::exec_and_wait( - const std::shared_ptr &Queue) { - bool IsSubGraph = Queue->getIsGraphSubmitting(); - if (!IsSubGraph) { - Queue->setIsGraphSubmitting(true); - } -#if SYCL_EXT_ONEAPI_GRAPH - if (MFirst) { - exec(Queue); - MFirst = false; - } -#else - exec(Queue); -#endif - if (!IsSubGraph) { - Queue->setIsGraphSubmitting(false); - Queue->wait(); - } +sycl::event +exec_graph_impl::exec(const std::shared_ptr &Queue) { + // TODO: Support subgraphs + sycl::event RetEvent = enqueue(Queue); + // TODO: Remove this queue wait. Currently waiting on the event returned from + // graph execution does not work. + Queue->wait(); + return RetEvent; } void graph_impl::add_root(const std::shared_ptr &Root) { MRoots.insert(Root); - for (auto Node : MSchedule) - Node->MScheduled = false; - MSchedule.clear(); } void graph_impl::remove_root(const std::shared_ptr &Root) { MRoots.erase(Root); - for (auto Node : MSchedule) - Node->MScheduled = false; - MSchedule.clear(); } // Recursive check if a graph node or its successors contains a given kernel @@ -99,7 +82,7 @@ graph_impl::add(const std::shared_ptr &Impl, // TODO: Encapsulate in separate function to avoid duplication if (!Dep.empty()) { for (auto N : Dep) { - N->register_successor(NodeImpl); // register successor + N->register_successor(NodeImpl, N); // register successor this->remove_root(NodeImpl); // remove receiver from root node // list } @@ -118,10 +101,11 @@ graph_impl::add(const std::shared_ptr &Impl, sycl::handler Handler{Impl}; CGF(Handler); + // TODO: Do we need to pass event dependencies here for the explicit API? return this->add(Impl, Handler.MKernel, Handler.MNDRDesc, Handler.MOSModuleHandle, Handler.MKernelName, Handler.MAccStorage, Handler.MLocalAccStorage, - Handler.MRequirements, Handler.MArgs, {}); + Handler.MRequirements, Handler.MArgs, Dep); } std::shared_ptr graph_impl::add( @@ -133,7 +117,8 @@ std::shared_ptr graph_impl::add( const std::vector &LocalAccStorage, const std::vector &Requirements, const std::vector &Args, - const std::vector> &Dep) { + const std::vector> &Dep, + const std::vector> &DepEvents) { const std::shared_ptr &NodeImpl = std::make_shared( Impl, Kernel, NDRDesc, OSModuleHandle, KernelName, AccStorage, LocalAccStorage, Requirements, Args); @@ -154,9 +139,21 @@ std::shared_ptr graph_impl::add( // Add any deps determined from accessor arguments into the dependency list Deps.insert(Deps.end(), UniqueDeps.begin(), UniqueDeps.end()); + + // Add any nodes specified by event dependencies into the dependency list + for (auto Dep : DepEvents) { + if (auto NodeImpl = MEventsMap.find(Dep); NodeImpl != MEventsMap.end()) { + Deps.push_back(NodeImpl->second); + } else { + throw sycl::exception(errc::invalid, + "Event dependency from handler::depends_on does " + "not correspond to a node within the graph"); + } + } + if (!Deps.empty()) { for (auto N : Deps) { - N->register_successor(NodeImpl); // register successor + N->register_successor(NodeImpl, N); // register successor this->remove_root(NodeImpl); // remove receiver from root node // list } @@ -177,39 +174,201 @@ bool graph_impl::clear_queues() { return AnyQueuesCleared; } -void node_impl::exec(const std::shared_ptr &Queue - _CODELOCPARAMDEF(&CodeLoc)) { - std::vector Deps; - for (auto Sender : MPredecessors) - Deps.push_back(Sender->get_event()); +// Check if nodes are empty and if so loop back through predecessors until we +// find the real dependency. +void find_real_deps(std::vector &Deps, + std::shared_ptr CurrentNode) { + if (CurrentNode->is_empty()) { + for (auto &N : CurrentNode->MPredecessors) { + auto NodeImpl = N.lock(); + find_real_deps(Deps, NodeImpl); + } + } else { + // Check if the dependency has already been added. + if (std::find(Deps.begin(), Deps.end(), CurrentNode->MPiSyncPoint) == + Deps.end()) + Deps.push_back(CurrentNode->MPiSyncPoint); + } +} + +void exec_graph_impl::create_pi_command_buffers(sycl::device D, + const sycl::context &Ctx) { + // TODO we only have a single command-buffer per graph here, but + // this will need to be multiple command-buffers for non-trivial graphs + pi_ext_command_buffer OutCommandBuffer; + pi_ext_command_buffer_desc Desc{}; + auto ContextImpl = sycl::detail::getSyclObjImpl(Ctx); + const sycl::detail::plugin &Plugin = ContextImpl->getPlugin(); + auto DeviceImpl = sycl::detail::getSyclObjImpl(D); + pi_result Res = + Plugin.call_nocheck( + ContextImpl->getHandleRef(), DeviceImpl->getHandleRef(), &Desc, + &OutCommandBuffer); + if (Res != pi_result::PI_SUCCESS) { + throw sycl::exception(errc::invalid, "Failed to create PI command-buffer"); + } + + MPiCommandBuffers[D] = OutCommandBuffer; + + // TODO extract kernel bundle logic from enqueueImpKernel + for (auto Node : MSchedule) { + pi_kernel PiKernel = nullptr; + std::mutex *KernelMutex = nullptr; + pi_program PiProgram = nullptr; + + auto Kernel = Node->MKernel; + if (Kernel != nullptr) { + PiKernel = Kernel->getHandleRef(); + } else { + std::tie(PiKernel, KernelMutex, PiProgram) = + sycl::detail::ProgramManager::getInstance().getOrCreateKernel( + Node->MOSModuleHandle, ContextImpl, DeviceImpl, Node->MKernelName, + nullptr); + } + + sycl::detail::ProgramManager::KernelArgMask EliminatedArgMask; + if (nullptr == Node->MKernel || !Node->MKernel->isCreatedFromSource()) { + EliminatedArgMask = + sycl::detail::ProgramManager::getInstance() + .getEliminatedKernelArgMask(Node->MOSModuleHandle, PiProgram, + Node->MKernelName); + } + + auto SetFunc = [&Plugin, &PiKernel, &Ctx](sycl::detail::ArgDesc &Arg, + size_t NextTrueIndex) { + sycl::detail::SetArgBasedOnType( + Plugin, PiKernel, + nullptr /* TODO: Handle spec constants and pass device image here */, + nullptr /* TODO: Pass getMemAllocation function for buffers */, Ctx, + false, Arg, NextTrueIndex); + }; + std::vector Args; + sycl::detail::applyFuncOnFilteredArgs(EliminatedArgMask, Node->MArgs, + SetFunc); + + std::vector Deps; + for (auto &N : Node->MPredecessors) { + find_real_deps(Deps, N.lock()); + } + + // add commands + // Remember this information before the range dimensions are reversed + const bool HasLocalSize = (Node->MNDRDesc.LocalSize[0] != 0); + + size_t RequiredWGSize[3] = {0, 0, 0}; + size_t *LocalSize = nullptr; + + if (HasLocalSize) + LocalSize = &Node->MNDRDesc.LocalSize[0]; + else { + Plugin.call( + PiKernel, DeviceImpl->getHandleRef(), + PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE, sizeof(RequiredWGSize), + RequiredWGSize, /* param_value_size_ret = */ nullptr); + + const bool EnforcedLocalSize = + (RequiredWGSize[0] != 0 || RequiredWGSize[1] != 0 || + RequiredWGSize[2] != 0); + if (EnforcedLocalSize) + LocalSize = RequiredWGSize; + } + + Res = Plugin.call_nocheck< + sycl::detail::PiApiKind::piextCommandBufferNDRangeKernel>( + OutCommandBuffer, PiKernel, Node->MNDRDesc.Dims, + &Node->MNDRDesc.GlobalOffset[0], &Node->MNDRDesc.GlobalSize[0], + LocalSize, Deps.size(), Deps.size() ? Deps.data() : nullptr, + &Node->MPiSyncPoint); + + if (Res != pi_result::PI_SUCCESS) { + throw sycl::exception(errc::invalid, + "Failed to add kernel to PI command-buffer"); + } + } - // Enqueue kernel here instead of submit + Res = + Plugin.call_nocheck( + OutCommandBuffer); + if (Res != pi_result::PI_SUCCESS) { + throw sycl::exception(errc::invalid, + "Failed to finalize PI command-buffer"); + } +} +exec_graph_impl::~exec_graph_impl() { + MSchedule.clear(); + for (auto Iter : MPiCommandBuffers) { + const sycl::detail::plugin &Plugin = + sycl::detail::getSyclObjImpl(MContext)->getPlugin(); + auto CmdBuf = Iter.second; + pi_result Res = + Plugin.call_nocheck( + CmdBuf); + (void)Res; + assert(Res == pi_result::PI_SUCCESS); + } +} + +sycl::event exec_graph_impl::enqueue( + const std::shared_ptr &Queue) { std::vector RawEvents; - pi_event *OutEvent = nullptr; - auto NewEvent = std::make_shared(Queue); - NewEvent->setContextImpl(Queue->getContextImplPtr()); - NewEvent->setStateIncomplete(); - OutEvent = &NewEvent->getHandleRef(); + auto CreateNewEvent([&]() { + pi_event *OutEvent = nullptr; + auto NewEvent = std::make_shared(Queue); + NewEvent->setContextImpl(Queue->getContextImplPtr()); + NewEvent->setStateIncomplete(); + OutEvent = &NewEvent->getHandleRef(); + pi_result Res = + Queue->getPlugin().call_nocheck( + sycl::detail::getSyclObjImpl(Queue->get_context())->getHandleRef(), + OutEvent); + if (Res != pi_result::PI_SUCCESS) { + throw sycl::exception(errc::event, + "Failed to create event for node submission"); + } + return NewEvent; + }); +#if SYCL_EXT_ONEAPI_GRAPH + auto NewEvent = CreateNewEvent(); + pi_event *OutEvent = &NewEvent->getHandleRef(); + auto CommandBuffer = MPiCommandBuffers[Queue->get_device()]; pi_result Res = - Queue->getPlugin().call_nocheck( - sycl::detail::getSyclObjImpl(Queue->get_context())->getHandleRef(), - OutEvent); + Queue->getPlugin() + .call_nocheck( + CommandBuffer, Queue->getHandleRef(), RawEvents.size(), + RawEvents.empty() ? nullptr : &RawEvents[0], OutEvent); if (Res != pi_result::PI_SUCCESS) { throw sycl::exception(errc::event, - "Failed to create event for node submission"); + "Failed to enqueue event for node submission"); } - pi_int32 Result = enqueueImpKernel( - Queue, MNDRDesc, MArgs, /* KernelBundleImpPtr */ nullptr, MKernel, - MKernelName, MOSModuleHandle, RawEvents, OutEvent, nullptr); - if (Result != pi_result::PI_SUCCESS) { - throw sycl::exception(errc::kernel, "Error enqueuing graph node kernel"); +#else + std::vector> ScheduledEvents; + for (auto &NodeImpl : MSchedule) { + std::vector RawEvents; + auto NewEvent = CreateNewEvent(); + pi_event *OutEvent = &NewEvent->getHandleRef(); + pi_int32 Res = sycl::detail::enqueueImpKernel( + Queue, NodeImpl->MNDRDesc, NodeImpl->MArgs, + nullptr /* TODO: Handle KernelBundles */, NodeImpl->MKernel, + NodeImpl->MKernelName, NodeImpl->MOSModuleHandle, RawEvents, OutEvent, + nullptr /* TODO: Pass mem allocation func for accessors */); + if (Res != pi_result::PI_SUCCESS) { + throw sycl::exception( + sycl::errc::kernel, + "Error during emulated graph command group submission."); + } + ScheduledEvents.push_back(NewEvent); } + // Create an event which has all kernel events as dependencies + auto NewEvent = std::make_shared(Queue); + NewEvent->setStateIncomplete(); + NewEvent->getPreparedDepsEvents() = ScheduledEvents; +#endif + sycl::event QueueEvent = sycl::detail::createSyclObjFromImpl(NewEvent); - Queue->addEvent(QueueEvent); - MEvent = QueueEvent; + return QueueEvent; } } // namespace detail @@ -251,7 +410,8 @@ void command_graph::make_edge(node Sender, std::shared_ptr ReceiverImpl = sycl::detail::getSyclObjImpl(Receiver); - SenderImpl->register_successor(ReceiverImpl); // register successor + SenderImpl->register_successor(ReceiverImpl, + SenderImpl); // register successor impl->remove_root(ReceiverImpl); // remove receiver from root node list } @@ -322,6 +482,23 @@ bool command_graph::end_recording( return QueueStateChanged; } +command_graph::command_graph( + const std::shared_ptr &Graph, const sycl::context &Ctx) + : MTag(rand()), MCtx(Ctx), + impl(std::make_shared(Ctx, Graph)) { + finalize_impl(); // Create backend representation for executable graph +} + +void command_graph::finalize_impl() { + // Create PI command-buffers for each device in the finalized context + impl->schedule(); +#if SYCL_EXT_ONEAPI_GRAPH + for (auto device : MCtx.get_devices()) { + impl->create_pi_command_buffers(device, MCtx); + } +#endif +} + } // namespace experimental } // namespace oneapi } // namespace ext diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index 6bfc274c35176..f820a0baea0d3 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -28,29 +28,18 @@ namespace oneapi { namespace experimental { namespace detail { -class wrapper { - using T = std::function; - T MFunc; - std::vector MDeps; - -public: - wrapper(T Func, const std::vector &Deps) - : MFunc(Func), MDeps(Deps){}; - - void operator()(sycl::handler &CGH) { - CGH.depends_on(MDeps); - std::invoke(MFunc, CGH); - } -}; - struct node_impl { - bool MScheduled; - std::shared_ptr MGraph; - sycl::event MEvent; + /// ID representing this node in the graph + /// TODO this should be attached to an executable graph, rather than + /// a modifiable graph + pi_ext_sync_point MPiSyncPoint; + // List of successors to this node. std::vector> MSuccessors; - std::vector> MPredecessors; + // List of predecessors to this node. Using weak_ptr here to prevent circular + // references between nodes. + std::vector> MPredecessors; /// Kernel to be executed by this node std::shared_ptr MKernel; @@ -71,22 +60,20 @@ struct node_impl { // may go out of scope before execution. std::vector> MArgStorage; - void exec(const std::shared_ptr &Queue - _CODELOCPARAM(&CodeLoc)); + bool MIsEmpty = false; - void register_successor(const std::shared_ptr &Node) { + void register_successor(const std::shared_ptr &Node, + const std::shared_ptr &Prev) { MSuccessors.push_back(Node); - Node->register_predecessor(std::shared_ptr(this)); + Node->register_predecessor(Prev); } void register_predecessor(const std::shared_ptr &Node) { MPredecessors.push_back(Node); } - sycl::event get_event(void) const { return MEvent; } - node_impl(const std::shared_ptr &Graph) - : MScheduled(false), MGraph(Graph) {} + : MGraph(Graph), MIsEmpty(true) {} node_impl( const std::shared_ptr &Graph, @@ -97,7 +84,7 @@ struct node_impl { const std::vector &LocalAccStorage, const std::vector &Requirements, const std::vector &args) - : MScheduled(false), MGraph(Graph), MKernel(Kernel), MNDRDesc(NDRDesc), + : MGraph(Graph), MKernel(Kernel), MNDRDesc(NDRDesc), MOSModuleHandle(OSModuleHandle), MKernelName(KernelName), MAccStorage(AccStorage), MLocalAccStorage(LocalAccStorage), MRequirements(Requirements), MArgs(args), MArgStorage() { @@ -116,14 +103,17 @@ struct node_impl { } // Recursively adding nodes to execution stack: - void topology_sort(std::list> &Schedule) { - MScheduled = true; + void topology_sort(std::shared_ptr NodeImpl, + std::list> &Schedule) { for (auto Next : MSuccessors) { - if (!Next->MScheduled) - Next->topology_sort(Schedule); + // Check if we've already scheduled this node + if (std::find(Schedule.begin(), Schedule.end(), Next) == Schedule.end()) + Next->topology_sort(Next, Schedule); } - if (MKernel != nullptr) - Schedule.push_front(std::shared_ptr(this)); + // We don't need to schedule empty nodes as they are only used when + // calculating dependencies + if (!NodeImpl->is_empty()) + Schedule.push_front(NodeImpl); } bool has_arg(const sycl::detail::ArgDesc &Arg) { @@ -140,19 +130,15 @@ struct node_impl { } return false; } + + bool is_empty() const { return MIsEmpty; } }; struct graph_impl { std::set> MRoots; - std::list> MSchedule; - // TODO: Change one time initialization to per executable object - bool MFirst; std::shared_ptr MParent; - void exec(const std::shared_ptr &); - void exec_and_wait(const std::shared_ptr &); - void add_root(const std::shared_ptr &); void remove_root(const std::shared_ptr &); @@ -165,7 +151,9 @@ struct graph_impl { const std::vector &LocalAccStorage, const std::vector &Requirements, const std::vector &Args, - const std::vector> &Dep = {}); + const std::vector> &Dep = {}, + const std::vector> &DepEvents = + {}); std::shared_ptr add(const std::shared_ptr &Impl, @@ -177,7 +165,7 @@ struct graph_impl { add(const std::shared_ptr &Impl, const std::vector> &Dep = {}); - graph_impl() : MFirst(true) {} + graph_impl() = default; /// Add a queue to the set of queues which are currently recording to this /// graph. @@ -198,8 +186,45 @@ struct graph_impl { /// removed. bool clear_queues(); + void add_event_for_node(std::shared_ptr EventImpl, + std::shared_ptr NodeImpl) { + MEventsMap[EventImpl] = NodeImpl; + } + private: std::set> MRecordingQueues; + // Map of events to their associated recorded nodes. + std::unordered_map, + std::shared_ptr> + MEventsMap; +}; + +class exec_graph_impl { +public: + exec_graph_impl(sycl::context Context, + const std::shared_ptr &GraphImpl) + : MSchedule(), MGraphImpl(GraphImpl), MPiCommandBuffers(), + MContext(Context) {} + ~exec_graph_impl(); + /// Add nodes to MSchedule + void schedule(); + /// Enqueues the backend objects for the graph to the parametrized queue + sycl::event enqueue(const std::shared_ptr &); + /// Called by handler::ext_oneapi_command_graph() to schedule graph for + /// execution + sycl::event exec(const std::shared_ptr &); + /// Turns our internal graph representation into PI command-buffers for a + /// device + void create_pi_command_buffers(sycl::device D, const sycl::context &Ctx); + +private: + std::list> MSchedule; + // Pointer to the modifiable graph impl associated with this executable graph + std::shared_ptr MGraphImpl; + // Map of devices to command buffers + std::unordered_map MPiCommandBuffers; + // Context associated with this executable graph + sycl::context MContext; }; } // namespace detail diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 61c860ab3b060..6e9eab5828d2f 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -279,14 +279,6 @@ void queue_impl::wait(const detail::code_location &CodeLoc) { TelemetryEvent = instrumentationProlog(CodeLoc, Name, StreamID, IId); #endif -#if SYCL_EXT_ONEAPI_GRAPH - if (has_property()) { - const detail::plugin &Plugin = getPlugin(); - if (Plugin.getBackend() == backend::ext_oneapi_level_zero) - Plugin.call(getHandleRef()); - } -#endif - std::vector> WeakEvents; std::vector SharedEvents; { diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index ba67dcfe482d9..ce121ab290af4 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -318,10 +318,7 @@ class queue_impl { // queue property. CreationFlags |= PI_EXT_ONEAPI_QUEUE_DISCARD_EVENTS; } - if (MPropList - .has_property()) { - CreationFlags |= PI_EXT_ONEAPI_QUEUE_LAZY_EXECUTION; - } + RT::PiQueue Queue{}; RT::PiContext Context = MContext->getHandleRef(); RT::PiDevice Device = MDevice->getHandleRef(); @@ -652,7 +649,8 @@ class queue_impl { // Command graph which is associated with this queue for the purposes of // recording commands to it. - std::shared_ptr MGraph; + std::shared_ptr MGraph = + nullptr; // This flag is set to true if a command_graph is currently submitting // commands to this queue. Used by subgraphs to determine if they are part of // a larger command graph submission. diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 8f5f18a090aa8..e38ce4075750d 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -84,7 +84,7 @@ static std::string deviceToString(device Device) { return "UNKNOWN"; } -static void applyFuncOnFilteredArgs( +void applyFuncOnFilteredArgs( const ProgramManager::KernelArgMask &EliminatedArgMask, std::vector &Args, std::function Func) { @@ -1985,6 +1985,72 @@ static void ReverseRangeDimensionsForKernel(NDRDescT &NDR) { } } +void SetArgBasedOnType( + const detail::plugin &Plugin, RT::PiKernel Kernel, + const std::shared_ptr &DeviceImageImpl, + const std::function &getMemAllocationFunc, + const sycl::context &Context, bool IsHost, detail::ArgDesc &Arg, + size_t NextTrueIndex) { + switch (Arg.MType) { + case kernel_param_kind_t::kind_stream: + break; + case kernel_param_kind_t::kind_accessor: { + Requirement *Req = (Requirement *)(Arg.MPtr); + if (getMemAllocationFunc == nullptr) + throw sycl::exception(make_error_code(errc::kernel_argument), + "placeholder accessor must be bound by calling " + "handler::require() before it can be used."); + + RT::PiMem MemArg = (RT::PiMem)getMemAllocationFunc(Req); + if (Plugin.getBackend() == backend::opencl) { + Plugin.call(Kernel, NextTrueIndex, + sizeof(RT::PiMem), &MemArg); + } else { + Plugin.call(Kernel, NextTrueIndex, + &MemArg); + } + break; + } + case kernel_param_kind_t::kind_std_layout: { + Plugin.call(Kernel, NextTrueIndex, Arg.MSize, + Arg.MPtr); + break; + } + case kernel_param_kind_t::kind_sampler: { + sampler *SamplerPtr = (sampler *)Arg.MPtr; + RT::PiSampler Sampler = + detail::getSyclObjImpl(*SamplerPtr)->getOrCreateSampler(Context); + Plugin.call(Kernel, NextTrueIndex, + &Sampler); + break; + } + case kernel_param_kind_t::kind_pointer: { + Plugin.call(Kernel, NextTrueIndex, + Arg.MSize, Arg.MPtr); + break; + } + case kernel_param_kind_t::kind_specialization_constants_buffer: { + if (IsHost) { + throw sycl::feature_not_supported( + "SYCL2020 specialization constants are not yet supported on host " + "device", + PI_ERROR_INVALID_OPERATION); + } + assert(DeviceImageImpl != nullptr); + RT::PiMem SpecConstsBuffer = DeviceImageImpl->get_spec_const_buffer_ref(); + // Avoid taking an address of nullptr + RT::PiMem *SpecConstsBufferArg = + SpecConstsBuffer ? &SpecConstsBuffer : nullptr; + Plugin.call(Kernel, NextTrueIndex, + SpecConstsBufferArg); + break; + } + case kernel_param_kind_t::kind_invalid: + throw runtime_error("Invalid kernel param kind", PI_ERROR_INVALID_VALUE); + break; + } +} + static pi_result SetKernelParamsAndLaunch( const QueueImplPtr &Queue, std::vector &Args, const std::shared_ptr &DeviceImageImpl, @@ -1996,64 +2062,9 @@ static pi_result SetKernelParamsAndLaunch( auto setFunc = [&Plugin, Kernel, &DeviceImageImpl, &getMemAllocationFunc, &Queue](detail::ArgDesc &Arg, size_t NextTrueIndex) { - switch (Arg.MType) { - case kernel_param_kind_t::kind_stream: - break; - case kernel_param_kind_t::kind_accessor: { - Requirement *Req = (Requirement *)(Arg.MPtr); - if (getMemAllocationFunc == nullptr) - throw sycl::exception(make_error_code(errc::kernel_argument), - "placeholder accessor must be bound by calling " - "handler::require() before it can be used."); - - RT::PiMem MemArg = (RT::PiMem)getMemAllocationFunc(Req); - if (Plugin.getBackend() == backend::opencl) { - Plugin.call(Kernel, NextTrueIndex, - sizeof(RT::PiMem), &MemArg); - } else { - Plugin.call(Kernel, NextTrueIndex, - &MemArg); - } - break; - } - case kernel_param_kind_t::kind_std_layout: { - Plugin.call(Kernel, NextTrueIndex, Arg.MSize, - Arg.MPtr); - break; - } - case kernel_param_kind_t::kind_sampler: { - sampler *SamplerPtr = (sampler *)Arg.MPtr; - RT::PiSampler Sampler = detail::getSyclObjImpl(*SamplerPtr) - ->getOrCreateSampler(Queue->get_context()); - Plugin.call(Kernel, NextTrueIndex, - &Sampler); - break; - } - case kernel_param_kind_t::kind_pointer: { - Plugin.call(Kernel, NextTrueIndex, - Arg.MSize, Arg.MPtr); - break; - } - case kernel_param_kind_t::kind_specialization_constants_buffer: { - if (Queue->is_host()) { - throw sycl::feature_not_supported( - "SYCL2020 specialization constants are not yet supported on host " - "device", - PI_ERROR_INVALID_OPERATION); - } - assert(DeviceImageImpl != nullptr); - RT::PiMem SpecConstsBuffer = DeviceImageImpl->get_spec_const_buffer_ref(); - // Avoid taking an address of nullptr - RT::PiMem *SpecConstsBufferArg = - SpecConstsBuffer ? &SpecConstsBuffer : nullptr; - Plugin.call(Kernel, NextTrueIndex, - SpecConstsBufferArg); - break; - } - case kernel_param_kind_t::kind_invalid: - throw runtime_error("Invalid kernel param kind", PI_ERROR_INVALID_VALUE); - break; - } + SetArgBasedOnType(Plugin, Kernel, DeviceImageImpl, getMemAllocationFunc, + Queue->get_context(), Queue->is_host(), Arg, + NextTrueIndex); }; applyFuncOnFilteredArgs(EliminatedArgMask, Args, setFunc); diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index 77afa4936bc0a..ab86f8fbd9f3f 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -601,6 +601,21 @@ class UpdateHostRequirementCommand : public Command { void **MDstPtr = nullptr; }; +// Sets arguments for a given kernel and device based on the argument type. +// Refactored from SetKernelParamsAndLaunch to allow it to be used in the graphs +// extension. +void SetArgBasedOnType( + const detail::plugin &Plugin, RT::PiKernel Kernel, + const std::shared_ptr &DeviceImageImpl, + const std::function &getMemAllocationFunc, + const sycl::context &Context, bool IsHost, detail::ArgDesc &Arg, + size_t NextTrueIndex); + +void applyFuncOnFilteredArgs( + const ProgramManager::KernelArgMask &EliminatedArgMask, + std::vector &Args, + std::function Func); + } // namespace detail } // __SYCL_INLINE_VER_NAMESPACE(_V1) } // namespace sycl diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index e0546e5066453..8a550137e6bfd 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -100,10 +100,15 @@ event handler::finalize() { if (auto GraphImpl = MQueue->getCommandGraph(); GraphImpl != nullptr) { // Extract relevant data from the handler and pass to graph to create a new // node representing this command group. - GraphImpl->add(GraphImpl, MKernel, MNDRDesc, MOSModuleHandle, MKernelName, - MAccStorage, MLocalAccStorage, MRequirements, MArgs, {}); - return detail::createSyclObjFromImpl( - std::make_shared()); + auto NodeImpl = GraphImpl->add( + GraphImpl, MKernel, MNDRDesc, MOSModuleHandle, MKernelName, MAccStorage, + MLocalAccStorage, MRequirements, MArgs, {}, MEvents); + + // Create and associated an event with this new node + auto EventImpl = std::make_shared(); + GraphImpl->add_event_for_node(EventImpl, NodeImpl); + + return detail::createSyclObjFromImpl(EventImpl); } std::shared_ptr KernelBundleImpPtr = nullptr; @@ -716,7 +721,9 @@ void handler::ext_oneapi_graph( ext::oneapi::experimental::graph_state::executable> Graph) { auto GraphImpl = detail::getSyclObjImpl(Graph); - GraphImpl->exec_and_wait(MQueue); + auto GraphCompletionEvent = GraphImpl->exec(MQueue); + auto EventImpl = detail::getSyclObjImpl(GraphCompletionEvent); + MEvents.push_back(EventImpl); } } // __SYCL_INLINE_VER_NAMESPACE(_V1) diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index 20fc669b8d591..47cdc71c70f9c 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -212,10 +212,5 @@ bool queue::device_has(aspect Aspect) const { // avoid creating sycl object from impl return impl->getDeviceImplPtr()->has(Aspect); } - -template __SYCL_EXPORT bool -queue::has_property() const; -template __SYCL_EXPORT ext::oneapi::property::queue::lazy_execution -queue::get_property() const; } // __SYCL_INLINE_VER_NAMESPACE(_V1) } // namespace sycl diff --git a/sycl/test/graph/graph-explicit-dotp-buffer.cpp b/sycl/test/graph/graph-explicit-dotp-buffer.cpp index 0b795714f98dd..d84b3483656e7 100644 --- a/sycl/test/graph/graph-explicit-dotp-buffer.cpp +++ b/sycl/test/graph/graph-explicit-dotp-buffer.cpp @@ -24,11 +24,7 @@ int main() { float beta = 2.0f; float gamma = 3.0f; - sycl::property_list properties{ - sycl::property::queue::in_order{}, - sycl::ext::oneapi::property::queue::lazy_execution{}}; - - sycl::queue q{sycl::gpu_selector_v, properties}; + sycl::queue q{sycl::gpu_selector_v}; sycl::ext::oneapi::experimental::command_graph g; diff --git a/sycl/test/graph/graph-explicit-dotp-device-mem.cpp b/sycl/test/graph/graph-explicit-dotp-device-mem.cpp index 3163a6fb2ea64..d7bbfe0880294 100644 --- a/sycl/test/graph/graph-explicit-dotp-device-mem.cpp +++ b/sycl/test/graph/graph-explicit-dotp-device-mem.cpp @@ -24,11 +24,7 @@ int main() { float beta = 2.0f; float gamma = 3.0f; - sycl::property_list properties{ - sycl::property::queue::in_order{}, - sycl::ext::oneapi::property::queue::lazy_execution{}}; - - sycl::queue q{sycl::gpu_selector_v, properties}; + sycl::queue q{sycl::gpu_selector_v}; sycl::ext::oneapi::experimental::command_graph g; @@ -68,12 +64,22 @@ int main() { auto node_c = g.add( [&](sycl::handler &h) { +#ifdef TEST_GRAPH_REDUCTIONS h.parallel_for(sycl::range<1>{n}, sycl::reduction(dotp, 0.0f, std::plus()), [=](sycl::id<1> it, auto &sum) { const size_t i = it[0]; sum += x[i] * z[i]; }); +#else + h.single_task([=]() { + // Doing a manual reduction here because reduction objects cause + // issues with graphs. + for (size_t j = 0; j < n; j++) { + dotp[0] += x[j] * z[j]; + } + }); +#endif }, {node_a, node_b}); diff --git a/sycl/test/graph/graph-explicit-dotp.cpp b/sycl/test/graph/graph-explicit-dotp.cpp index 7d0a5bcdae30e..0020db49737c9 100644 --- a/sycl/test/graph/graph-explicit-dotp.cpp +++ b/sycl/test/graph/graph-explicit-dotp.cpp @@ -24,11 +24,7 @@ int main() { float beta = 2.0f; float gamma = 3.0f; - sycl::property_list properties{ - sycl::property::queue::in_order{}, - sycl::ext::oneapi::property::queue::lazy_execution{}}; - - sycl::queue q{sycl::gpu_selector_v, properties}; + sycl::queue q{sycl::gpu_selector_v}; sycl::ext::oneapi::experimental::command_graph g; @@ -68,12 +64,22 @@ int main() { auto node_c = g.add( [&](sycl::handler &h) { +#ifdef TEST_GRAPH_REDUCTIONS h.parallel_for(sycl::range<1>{n}, sycl::reduction(dotp, 0.0f, std::plus()), [=](sycl::id<1> it, auto &sum) { const size_t i = it[0]; sum += x[i] * z[i]; }); +#else + h.single_task([=]() { + // Doing a manual reduction here because reduction objects cause + // issues with graphs. + for (size_t j = 0; j < n; j++) { + dotp[0] += x[j] * z[j]; + } + }); +#endif }, {node_a, node_b}); diff --git a/sycl/test/graph/graph-explicit-empty.cpp b/sycl/test/graph/graph-explicit-empty.cpp index 7917a98fdf2f4..9b3a9b5bf7726 100644 --- a/sycl/test/graph/graph-explicit-empty.cpp +++ b/sycl/test/graph/graph-explicit-empty.cpp @@ -5,46 +5,48 @@ int main() { - sycl::property_list properties{ - sycl::property::queue::in_order{}, - sycl::ext::oneapi::property::queue::lazy_execution{}}; - - sycl::queue q{sycl::gpu_selector_v, properties}; + sycl::queue q{sycl::gpu_selector_v}; sycl::ext::oneapi::experimental::command_graph g; const size_t n = 10; - float *h_arr = sycl::malloc_host(n, q); float *arr = sycl::malloc_device(n, q); - + auto start = g.add(); - auto init = g.add([&](sycl::handler &h) { - h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> idx) { - size_t i = idx; - arr[i] = 0; - }); - }, {start}); - + auto init = g.add( + [&](sycl::handler &h) { + h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> idx) { + size_t i = idx; + arr[i] = 0; + }); + }, + {start}); + auto empty = g.add({init}); + auto empty2 = g.add({empty}); - g.add([&](sycl::handler &h) { - h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> idx) { - size_t i = idx; - arr[i] = 1; - }); - }, {empty}); + g.add( + [&](sycl::handler &h) { + h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> idx) { + size_t i = idx; + arr[i] = 1; + }); + }, + {empty2}); auto executable_graph = g.finalize(q.get_context()); - q.submit([&](sycl::handler &h) { h.ext_oneapi_graph(executable_graph); }).wait(); + q.submit([&](sycl::handler &h) { + h.ext_oneapi_graph(executable_graph); + }).wait(); - q.memcpy(&(h_arr[0]), arr, n).wait(); + std::vector HostData(n); + q.memcpy(HostData.data(), arr, n * sizeof(float)).wait(); for (int i = 0; i < n; i++) - assert(h_arr[i] == 1.0f); + assert(HostData[i] == 1.f); - sycl::free(h_arr, q); sycl::free(arr, q); return 0; diff --git a/sycl/test/graph/graph-explicit-multiple-exec-graphs.cpp b/sycl/test/graph/graph-explicit-multiple-exec-graphs.cpp new file mode 100644 index 0000000000000..ece78bda980f3 --- /dev/null +++ b/sycl/test/graph/graph-explicit-multiple-exec-graphs.cpp @@ -0,0 +1,100 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +#include + +#include + +const size_t n = 10; + +float host_gold_result() { + float alpha = 1.0f; + float beta = 2.0f; + float gamma = 3.0f; + + float sum = 0.0f; + + for (size_t i = 0; i < n; ++i) { + sum += (alpha * 1.0f + beta * 2.0f) * (gamma * 3.0f + beta * 2.0f); + } + + return sum; +} + +int main() { + float alpha = 1.0f; + float beta = 2.0f; + float gamma = 3.0f; + + sycl::queue q{sycl::gpu_selector_v}; + + sycl::ext::oneapi::experimental::command_graph g; + + float *dotp = sycl::malloc_shared(1, q); + + float *x = sycl::malloc_shared(n, q); + float *y = sycl::malloc_shared(n, q); + float *z = sycl::malloc_shared(n, q); + + /* init data on the device */ + auto n_i = g.add([&](sycl::handler &h) { + h.parallel_for(n, [=](sycl::id<1> it) { + const size_t i = it[0]; + x[i] = 1.0f; + y[i] = 2.0f; + z[i] = 3.0f; + dotp[0] = 0.0f; + }); + }); + + auto node_a = g.add( + [&](sycl::handler &h) { + h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) { + const size_t i = it[0]; + x[i] = alpha * x[i] + beta * y[i]; + }); + }, + {n_i}); + + auto node_b = g.add( + [&](sycl::handler &h) { + h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) { + const size_t i = it[0]; + z[i] = gamma * z[i] + beta * y[i]; + }); + }, + {n_i}); + + auto node_c = g.add( + [&](sycl::handler &h) { + h.single_task([=]() { + for (size_t j = 0; j < n; j++) { + dotp[0] += x[j] * z[j]; + } + }); + }, + {node_a, node_b}); + + auto executable_graph = g.finalize(q.get_context()); + + // Add an extra node for the second executable graph which modifies the output + auto node_d = + g.add([&](sycl::handler &h) { h.single_task([=]() { dotp[0] += 1; }); }, + {node_c}); + + auto executable_graph_2 = g.finalize(q.get_context()); + + // Using shortcut for executing a graph of commands + q.ext_oneapi_graph(executable_graph).wait(); + + assert(*dotp == host_gold_result()); + + q.ext_oneapi_graph(executable_graph_2).wait(); + + assert(*dotp == host_gold_result() + 1); + + sycl::free(dotp, q); + sycl::free(x, q); + sycl::free(y, q); + sycl::free(z, q); + + return 0; +} diff --git a/sycl/test/graph/graph-explicit-node-ordering.cpp b/sycl/test/graph/graph-explicit-node-ordering.cpp index 2ac11bbba28f2..5f6bf86bdbb23 100644 --- a/sycl/test/graph/graph-explicit-node-ordering.cpp +++ b/sycl/test/graph/graph-explicit-node-ordering.cpp @@ -5,11 +5,7 @@ int main() { - sycl::property_list properties{ - sycl::property::queue::in_order{}, - sycl::ext::oneapi::property::queue::lazy_execution{}}; - - sycl::queue q{sycl::gpu_selector_v, properties}; + sycl::queue q{sycl::gpu_selector_v}; sycl::ext::oneapi::experimental::command_graph g; diff --git a/sycl/test/graph/graph-explicit-queue-shortcuts.cpp b/sycl/test/graph/graph-explicit-queue-shortcuts.cpp index 8f6d0c668af8b..145e10c241c9b 100644 --- a/sycl/test/graph/graph-explicit-queue-shortcuts.cpp +++ b/sycl/test/graph/graph-explicit-queue-shortcuts.cpp @@ -5,11 +5,7 @@ int main() { - sycl::property_list properties{ - sycl::property::queue::in_order{}, - sycl::ext::oneapi::property::queue::lazy_execution{}}; - - sycl::queue q{sycl::gpu_selector_v, properties}; + sycl::queue q{sycl::gpu_selector_v}; // Test passing empty property list, which is the default sycl::property_list empty_properties; diff --git a/sycl/test/graph/graph-explicit-reduction.cpp b/sycl/test/graph/graph-explicit-reduction.cpp index d652c1d9e3ada..d18ef5dd48a3e 100644 --- a/sycl/test/graph/graph-explicit-reduction.cpp +++ b/sycl/test/graph/graph-explicit-reduction.cpp @@ -4,11 +4,7 @@ #include int main() { - sycl::property_list properties{ - sycl::property::queue::in_order{}, - sycl::ext::oneapi::property::queue::lazy_execution{}}; - - sycl::queue q{sycl::gpu_selector_v, properties}; + sycl::queue q{sycl::gpu_selector_v}; sycl::ext::oneapi::experimental::command_graph g; diff --git a/sycl/test/graph/graph-explicit-repeated-exec.cpp b/sycl/test/graph/graph-explicit-repeated-exec.cpp index 23b9f8fbead7b..9fe8c7cfe6708 100644 --- a/sycl/test/graph/graph-explicit-repeated-exec.cpp +++ b/sycl/test/graph/graph-explicit-repeated-exec.cpp @@ -5,11 +5,7 @@ int main() { - sycl::property_list properties{ - sycl::property::queue::in_order{}, - sycl::ext::oneapi::property::queue::lazy_execution{}}; - - sycl::queue q{sycl::gpu_selector_v, properties}; + sycl::queue q{sycl::gpu_selector_v}; sycl::ext::oneapi::experimental::command_graph g; diff --git a/sycl/test/graph/graph-explicit-saxpy.cpp b/sycl/test/graph/graph-explicit-saxpy.cpp index e8a422ad23f3a..fd4bf797d73ba 100644 --- a/sycl/test/graph/graph-explicit-saxpy.cpp +++ b/sycl/test/graph/graph-explicit-saxpy.cpp @@ -5,11 +5,7 @@ int main() { - sycl::property_list properties{ - sycl::property::queue::in_order{}, - sycl::ext::oneapi::property::queue::lazy_execution{}}; - - sycl::queue q{sycl::gpu_selector_v, properties}; + sycl::queue q{sycl::gpu_selector_v}; sycl::ext::oneapi::experimental::command_graph g; diff --git a/sycl/test/graph/graph-explicit-single-node.cpp b/sycl/test/graph/graph-explicit-single-node.cpp index b1461f308edc0..6047b6008cea8 100644 --- a/sycl/test/graph/graph-explicit-single-node.cpp +++ b/sycl/test/graph/graph-explicit-single-node.cpp @@ -6,11 +6,7 @@ int main() { - sycl::property_list properties{ - sycl::property::queue::in_order{}, - sycl::ext::oneapi::property::queue::lazy_execution{}}; - - sycl::queue q{sycl::gpu_selector_v, properties}; + sycl::queue q{sycl::gpu_selector_v}; sycl::ext::oneapi::experimental::command_graph g; @@ -23,7 +19,7 @@ int main() { g.add([&](sycl::handler &h) { h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> idx) { size_t i = idx; - arr[i] = 1; + arr[i] = 3.14f; }); }); @@ -42,7 +38,7 @@ int main() { }).wait(); for (int i = 0; i < n; i++) - assert(arr[i] == 1); + assert(arr[i] == 3.14f); sycl::free(arr, q); diff --git a/sycl/test/graph/graph-explicit-subgraph.cpp b/sycl/test/graph/graph-explicit-subgraph.cpp index 5510f62a50c9c..5b33e53d417f5 100644 --- a/sycl/test/graph/graph-explicit-subgraph.cpp +++ b/sycl/test/graph/graph-explicit-subgraph.cpp @@ -27,11 +27,7 @@ int main() { float beta = 2.0f; float gamma = 3.0f; - sycl::property_list properties{ - sycl::property::queue::in_order{}, - sycl::ext::oneapi::property::queue::lazy_execution{}}; - - sycl::queue q{sycl::gpu_selector_v, properties}; + sycl::queue q{sycl::gpu_selector_v}; sycl::ext::oneapi::experimental::command_graph g; sycl::ext::oneapi::experimental::command_graph subGraph; diff --git a/sycl/test/graph/graph-record-dotp-buffer.cpp b/sycl/test/graph/graph-record-dotp-buffer.cpp index 0ee55942f031b..39da200194539 100644 --- a/sycl/test/graph/graph-record-dotp-buffer.cpp +++ b/sycl/test/graph/graph-record-dotp-buffer.cpp @@ -24,11 +24,7 @@ int main() { float beta = 2.0f; float gamma = 3.0f; - sycl::property_list properties{ - sycl::property::queue::in_order(), - sycl::ext::oneapi::property::queue::lazy_execution{}}; - - sycl::queue q{sycl::gpu_selector_v, properties}; + sycl::queue q{sycl::gpu_selector_v}; sycl::ext::oneapi::experimental::command_graph g; diff --git a/sycl/test/graph/graph-record-dotp.cpp b/sycl/test/graph/graph-record-dotp.cpp index cfe9d59f39735..d6eb70f359370 100644 --- a/sycl/test/graph/graph-record-dotp.cpp +++ b/sycl/test/graph/graph-record-dotp.cpp @@ -24,11 +24,7 @@ int main() { float beta = 2.0f; float gamma = 3.0f; - sycl::property_list properties{ - sycl::property::queue::in_order(), - sycl::ext::oneapi::property::queue::lazy_execution{}}; - - sycl::queue q{sycl::gpu_selector_v, properties}; + sycl::queue q{sycl::gpu_selector_v}; sycl::ext::oneapi::experimental::command_graph g; @@ -41,7 +37,7 @@ int main() { g.begin_recording(q); /* init data on the device */ - q.submit([&](sycl::handler &h) { + auto initEvent = q.submit([&](sycl::handler &h) { h.parallel_for(n, [=](sycl::id<1> it) { const size_t i = it[0]; x[i] = 1.0f; @@ -50,14 +46,16 @@ int main() { }); }); - q.submit([&](sycl::handler &h) { + auto eventA = q.submit([&](sycl::handler &h) { + h.depends_on(initEvent); h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) { const size_t i = it[0]; x[i] = alpha * x[i] + beta * y[i]; }); }); - q.submit([&](sycl::handler &h) { + auto eventB = q.submit([&](sycl::handler &h) { + h.depends_on(initEvent); h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) { const size_t i = it[0]; z[i] = gamma * z[i] + beta * y[i]; @@ -65,6 +63,7 @@ int main() { }); q.submit([&](sycl::handler &h) { + h.depends_on({eventA, eventB}); #ifdef TEST_GRAPH_REDUCTIONS h.parallel_for(sycl::range<1>{n}, sycl::reduction(dotp, 0.0f, std::plus()), [=](sycl::id<1> it, auto &sum) { diff --git a/sycl/test/graph/graph-record-simple.cpp b/sycl/test/graph/graph-record-simple.cpp index d1956d8704d70..b5e844ff7adaa 100644 --- a/sycl/test/graph/graph-record-simple.cpp +++ b/sycl/test/graph/graph-record-simple.cpp @@ -8,11 +8,7 @@ int main() { const size_t n = 10; const float expectedValue = 7.f; - sycl::property_list properties{ - sycl::property::queue::in_order(), - sycl::ext::oneapi::property::queue::lazy_execution{}}; - - sycl::queue q{sycl::default_selector_v, properties}; + sycl::queue q{sycl::default_selector_v}; sycl::queue q2; sycl::ext::oneapi::experimental::command_graph g; diff --git a/sycl/test/graph/graph-record-temp-scope.cpp b/sycl/test/graph/graph-record-temp-scope.cpp index a4ac296dd6d29..1adb83e6e6cf0 100644 --- a/sycl/test/graph/graph-record-temp-scope.cpp +++ b/sycl/test/graph/graph-record-temp-scope.cpp @@ -20,11 +20,7 @@ void run_some_kernel(sycl::queue q, float *data) { int main() { - sycl::property_list properties{ - sycl::property::queue::in_order(), - sycl::ext::oneapi::property::queue::lazy_execution{}}; - - sycl::queue q{sycl::default_selector_v, properties}; + sycl::queue q{sycl::default_selector_v}; sycl::ext::oneapi::experimental::command_graph g; diff --git a/sycl/unittests/Extensions/CommandGraph.cpp b/sycl/unittests/Extensions/CommandGraph.cpp index ccaab4e0c1fef..94cb586b16872 100644 --- a/sycl/unittests/Extensions/CommandGraph.cpp +++ b/sycl/unittests/Extensions/CommandGraph.cpp @@ -35,15 +35,6 @@ class CommandGraphTest : public ::testing::Test { experimental::command_graph Graph; }; -TEST_F(CommandGraphTest, LazyQueueProperty) { - sycl::property_list Props{ - sycl::ext::oneapi::property::queue::lazy_execution{}}; - - sycl::queue Queue{Dev, Props}; - bool hasProp = - Queue.has_property(); - ASSERT_TRUE(hasProp); -} TEST_F(CommandGraphTest, AddNode) { using namespace sycl::ext::oneapi; @@ -98,11 +89,8 @@ TEST_F(CommandGraphTest, MakeEdge) { TEST_F(CommandGraphTest, BeginEndRecording) { using namespace sycl::ext::oneapi; - sycl::property_list Props{ - sycl::ext::oneapi::property::queue::lazy_execution{}}; - - sycl::queue Queue{Dev, Props}; - sycl::queue Queue2{Dev, Props}; + sycl::queue Queue{Dev}; + sycl::queue Queue2{Dev}; // Test throwing behaviour // Check we can repeatedly begin recording on the same queues