diff --git a/sycl/include/CL/sycl/detail/pi.def b/sycl/include/CL/sycl/detail/pi.def index 59dab0c4721a1..41e0839eaaa4c 100644 --- a/sycl/include/CL/sycl/detail/pi.def +++ b/sycl/include/CL/sycl/detail/pi.def @@ -103,6 +103,7 @@ _PI_API(piSamplerGetInfo) _PI_API(piSamplerRetain) _PI_API(piSamplerRelease) // Queue commands +_PI_API(piKernelLaunch) _PI_API(piEnqueueKernelLaunch) _PI_API(piEnqueueNativeKernel) _PI_API(piEnqueueEventsWait) diff --git a/sycl/include/CL/sycl/detail/pi.h b/sycl/include/CL/sycl/detail/pi.h index 0e15200d19a6b..5a5ec40e2dc71 100644 --- a/sycl/include/CL/sycl/detail/pi.h +++ b/sycl/include/CL/sycl/detail/pi.h @@ -626,6 +626,7 @@ constexpr pi_queue_properties PI_QUEUE_PROFILING_ENABLE = constexpr pi_queue_properties PI_QUEUE_ON_DEVICE = CL_QUEUE_ON_DEVICE; constexpr pi_queue_properties PI_QUEUE_ON_DEVICE_DEFAULT = CL_QUEUE_ON_DEVICE_DEFAULT; +constexpr pi_queue_properties PI_QUEUE_LAZY_EXECUTION = 1 << 10; using pi_result = _pi_result; using pi_platform_info = _pi_platform_info; @@ -1488,6 +1489,8 @@ __SYCL_EXPORT pi_result piSamplerRelease(pi_sampler sampler); // // Queue Commands // +__SYCL_EXPORT pi_result piKernelLaunch(pi_queue queue); + __SYCL_EXPORT pi_result piEnqueueKernelLaunch( pi_queue queue, pi_kernel kernel, pi_uint32 work_dim, const size_t *global_work_offset, const size_t *global_work_size, diff --git a/sycl/include/CL/sycl/detail/property_helper.hpp b/sycl/include/CL/sycl/detail/property_helper.hpp index e4e8eea1007a4..7cd346f8f4455 100644 --- a/sycl/include/CL/sycl/detail/property_helper.hpp +++ b/sycl/include/CL/sycl/detail/property_helper.hpp @@ -35,8 +35,9 @@ enum DataLessPropKind { UseDefaultStream = 8, DiscardEvents = 9, DeviceReadOnly = 10, + LazyExecution = 11, // Indicates the last known dataless property. - LastKnownDataLessPropKind = 10, + LastKnownDataLessPropKind = 11, // Exceeding 32 may cause ABI breaking change on some of OSes. DataLessPropKindSize = 32 }; diff --git a/sycl/include/CL/sycl/feature_test.hpp.in b/sycl/include/CL/sycl/feature_test.hpp.in index 94e0447a8e0e6..eb80556ed2c9f 100644 --- a/sycl/include/CL/sycl/feature_test.hpp.in +++ b/sycl/include/CL/sycl/feature_test.hpp.in @@ -47,6 +47,7 @@ namespace sycl { #define SYCL_EXT_ONEAPI_FREE_FUNCTION_QUERIES 1 #define SYCL_EXT_ONEAPI_GROUP_ALGORITHMS 1 #define SYCL_EXT_ONEAPI_GROUP_SORT 1 +#define SYCL_EXT_ONEAPI_LAZY_QUEUE 1 #define SYCL_EXT_ONEAPI_MAX_WORK_GROUP_QUERY 1 #define SYCL_EXT_ONEAPI_ND_RANGE_REDUCTIONS 1 #define SYCL_EXT_ONEAPI_DEFAULT_CONTEXT 1 diff --git a/sycl/include/CL/sycl/properties/queue_properties.hpp b/sycl/include/CL/sycl/properties/queue_properties.hpp index 76a3bfaea9373..c9fb6e88d890c 100644 --- a/sycl/include/CL/sycl/properties/queue_properties.hpp +++ b/sycl/include/CL/sycl/properties/queue_properties.hpp @@ -28,6 +28,8 @@ namespace property { namespace queue { class discard_events : public ::cl::sycl::detail::DataLessProperty< ::cl::sycl::detail::DiscardEvents> {}; +class lazy_execution : public ::cl::sycl::detail::DataLessProperty< + ::cl::sycl::detail::LazyExecution> {}; } // namespace queue } // namespace property @@ -63,6 +65,9 @@ template <> struct is_property : std::true_type {}; template <> +struct is_property + : std::true_type {}; +template <> struct is_property : std::true_type { }; template <> @@ -78,6 +83,9 @@ template <> struct is_property_of : std::true_type {}; template <> +struct is_property_of + : std::true_type {}; +template <> struct is_property_of : std::true_type {}; template <> diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp new file mode 100644 index 0000000000000..304a928f4e7e4 --- /dev/null +++ b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp @@ -0,0 +1,220 @@ +//==--------- graph.hpp --- SYCL graph extension ---------------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +#include +#include + +__SYCL_INLINE_NAMESPACE(cl) { +namespace sycl { +namespace ext { +namespace oneapi { +namespace experimental { +namespace detail { + +struct node_impl; + +struct graph_impl; + +using node_ptr = std::shared_ptr; + +using graph_ptr = std::shared_ptr; + +class wrapper { + using T = std::function; + T my_func; + std::vector my_deps; + +public: + wrapper(T t, const std::vector &deps) + : my_func(t), my_deps(deps){}; + + void operator()(sycl::handler &cgh) { + cgh.depends_on(my_deps); + std::invoke(my_func, cgh); + } +}; + +struct node_impl { + bool is_scheduled; + + graph_ptr my_graph; + sycl::event my_event; + + std::vector my_successors; + std::vector my_predecessors; + + std::function my_body; + + void exec(sycl::queue q) { + std::vector __deps; + for (auto i : my_predecessors) + __deps.push_back(i->get_event()); + my_event = q.submit(wrapper{my_body, __deps}); + } + + void register_successor(node_ptr n) { + my_successors.push_back(n); + n->register_predecessor(node_ptr(this)); + } + + void register_predecessor(node_ptr n) { my_predecessors.push_back(n); } + + sycl::event get_event(void) { return my_event; } + + template + node_impl(graph_ptr g, T cgf) + : is_scheduled(false), my_graph(g), my_body(cgf) {} + + // Recursively adding nodes to execution stack: + void topology_sort(std::list &schedule) { + is_scheduled = true; + for (auto i : my_successors) { + if (!i->is_scheduled) + i->topology_sort(schedule); + } + schedule.push_front(node_ptr(this)); + } +}; + +struct graph_impl { + std::set my_roots; + std::list my_schedule; + + graph_ptr parent; + + void exec(sycl::queue q) { + if (my_schedule.empty()) { + for (auto n : my_roots) { + n->topology_sort(my_schedule); + } + } + for (auto n : my_schedule) + n->exec(q); + } + + void exec_and_wait(sycl::queue q) { + exec(q); + q.wait(); + } + + void add_root(node_ptr n) { + my_roots.insert(n); + for (auto n : my_schedule) + n->is_scheduled = false; + my_schedule.clear(); + } + + void remove_root(node_ptr n) { + my_roots.erase(n); + for (auto n : my_schedule) + n->is_scheduled = false; + my_schedule.clear(); + } + + graph_impl() {} +}; + +} // namespace detail + +struct node { + // TODO: add properties to distinguish between empty, host, device nodes. + detail::node_ptr my_node; + detail::graph_ptr my_graph; + + template + node(detail::graph_ptr g, T cgf) + : my_graph(g), my_node(new detail::node_impl(g, cgf)){}; + void register_successor(node n) { my_node->register_successor(n.my_node); } + void exec(sycl::queue q, sycl::event = sycl::event()) { my_node->exec(q); } + + void set_root() { my_graph->add_root(my_node); } + + // TODO: Add query functions: is_root, ... +}; + +enum class graph_state{ + modifiable, + executable +}; + +template +class command_graph { +public: + // Adding empty node with [0..n] predecessors: + node add(const std::vector &dep = {}); + + // Adding device node: + template + node add(T cgf, const std::vector &dep = {}); + + // Adding dependency between two nodes. + void make_edge(node sender, node receiver); + + // TODO: Extend queue to directly submit graph + void exec_and_wait(sycl::queue q); + + command_graph finalize(const sycl::context &syclContext) const; + + command_graph() : my_graph(new detail::graph_impl()) {} + +private: + detail::graph_ptr my_graph; +}; + +template<> +class command_graph{ +public: + int my_tag; + const sycl::context& my_ctx; + + void exec_and_wait(sycl::queue q); + + command_graph() = delete; + + command_graph(detail::graph_ptr g, const sycl::context& ctx) + : my_graph(g) , my_ctx(ctx), my_tag(rand()) {} + +private: + detail::graph_ptr my_graph; +}; + +template<> template +node command_graph::add(T cgf, const std::vector &dep) { + node _node(my_graph, cgf); + if (!dep.empty()) { + for (auto n : dep) + this->make_edge(n, _node); + } else { + _node.set_root(); + } + return _node; +} + +template<> +void command_graph::make_edge(node sender, node receiver) { + sender.register_successor(receiver); // register successor + my_graph->remove_root(receiver.my_node); // remove receiver from root node + // list +} + +template<> +command_graph command_graph::finalize(const sycl::context &ctx) const { + return command_graph{ this->my_graph, ctx }; +} + +void command_graph::exec_and_wait(sycl::queue q) { my_graph->exec_and_wait(q); }; + +} // namespace experimental +} // namespace oneapi +} // namespace ext +} // namespace sycl +} // __SYCL_INLINE_NAMESPACE(cl) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 09de4b0bdaf44..058437e0285c2 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -872,6 +872,11 @@ bool _pi_queue::isInOrderQueue() const { return ((this->Properties & PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) == 0); } +bool _pi_queue::isEagerExec() const { + // If lazy exec queue property is not set, then it's an eager queue. + return ((this->Properties & PI_QUEUE_LAZY_EXECUTION) == 0); +} + pi_result _pi_queue::resetCommandList(pi_command_list_ptr_t CommandList, bool MakeAvailable) { bool UseCopyEngine = CommandList->second.isCopy(this); @@ -1135,137 +1140,142 @@ _pi_queue::_pi_queue(std::vector &ComputeQueues, } // Retrieve an available command list to be used in a PI call. -pi_result -_pi_context::getAvailableCommandList(pi_queue Queue, - pi_command_list_ptr_t &CommandList, - bool UseCopyEngine, bool AllowBatching) { - // Immediate commandlists have been pre-allocated and are always available. - if (UseImmediateCommandLists) { - CommandList = Queue->getQueueGroup(UseCopyEngine).getImmCmdList(); - return PI_SUCCESS; - } - - auto &CommandBatch = - UseCopyEngine ? Queue->CopyCommandBatch : Queue->ComputeCommandBatch; - // Handle batching of commands - // First see if there is an command-list open for batching commands - // for this queue. - if (Queue->hasOpenCommandList(UseCopyEngine)) { - if (AllowBatching) { - CommandList = CommandBatch.OpenCommandList; +pi_result _pi_context::getAvailableCommandList( + pi_queue Queue, pi_command_list_ptr_t &CommandList, bool UseCopyEngine, + bool AllowBatching, bool Graph) { + // TODO: Do proper CommandList allocation. This is a hack! + if (!Graph) { + // Immediate commandlists have been pre-allocated and are always available. + if (UseImmediateCommandLists) { + CommandList = Queue->getQueueGroup(UseCopyEngine).getImmCmdList(); return PI_SUCCESS; } - // If this command isn't allowed to be batched, then we need to - // go ahead and execute what is already in the batched list, - // and then go on to process this. On exit from executeOpenCommandList - // OpenCommandList will be invalidated. - if (auto Res = Queue->executeOpenCommandList(UseCopyEngine)) - return Res; - } - // Create/Reuse the command list, because in Level Zero commands are added to - // the command lists, and later are then added to the command queue. - // Each command list is paired with an associated fence to track when the - // command list is available for reuse. - _pi_result pi_result = PI_OUT_OF_RESOURCES; - ZeStruct ZeFenceDesc; - // Initally, we need to check if a command list has already been created - // on this device that is available for use. If so, then reuse that - // Level-Zero Command List and Fence for this PI call. - { - // Make sure to acquire the lock before checking the size, or there - // will be a race condition. - std::lock_guard lock(Queue->Context->ZeCommandListCacheMutex); - // Under mutex since operator[] does insertion on the first usage for every - // unique ZeDevice. - auto &ZeCommandListCache = - UseCopyEngine - ? Queue->Context->ZeCopyCommandListCache[Queue->Device->ZeDevice] - : Queue->Context - ->ZeComputeCommandListCache[Queue->Device->ZeDevice]; - - if (ZeCommandListCache.size() > 0) { - auto &ZeCommandList = ZeCommandListCache.front(); - auto it = Queue->CommandListMap.find(ZeCommandList); - if (it != Queue->CommandListMap.end()) { - CommandList = it; - CommandList->second.InUse = true; - } else { - // If there is a command list available on this context, but it - // wasn't yet used in this queue then create a new entry in this - // queue's map to hold the fence and other associated command - // list information. - uint32_t QueueGroupOrdinal; - auto &ZeCommandQueue = - Queue->getQueueGroup(UseCopyEngine).getZeQueue(&QueueGroupOrdinal); - - ze_fence_handle_t ZeFence; - ZE_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence)); - CommandList = - Queue->CommandListMap - .emplace(ZeCommandList, - pi_command_list_info_t{ZeFence, true, ZeCommandQueue, - QueueGroupOrdinal}) - .first; + auto &CommandBatch = + UseCopyEngine ? Queue->CopyCommandBatch : Queue->ComputeCommandBatch; + // Handle batching of commands + // First see if there is an command-list open for batching commands + // for this queue. + if (Queue->hasOpenCommandList(UseCopyEngine)) { + if (AllowBatching) { + CommandList = CommandBatch.OpenCommandList; + return PI_SUCCESS; } - ZeCommandListCache.pop_front(); - return PI_SUCCESS; + // If this command isn't allowed to be batched, then we need to + // go ahead and execute what is already in the batched list, + // and then go on to process this. On exit from executeOpenCommandList + // OpenCommandList will be invalidated. + if (auto Res = Queue->executeOpenCommandList(UseCopyEngine)) + return Res; } - } - - // If there are no available command lists in the cache, then we check for - // command lists that have already signalled, but have not been added to the - // available list yet. Each command list has a fence associated which tracks - // if a command list has completed dispatch of its commands and is ready for - // reuse. If a command list is found to have been signalled, then the - // command list & fence are reset and we return. - for (auto it = Queue->CommandListMap.begin(); - it != Queue->CommandListMap.end(); ++it) { - // Make sure this is the command list type needed. - if (UseCopyEngine != it->second.isCopy(Queue)) - continue; - ze_result_t ZeResult = - ZE_CALL_NOCHECK(zeFenceQueryStatus, (it->second.ZeFence)); - if (ZeResult == ZE_RESULT_SUCCESS) { - Queue->resetCommandList(it, false); - CommandList = it; - CommandList->second.InUse = true; - return PI_SUCCESS; + // Create/Reuse the command list, because in Level Zero commands are added + // to the command lists, and later are then added to the command queue. Each + // command list is paired with an associated fence to track when the command + // list is available for reuse. + _pi_result pi_result = PI_OUT_OF_RESOURCES; + ZeStruct ZeFenceDesc; + // Initally, we need to check if a command list has already been created + // on this device that is available for use. If so, then reuse that + // Level-Zero Command List and Fence for this PI call. + { + // Make sure to acquire the lock before checking the size, or there + // will be a race condition. + std::lock_guard lock(Queue->Context->ZeCommandListCacheMutex); + // Under mutex since operator[] does insertion on the first usage for + // every unique ZeDevice. + auto &ZeCommandListCache = + UseCopyEngine + ? Queue->Context->ZeCopyCommandListCache[Queue->Device->ZeDevice] + : Queue->Context + ->ZeComputeCommandListCache[Queue->Device->ZeDevice]; + + if (ZeCommandListCache.size() > 0) { + auto &ZeCommandList = ZeCommandListCache.front(); + auto it = Queue->CommandListMap.find(ZeCommandList); + if (it != Queue->CommandListMap.end()) { + CommandList = it; + CommandList->second.InUse = true; + } else { + // If there is a command list available on this context, but it + // wasn't yet used in this queue then create a new entry in this + // queue's map to hold the fence and other associated command + // list information. + uint32_t QueueGroupOrdinal; + auto &ZeCommandQueue = Queue->getQueueGroup(UseCopyEngine) + .getZeQueue(&QueueGroupOrdinal); + + ze_fence_handle_t ZeFence; + ZE_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence)); + CommandList = + Queue->CommandListMap + .emplace(ZeCommandList, + pi_command_list_info_t{ZeFence, true, ZeCommandQueue, + QueueGroupOrdinal}) + .first; + } + ZeCommandListCache.pop_front(); + return PI_SUCCESS; + } } - } - // If there are no available command lists nor signalled command lists, then - // we must create another command list if we have not exceed the maximum - // command lists we can create. - // Once created, this command list & fence are added to the command list fence - // map. - if (Queue->Device->Platform->ZeGlobalCommandListCount < - ZeMaxCommandListCacheSize) { - ze_command_list_handle_t ZeCommandList; - ze_fence_handle_t ZeFence; - - uint32_t QueueGroupOrdinal; - auto &ZeCommandQueue = - Queue->getQueueGroup(UseCopyEngine).getZeQueue(&QueueGroupOrdinal); - - ZeStruct ZeCommandListDesc; - ZeCommandListDesc.commandQueueGroupOrdinal = QueueGroupOrdinal; - - ZE_CALL(zeCommandListCreate, - (Queue->Context->ZeContext, Queue->Device->ZeDevice, - &ZeCommandListDesc, &ZeCommandList)); - // Increments the total number of command lists created on this platform. - Queue->Device->Platform->ZeGlobalCommandListCount++; + // If there are no available command lists in the cache, then we check for + // command lists that have already signalled, but have not been added to the + // available list yet. Each command list has a fence associated which tracks + // if a command list has completed dispatch of its commands and is ready for + // reuse. If a command list is found to have been signalled, then the + // command list & fence are reset and we return. + for (auto it = Queue->CommandListMap.begin(); + it != Queue->CommandListMap.end(); ++it) { + // Make sure this is the command list type needed. + if (UseCopyEngine != it->second.isCopy(Queue)) + continue; + + ze_result_t ZeResult = + ZE_CALL_NOCHECK(zeFenceQueryStatus, (it->second.ZeFence)); + if (ZeResult == ZE_RESULT_SUCCESS) { + Queue->resetCommandList(it, false); + CommandList = it; + CommandList->second.InUse = true; + return PI_SUCCESS; + } + } - ZE_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence)); - std::tie(CommandList, std::ignore) = Queue->CommandListMap.insert( - std::pair( - ZeCommandList, {ZeFence, true, ZeCommandQueue, QueueGroupOrdinal})); - pi_result = PI_SUCCESS; + // If there are no available command lists nor signalled command lists, then + // we must create another command list if we have not exceed the maximum + // command lists we can create. + // Once created, this command list & fence are added to the command list + // fence map. + if (Queue->Device->Platform->ZeGlobalCommandListCount < + ZeMaxCommandListCacheSize) { + ze_command_list_handle_t ZeCommandList; + ze_fence_handle_t ZeFence; + + uint32_t QueueGroupOrdinal; + auto &ZeCommandQueue = + Queue->getQueueGroup(UseCopyEngine).getZeQueue(&QueueGroupOrdinal); + + ZeStruct ZeCommandListDesc; + ZeCommandListDesc.commandQueueGroupOrdinal = QueueGroupOrdinal; + + ZE_CALL(zeCommandListCreate, + (Queue->Context->ZeContext, Queue->Device->ZeDevice, + &ZeCommandListDesc, &ZeCommandList)); + // Increments the total number of command lists created on this platform. + Queue->Device->Platform->ZeGlobalCommandListCount++; + + ZE_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence)); + std::tie(CommandList, std::ignore) = Queue->CommandListMap.insert( + std::pair( + ZeCommandList, + {ZeFence, true, ZeCommandQueue, QueueGroupOrdinal})); + pi_result = PI_SUCCESS; + } + return pi_result; + } else { + CommandList = Queue->CommandListMap.begin(); } - - return pi_result; + return PI_SUCCESS; } void _pi_queue::adjustBatchSizeForFullBatch(bool IsCopy) { @@ -1346,8 +1356,8 @@ void _pi_queue::CaptureIndirectAccesses() { } pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList, - bool IsBlocking, - bool OKToBatchCommand) { + bool IsBlocking, bool OKToBatchCommand, + bool Graph) { bool UseCopyEngine = CommandList->second.isCopy(this); // If the current LastCommandEvent is the nullptr, then it means @@ -1466,6 +1476,8 @@ pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList, } // Close the command list and have it ready for dispatch. + // TODO: Close command list only once before initial execution, but works as + // is. ZE_CALL(zeCommandListClose, (CommandList->first)); // Offload command list to the GPU for asynchronous execution auto ZeCommandList = CommandList->first; @@ -2051,10 +2063,11 @@ pi_result piPlatformsGet(pi_uint32 NumEntries, pi_platform *Platforms, } zePrint("Using events scope: %s\n", - EventsScope == AllHostVisible ? "all host-visible" - : EventsScope == OnDemandHostVisibleProxy - ? "on demand host-visible proxy" - : "only last command in a batch is host-visible"); + EventsScope == AllHostVisible + ? "all host-visible" + : EventsScope == OnDemandHostVisibleProxy + ? "on demand host-visible proxy" + : "only last command in a batch is host-visible"); return PI_SUCCESS; } @@ -3203,10 +3216,11 @@ pi_result piQueueCreate(pi_context Context, pi_device Device, pi_queue_properties Properties, pi_queue *Queue) { // Check that unexpected bits are not set. - PI_ASSERT(!(Properties & ~(PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | - PI_QUEUE_PROFILING_ENABLE | PI_QUEUE_ON_DEVICE | - PI_QUEUE_ON_DEVICE_DEFAULT)), - PI_INVALID_VALUE); + PI_ASSERT( + !(Properties & ~(PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | + PI_QUEUE_PROFILING_ENABLE | PI_QUEUE_ON_DEVICE | + PI_QUEUE_ON_DEVICE_DEFAULT | PI_QUEUE_LAZY_EXECUTION)), + PI_INVALID_VALUE); PI_ASSERT(Context, PI_INVALID_CONTEXT); PI_ASSERT(Queue, PI_INVALID_QUEUE); @@ -4929,12 +4943,12 @@ pi_result piKernelRelease(pi_kernel Kernel) { return PI_SUCCESS; } -pi_result -piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, - const size_t *GlobalWorkOffset, - const size_t *GlobalWorkSize, const size_t *LocalWorkSize, - pi_uint32 NumEventsInWaitList, - const pi_event *EventWaitList, pi_event *Event) { +pi_result piEnqueueKernel(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, + const size_t *GlobalWorkOffset, + const size_t *GlobalWorkSize, + const size_t *LocalWorkSize, + pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, pi_event *Event) { PI_ASSERT(Kernel, PI_INVALID_KERNEL); PI_ASSERT(Queue, PI_INVALID_QUEUE); PI_ASSERT(Event, PI_INVALID_EVENT); @@ -5099,14 +5113,55 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, pi_cast(ZeEvent)); printZeEventList((*Event)->WaitList); +#if 0 // Execute command list asynchronously, as the event will be used // to track down its completion. if (auto Res = Queue->executeCommandList(CommandList, false, true)) return Res; +#endif return PI_SUCCESS; } +pi_result piKernelLaunch(pi_queue Queue) { + + const bool Graph = !(Queue->isEagerExec()); + // const bool Graph = true; + + // TODO: Make sure (re-)execute specific command list. + + // Get a new command list to be used on this call + pi_command_list_ptr_t CommandList{}; + if (auto Res = Queue->Context->getAvailableCommandList( + Queue, CommandList, false /* PreferCopyEngine */, + true /* AllowBatching */, Graph /* Shortcut for Graph */)) + return Res; + + // Execute command list asynchronously, as the event will be used + // to track down its completion. + if (auto Res = Queue->executeCommandList(CommandList, false, true, Graph)) + return Res; + + return PI_SUCCESS; +} + +pi_result +piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, + const size_t *GlobalWorkOffset, + const size_t *GlobalWorkSize, const size_t *LocalWorkSize, + pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, pi_event *Event) { + auto Res = + piEnqueueKernel(Queue, Kernel, WorkDim, GlobalWorkOffset, GlobalWorkSize, + LocalWorkSize, NumEventsInWaitList, EventWaitList, Event); +#if 1 + if (Res == PI_SUCCESS && Queue->isEagerExec()) { + return piKernelLaunch(Queue); + } +#endif + return Res; +} + pi_result piextKernelCreateWithNativeHandle(pi_native_handle NativeHandle, pi_context Context, pi_program Program, diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp index a0f9ed530baf3..47eac0846f43a 100644 --- a/sycl/plugins/level_zero/pi_level_zero.hpp +++ b/sycl/plugins/level_zero/pi_level_zero.hpp @@ -636,6 +636,10 @@ struct _pi_context : _pi_object { std::unordered_map> ZeCopyCommandListCache; + // Single command list for graph api + + std::list ZeGraphCommandList; + // Retrieves a command list for executing on this device along with // a fence to be used in tracking the execution of this command list. // If a command list has been created on this device which has @@ -656,7 +660,8 @@ struct _pi_context : _pi_object { pi_result getAvailableCommandList(pi_queue Queue, pi_command_list_ptr_t &CommandList, bool UseCopyEngine = false, - bool AllowBatching = false); + bool AllowBatching = false, + bool Graph = false); // Get index of the free slot in the available pool. If there is no available // pool then create new one. The HostVisible parameter tells if we need a @@ -872,6 +877,8 @@ struct _pi_queue : _pi_object { // Returns true if the queue is a in-order queue. bool isInOrderQueue() const; + bool isEagerExec() const; + // adjust the queue's batch size, knowing that the current command list // is being closed with a full batch. // For copy commands, IsCopy is set to 'true'. @@ -912,7 +919,8 @@ struct _pi_queue : _pi_object { // For immediate commandlists, no close and execute is necessary. pi_result executeCommandList(pi_command_list_ptr_t CommandList, bool IsBlocking = false, - bool OKToBatchCommand = false); + bool OKToBatchCommand = false, + bool Graph = false); // If there is an open command list associated with this queue, // close it, execute it, and reset the corresponding OpenCommandList. diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 63e4132403ac7..7aefc92a3567f 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -276,6 +276,14 @@ void queue_impl::wait(const detail::code_location &CodeLoc) { TelemetryEvent = instrumentationProlog(CodeLoc, Name, StreamID, IId); #endif +#if 1 + if (has_property()) { + const detail::plugin &Plugin = getPlugin(); + if (Plugin.getBackend() == backend::ext_oneapi_level_zero) + Plugin.call(getHandleRef()); + } +#endif + std::vector> WeakEvents; std::vector SharedEvents; { diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 0c05db22d126a..504c28d1b435c 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -307,6 +307,9 @@ class queue_impl { ext::oneapi::cuda::property::queue::use_default_stream>()) { CreationFlags |= __SYCL_PI_CUDA_USE_DEFAULT_STREAM; } + if (has_property()) { + CreationFlags |= PI_QUEUE_LAZY_EXECUTION; + } RT::PiQueue Queue{}; RT::PiContext Context = MContext->getHandleRef(); RT::PiDevice Device = MDevice->getHandleRef();