diff --git a/sycl/include/sycl/detail/pi.h b/sycl/include/sycl/detail/pi.h index 492e43534dcdd..2246463d17255 100644 --- a/sycl/include/sycl/detail/pi.h +++ b/sycl/include/sycl/detail/pi.h @@ -52,9 +52,10 @@ // 10.13 Added new PI_EXT_ONEAPI_QUEUE_DISCARD_EVENTS queue property. // 10.14 Add PI_EXT_INTEL_DEVICE_INFO_FREE_MEMORY as an extension for // piDeviceGetInfo. +// 10.15 Add new PI_EXT_ONEAPI_QUEUE_LAZY_EXECUTION queue property #define _PI_H_VERSION_MAJOR 10 -#define _PI_H_VERSION_MINOR 14 +#define _PI_H_VERSION_MINOR 15 #define _PI_STRING_HELPER(a) #a #define _PI_CONCAT(a, b) _PI_STRING_HELPER(a.b) @@ -569,6 +570,14 @@ constexpr pi_queue_properties PI_QUEUE_PROFILING_ENABLE = (1 << 1); constexpr pi_queue_properties PI_QUEUE_ON_DEVICE = (1 << 2); constexpr pi_queue_properties PI_QUEUE_ON_DEVICE_DEFAULT = (1 << 3); constexpr pi_queue_properties PI_EXT_ONEAPI_QUEUE_DISCARD_EVENTS = (1 << 4); +// In a lazy queue, enqueued commands are not submitted for execution +// immediately, instead they are submitted for execution once the queue is +// flushed. +// +// This is to enable prototyping of the SYCL_EXT_ONEAPI_GRAPH extension, +// before a native command-list interface in PI can be designed and +// implemented. +constexpr pi_queue_properties PI_EXT_ONEAPI_QUEUE_LAZY_EXECUTION = (1 << 5); using pi_result = _pi_result; using pi_platform_info = _pi_platform_info; diff --git a/sycl/include/sycl/detail/property_helper.hpp b/sycl/include/sycl/detail/property_helper.hpp index ff0f4aa8568b0..5397da6053000 100644 --- a/sycl/include/sycl/detail/property_helper.hpp +++ b/sycl/include/sycl/detail/property_helper.hpp @@ -35,8 +35,9 @@ enum DataLessPropKind { UseDefaultStream = 8, DiscardEvents = 9, DeviceReadOnly = 10, + LazyExecution = 11, // Indicates the last known dataless property. - LastKnownDataLessPropKind = 10, + LastKnownDataLessPropKind = 11, // Exceeding 32 may cause ABI breaking change on some of OSes. DataLessPropKindSize = 32 }; diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp new file mode 100644 index 0000000000000..c172f23c29b8a --- /dev/null +++ b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp @@ -0,0 +1,106 @@ +//==--------- graph.hpp --- SYCL graph extension ---------------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include + +namespace sycl { +__SYCL_INLINE_VER_NAMESPACE(_V1) { + +class handler; +namespace ext { +namespace oneapi { +namespace experimental { + +namespace detail { +struct node_impl; +struct graph_impl; + +using node_ptr = std::shared_ptr; +using graph_ptr = std::shared_ptr; +} // namespace detail + +enum class graph_state { + modifiable, + executable, +}; + +class __SYCL_EXPORT node { +private: + node(detail::node_ptr Impl) : impl(Impl) {} + + template + friend decltype(Obj::impl) + sycl::detail::getSyclObjImpl(const Obj &SyclObject); + template + friend T sycl::detail::createSyclObjFromImpl(decltype(T::impl) ImplObj); + + detail::node_ptr impl; + detail::graph_ptr MGraph; +}; + +template +class __SYCL_EXPORT command_graph { +public: + command_graph(); + + // Adding empty node with [0..n] predecessors: + node add(const std::vector &dep = {}); + + // Adding device node: + template node add(T cgf, const std::vector &dep = {}) { + return add_impl(cgf, dep); + } + + // Adding dependency between two nodes. + void make_edge(node sender, node receiver); + + command_graph + finalize(const sycl::context &syclContext) const; + +private: + command_graph(detail::graph_ptr Impl) : impl(Impl) {} + + // Template-less implementation of add() + node add_impl(std::function cgf, + const std::vector &dep); + + template + friend decltype(Obj::impl) + sycl::detail::getSyclObjImpl(const Obj &SyclObject); + template + friend T sycl::detail::createSyclObjFromImpl(decltype(T::impl) ImplObj); + + detail::graph_ptr impl; +}; + +template <> class __SYCL_EXPORT command_graph { +public: + command_graph() = delete; + + command_graph(detail::graph_ptr g, const sycl::context &ctx) + : MTag(rand()), MCtx(ctx), impl(g) {} + +private: + template + friend decltype(Obj::impl) + sycl::detail::getSyclObjImpl(const Obj &SyclObject); + + int MTag; + const sycl::context &MCtx; + detail::graph_ptr impl; +}; +} // namespace experimental +} // namespace oneapi +} // namespace ext +} // __SYCL_INLINE_VER_NAMESPACE(_V1) +} // namespace sycl diff --git a/sycl/include/sycl/feature_test.hpp.in b/sycl/include/sycl/feature_test.hpp.in index 59d2ebed77e39..1b21acfcbf56a 100644 --- a/sycl/include/sycl/feature_test.hpp.in +++ b/sycl/include/sycl/feature_test.hpp.in @@ -47,6 +47,7 @@ __SYCL_INLINE_VER_NAMESPACE(_V1) { #define SYCL_EXT_ONEAPI_FREE_FUNCTION_QUERIES 1 #define SYCL_EXT_ONEAPI_GROUP_ALGORITHMS 1 #define SYCL_EXT_ONEAPI_GROUP_SORT 1 +#define SYCL_EXT_ONEAPI_LAZY_QUEUE 1 #define SYCL_EXT_ONEAPI_MAX_WORK_GROUP_QUERY 1 #define SYCL_EXT_ONEAPI_ND_RANGE_REDUCTIONS 1 #define SYCL_EXT_ONEAPI_DEFAULT_CONTEXT 1 diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 335daae9904d6..42e489c3eea1a 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -30,6 +30,8 @@ #include #include +#include + #include #include #include @@ -2516,6 +2518,13 @@ class __SYCL_EXPORT handler { /// \param Advice is a device-defined advice for the specified allocation. void mem_advise(const void *Ptr, size_t Length, int Advice); + /// Executes a command_graph. + /// + /// \param Graph Executable command_graph to run + void exec_graph(ext::oneapi::experimental::command_graph< + ext::oneapi::experimental::graph_state::executable> + Graph); + private: std::shared_ptr MImpl; std::shared_ptr MQueue; diff --git a/sycl/include/sycl/properties/queue_properties.hpp b/sycl/include/sycl/properties/queue_properties.hpp index 2457b4c561dcf..643f16fbe7cb8 100644 --- a/sycl/include/sycl/properties/queue_properties.hpp +++ b/sycl/include/sycl/properties/queue_properties.hpp @@ -28,6 +28,8 @@ namespace property { namespace queue { class discard_events : public ::sycl::detail::DataLessProperty<::sycl::detail::DiscardEvents> {}; +class lazy_execution + : public ::sycl::detail::DataLessProperty<::sycl::detail::LazyExecution> {}; } // namespace queue } // namespace property @@ -65,6 +67,9 @@ template <> struct is_property : std::true_type {}; template <> +struct is_property + : std::true_type {}; +template <> struct is_property : std::true_type { }; template <> @@ -80,6 +85,9 @@ template <> struct is_property_of : std::true_type {}; template <> +struct is_property_of + : std::true_type {}; +template <> struct is_property_of : std::true_type {}; template <> diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index fd2cf7736dc3a..3aacc460e6d29 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -23,6 +23,7 @@ #include #include + // Explicitly request format macros #ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS 1 @@ -1058,6 +1059,55 @@ class __SYCL_EXPORT queue { // Clean KERNELFUNC macros. #undef _KERNELFUNCPARAM + /// Shortcut for executing a graph of commands. + /// + /// \param Graph the graph of commands to execute + /// \return an event representing graph execution operation. + event exec_graph(ext::oneapi::experimental::command_graph< + ext::oneapi::experimental::graph_state::executable> + Graph) { + const detail::code_location CodeLoc = {}; + return submit([&](handler &CGH) { CGH.exec_graph(Graph); }, CodeLoc); + } + + /// Shortcut for executing a graph of commands. + /// + /// \param Graph the graph of commands to execute + /// \param DepEvent is an event that specifies the graph execution + /// dependencies. + /// \return an event representing graph execution operation. + event exec_graph(ext::oneapi::experimental::command_graph< + ext::oneapi::experimental::graph_state::executable> + Graph, + event DepEvent) { + const detail::code_location CodeLoc = {}; + return submit( + [&](handler &CGH) { + CGH.depends_on(DepEvent); + CGH.exec_graph(Graph); + }, + CodeLoc); + } + + /// Shortcut for executing a graph of commands. + /// + /// \param Graph the graph of commands to execute + /// \param DepEvents is a vector of events that specifies the graph + /// execution dependencies. + /// \return an event representing graph execution operation. + event exec_graph(ext::oneapi::experimental::command_graph< + ext::oneapi::experimental::graph_state::executable> + Graph, + const std::vector &DepEvents) { + const detail::code_location CodeLoc = {}; + return submit( + [&](handler &CGH) { + CGH.depends_on(DepEvents); + CGH.exec_graph(Graph); + }, + CodeLoc); + } + /// Returns whether the queue is in order or OoO /// /// Equivalent to has_property() diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 54ed20383d9a9..1a6937e47498d 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -1306,6 +1306,45 @@ pi_result resetCommandLists(pi_queue Queue) { pi_result _pi_context::getAvailableCommandList( pi_queue Queue, pi_command_list_ptr_t &CommandList, bool UseCopyEngine, bool AllowBatching, ze_command_queue_handle_t *ForcedCmdQueue) { + + // This is a hack. TODO: Proper CommandList allocation per Executable Graph. + if( Queue->Properties & PI_EXT_ONEAPI_QUEUE_LAZY_EXECUTION ) { + // TODO: Create new Command List. + if(Queue->LazyCommandListMap.empty()) { + const bool UseCopyEngine = false; + // Adding createCommandList() to LazyCommandListMap + ze_fence_handle_t ZeFence; + ZeStruct ZeFenceDesc; + ze_command_list_handle_t ZeCommandList; + + uint32_t QueueGroupOrdinal; + auto &QGroup = Queue->getQueueGroup(UseCopyEngine); + auto &ZeCommandQueue = + //ForcedCmdQueue ? *ForcedCmdQueue : + QGroup.getZeQueue(&QueueGroupOrdinal); + //if (ForcedCmdQueue) + // QueueGroupOrdinal = QGroup.getCmdQueueOrdinal(ZeCommandQueue); + + ZeStruct ZeCommandListDesc; + ZeCommandListDesc.commandQueueGroupOrdinal = QueueGroupOrdinal; + + ZE_CALL(zeCommandListCreate, (Queue->Context->ZeContext, Queue->Device->ZeDevice, + &ZeCommandListDesc, &ZeCommandList)); + + ZE_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence)); + std::tie(CommandList, std::ignore) = Queue->LazyCommandListMap.insert( + std::pair( + ZeCommandList, {ZeFence, false, ZeCommandQueue, QueueGroupOrdinal})); + + Queue->insertActiveBarriers(CommandList, UseCopyEngine); + // + CommandList->second.ZeFenceInUse = true; + } else { + CommandList = Queue->LazyCommandListMap.begin(); + } + return PI_SUCCESS; + } + // Immediate commandlists have been pre-allocated and are always available. if (Queue->Device->useImmediateCommandLists()) { CommandList = Queue->getQueueGroup(UseCopyEngine).getImmCmdList(); @@ -1544,6 +1583,9 @@ void _pi_queue::CaptureIndirectAccesses() { pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList, bool IsBlocking, bool OKToBatchCommand) { + // When executing a Graph, defer execution + if( this->Properties & PI_EXT_ONEAPI_QUEUE_LAZY_EXECUTION ) return PI_SUCCESS; + bool UseCopyEngine = CommandList->second.isCopy(this); // If the current LastCommandEvent is the nullptr, then it means @@ -3509,7 +3551,8 @@ pi_result piQueueCreate(pi_context Context, pi_device Device, PI_ASSERT(!(Properties & ~(PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | PI_QUEUE_PROFILING_ENABLE | PI_QUEUE_ON_DEVICE | PI_QUEUE_ON_DEVICE_DEFAULT | - PI_EXT_ONEAPI_QUEUE_DISCARD_EVENTS)), + PI_EXT_ONEAPI_QUEUE_DISCARD_EVENTS | + PI_EXT_ONEAPI_QUEUE_LAZY_EXECUTION)), PI_ERROR_INVALID_VALUE); PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); @@ -3783,6 +3826,31 @@ pi_result piQueueFinish(pi_queue Queue) { // Flushing cross-queue dependencies is covered by createAndRetainPiZeEventList, // so this can be left as a no-op. pi_result piQueueFlush(pi_queue Queue) { + if( Queue->Properties & PI_EXT_ONEAPI_QUEUE_LAZY_EXECUTION ) { + pi_command_list_ptr_t CommandList{}; + // TODO: + CommandList = Queue->LazyCommandListMap.begin(); + + auto &ZeCommandQueue = CommandList->second.ZeQueue; + // Scope of the lock must be till the end of the function, otherwise new mem + // allocs can be created between the moment when we made a snapshot and the + // moment when command list is closed and executed. But mutex is locked only + // if indirect access tracking enabled, because std::defer_lock is used. + // unique_lock destructor at the end of the function will unlock the mutex + // if it was locked (which happens only if IndirectAccessTrackingEnabled is + // true). + std::unique_lock ContextsLock( + Queue->Device->Platform->ContextsMutex, std::defer_lock); + + // Close the command list and have it ready for dispatch. + ZE_CALL(zeCommandListClose, (CommandList->first)); + + // Offload command list to the GPU for asynchronous execution + auto ZeCommandList = CommandList->first; + auto ZeResult = ZE_CALL_NOCHECK( + zeCommandQueueExecuteCommandLists, + (ZeCommandQueue, 1, &ZeCommandList, CommandList->second.ZeFence)); + } (void)Queue; return PI_SUCCESS; } diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp index f6b31229f0535..25fda46b65d89 100644 --- a/sycl/plugins/level_zero/pi_level_zero.hpp +++ b/sycl/plugins/level_zero/pi_level_zero.hpp @@ -937,6 +937,8 @@ struct _pi_queue : _pi_object { // Map of all command lists used in this queue. pi_command_list_map_t CommandListMap; + // TODO: Assign Graph related command lists to command_graph object + pi_command_list_map_t LazyCommandListMap; // Helper data structure to hold all variables related to batching typedef struct CommandBatch { diff --git a/sycl/source/CMakeLists.txt b/sycl/source/CMakeLists.txt index 23b161480b17a..1a92f6bfa5f2f 100644 --- a/sycl/source/CMakeLists.txt +++ b/sycl/source/CMakeLists.txt @@ -132,6 +132,7 @@ set(SYCL_SOURCES "detail/filter_selector_impl.cpp" "detail/force_device.cpp" "detail/global_handler.cpp" + "detail/graph_impl.cpp" "detail/helpers.cpp" "detail/handler_proxy.cpp" "detail/image_accessor_util.cpp" diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp new file mode 100644 index 0000000000000..d478d2e302026 --- /dev/null +++ b/sycl/source/detail/graph_impl.cpp @@ -0,0 +1,120 @@ +//==--------- graph_impl.cpp - SYCL graph extension -----------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include + +namespace sycl { +__SYCL_INLINE_VER_NAMESPACE(_V1) { + +namespace detail { +struct queue_impl; +using queue_ptr = std::shared_ptr; +} // namespace detail + +namespace ext { +namespace oneapi { +namespace experimental { +namespace detail { + +void graph_impl::exec(sycl::detail::queue_ptr q) { + if (MSchedule.empty()) { + for (auto n : MRoots) { + n->topology_sort(MSchedule); + } + } + for (auto n : MSchedule) + n->exec(q); +} + +void graph_impl::exec_and_wait(sycl::detail::queue_ptr q) { + if (MFirst) { + exec(q); + MFirst = false; + } + q->wait(); +} + +void graph_impl::add_root(node_ptr n) { + MRoots.insert(n); + for (auto n : MSchedule) + n->MScheduled = false; + MSchedule.clear(); +} + +void graph_impl::remove_root(node_ptr n) { + MRoots.erase(n); + for (auto n : MSchedule) + n->MScheduled = false; + MSchedule.clear(); +} + +template +node_ptr graph_impl::add(graph_ptr impl, T cgf, + const std::vector &dep) { + node_ptr nodeImpl = std::make_shared(impl, cgf); + if (!dep.empty()) { + for (auto n : dep) { + n->register_successor(nodeImpl); // register successor + this->remove_root(nodeImpl); // remove receiver from root node + // list + } + } else { + this->add_root(nodeImpl); + } + return nodeImpl; +} + +void node_impl::exec(sycl::detail::queue_ptr q) { + std::vector deps; + for (auto i : MPredecessors) + deps.push_back(i->get_event()); + + const sycl::detail::code_location CodeLoc; + MEvent = q->submit(wrapper{MBody, deps}, q, CodeLoc); +} +} // namespace detail + +template <> +command_graph::command_graph() + : impl(std::make_shared()) {} + +template <> +node command_graph::add_impl( + std::function cgf, const std::vector &dep) { + std::vector depImpls; + for (auto &d : dep) { + depImpls.push_back(sycl::detail::getSyclObjImpl(d)); + } + + auto nodeImpl = impl->add(impl, cgf, depImpls); + return sycl::detail::createSyclObjFromImpl(nodeImpl); +} + +template <> +void command_graph::make_edge(node sender, + node receiver) { + auto sender_impl = sycl::detail::getSyclObjImpl(sender); + auto receiver_impl = sycl::detail::getSyclObjImpl(receiver); + + sender_impl->register_successor(receiver_impl); // register successor + impl->remove_root(receiver_impl); // remove receiver from root node list +} + +template <> +command_graph +command_graph::finalize( + const sycl::context &ctx) const { + return command_graph{this->impl, ctx}; +} + +} // namespace experimental +} // namespace oneapi +} // namespace ext +} // __SYCL_INLINE_VER_NAMESPACE(_V1) +} // namespace sycl diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp new file mode 100644 index 0000000000000..cb4f5cb2d6e81 --- /dev/null +++ b/sycl/source/detail/graph_impl.hpp @@ -0,0 +1,106 @@ +//==--------- graph_impl.hpp --- SYCL graph extension ---------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include + +#include +#include +#include + +namespace sycl { +__SYCL_INLINE_VER_NAMESPACE(_V1) { + +namespace detail { +struct queue_impl; +using queue_ptr = std::shared_ptr; +} // namespace detail + +namespace ext { +namespace oneapi { +namespace experimental { +namespace detail { + +class wrapper { + using T = std::function; + T MFunc; + std::vector MDeps; + +public: + wrapper(T t, const std::vector &deps) : MFunc(t), MDeps(deps){}; + + void operator()(sycl::handler &cgh) { + cgh.depends_on(MDeps); + std::invoke(MFunc, cgh); + } +}; + +struct node_impl { + bool MScheduled; + + graph_ptr MGraph; + sycl::event MEvent; + + std::vector MSuccessors; + std::vector MPredecessors; + + std::function MBody; + + void exec(sycl::detail::queue_ptr q); + + void register_successor(node_ptr n) { + MSuccessors.push_back(n); + n->register_predecessor(node_ptr(this)); + } + + void register_predecessor(node_ptr n) { MPredecessors.push_back(n); } + + sycl::event get_event(void) const { return MEvent; } + + template + node_impl(graph_ptr g, T cgf) : MScheduled(false), MGraph(g), MBody(cgf) {} + + // Recursively adding nodes to execution stack: + void topology_sort(std::list &schedule) { + MScheduled = true; + for (auto i : MSuccessors) { + if (!i->MScheduled) + i->topology_sort(schedule); + } + schedule.push_front(node_ptr(this)); + } +}; + +struct graph_impl { + std::set MRoots; + std::list MSchedule; + // TODO: Change one time initialization to per executable object + bool MFirst; + + graph_ptr MParent; + + void exec(sycl::detail::queue_ptr q); + void exec_and_wait(sycl::detail::queue_ptr q); + + void add_root(node_ptr n); + void remove_root(node_ptr n); + + template + node_ptr add(graph_ptr impl, T cgf, const std::vector &dep = {}); + + graph_impl() : MFirst(true) {} +}; + +} // namespace detail +} // namespace experimental +} // namespace oneapi +} // namespace ext +} // __SYCL_INLINE_VER_NAMESPACE(_V1) +} // namespace sycl diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index f55b845b1c84e..e6caedcc66bcc 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -278,6 +278,12 @@ void queue_impl::wait(const detail::code_location &CodeLoc) { TelemetryEvent = instrumentationProlog(CodeLoc, Name, StreamID, IId); #endif + if (has_property()) { + const detail::plugin &Plugin = getPlugin(); + if (Plugin.getBackend() == backend::ext_oneapi_level_zero) + Plugin.call(getHandleRef()); + } + std::vector> WeakEvents; std::vector SharedEvents; { diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 3e3ec40bc2fa9..2e217912ee6c1 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -316,6 +316,9 @@ class queue_impl { // queue property. CreationFlags |= PI_EXT_ONEAPI_QUEUE_DISCARD_EVENTS; } + if (MPropList.has_property()) { + CreationFlags |= PI_EXT_ONEAPI_QUEUE_LAZY_EXECUTION; + } RT::PiQueue Queue{}; RT::PiContext Context = MContext->getHandleRef(); RT::PiDevice Device = MDevice->getHandleRef(); diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 7919031827cfd..d95ef9a8bd4d5 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -698,5 +699,12 @@ void handler::depends_on(const std::vector &Events) { } } +void handler::exec_graph(ext::oneapi::experimental::command_graph< + ext::oneapi::experimental::graph_state::executable> + Graph) { + auto GraphImpl = detail::getSyclObjImpl(Graph); + GraphImpl->exec_and_wait(MQueue); +} + } // __SYCL_INLINE_VER_NAMESPACE(_V1) } // namespace sycl diff --git a/sycl/test/graph/graph-explicit-dotp.cpp b/sycl/test/graph/graph-explicit-dotp.cpp new file mode 100644 index 0000000000000..561057bc13f84 --- /dev/null +++ b/sycl/test/graph/graph-explicit-dotp.cpp @@ -0,0 +1,98 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +#include +#include + +#include + +const size_t n = 10; + +float host_gold_result() { + float alpha = 1.0f; + float beta = 2.0f; + float gamma = 3.0f; + + float sum = 0.0f; + + for (size_t i = 0; i < n; ++i) { + sum += (alpha * 1.0f + beta * 2.0f) * (gamma * 3.0f + beta * 2.0f); + } + + return sum; +} + +int main() { + float alpha = 1.0f; + float beta = 2.0f; + float gamma = 3.0f; + + sycl::property_list properties{ + sycl::property::queue::in_order{}, + sycl::ext::oneapi::property::queue::lazy_execution{}}; + + sycl::queue q{sycl::gpu_selector_v, properties}; + + sycl::ext::oneapi::experimental::command_graph g; + + float *dotp = sycl::malloc_shared(1, q); + + float *x = sycl::malloc_shared(n, q); + float *y = sycl::malloc_shared(n, q); + float *z = sycl::malloc_shared(n, q); + + /* init data on the device */ + auto n_i = g.add([&](sycl::handler &h) { + h.parallel_for(n, [=](sycl::id<1> it) { + const size_t i = it[0]; + x[i] = 1.0f; + y[i] = 2.0f; + z[i] = 3.0f; + }); + }); + + auto node_a = g.add( + [&](sycl::handler &h) { + h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) { + const size_t i = it[0]; + x[i] = alpha * x[i] + beta * y[i]; + }); + }, + {n_i}); + + auto node_b = g.add( + [&](sycl::handler &h) { + h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) { + const size_t i = it[0]; + z[i] = gamma * z[i] + beta * y[i]; + }); + }, + {n_i}); + + auto node_c = g.add( + [&](sycl::handler &h) { + h.parallel_for(sycl::range<1>{n}, + sycl::reduction(dotp, 0.0f, std::plus()), + [=](sycl::id<1> it, auto &sum) { + const size_t i = it[0]; + sum += x[i] * z[i]; + }); + }, + {node_a, node_b}); + + auto executable_graph = g.finalize(q.get_context()); + + // Using shortcut for executing a graph of commands + q.exec_graph(executable_graph).wait(); + + if (*dotp != host_gold_result()) { + std::cout << "Error unexpected result!\n"; + } + + sycl::free(dotp, q); + sycl::free(x, q); + sycl::free(y, q); + sycl::free(z, q); + + std::cout << "done.\n"; + + return 0; +} diff --git a/sycl/test/graph/graph-explicit-queue-shortcuts.cpp b/sycl/test/graph/graph-explicit-queue-shortcuts.cpp new file mode 100644 index 0000000000000..19d74ae895e86 --- /dev/null +++ b/sycl/test/graph/graph-explicit-queue-shortcuts.cpp @@ -0,0 +1,39 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +#include +#include + +#include + +int main() { + + sycl::property_list properties{ + sycl::property::queue::in_order{}, + sycl::ext::oneapi::property::queue::lazy_execution{}}; + + sycl::queue q{sycl::gpu_selector_v, properties}; + + sycl::ext::oneapi::experimental::command_graph g; + + const size_t n = 10; + float *arr = sycl::malloc_shared(n, q); + + g.add([&](sycl::handler &h) { + h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> idx) { + size_t i = idx; + arr[i] = 1; + }); + }); + + auto executable_graph = g.finalize(q.get_context()); + + auto e1 = q.exec_graph(executable_graph); + auto e2 = q.exec_graph(executable_graph, e1); + auto e3 = q.exec_graph(executable_graph, e1); + q.exec_graph(executable_graph, {e2, e3}).wait(); + + sycl::free(arr, q); + + std::cout << "done " << arr[0] << std::endl; + + return 0; +} diff --git a/sycl/test/graph/graph-explicit-reduction.cpp b/sycl/test/graph/graph-explicit-reduction.cpp new file mode 100644 index 0000000000000..9a2788079570c --- /dev/null +++ b/sycl/test/graph/graph-explicit-reduction.cpp @@ -0,0 +1,37 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +#include +#include + +#include + +int main() { + sycl::property_list properties{ + sycl::property::queue::in_order{}, + sycl::ext::oneapi::property::queue::lazy_execution{}}; + + sycl::queue q{sycl::gpu_selector_v, properties}; + + sycl::ext::oneapi::experimental::command_graph g; + + const size_t n = 10; + float *input = sycl::malloc_shared(n, q); + float *output = sycl::malloc_shared(1, q); + for (size_t i = 0; i < n; i++) { + input[i] = i; + } + + auto e = q.submit([&](sycl::handler &h) { + h.parallel_for(sycl::range<1>{n}, + sycl::reduction(output, 0.0f, std::plus()), + [=](sycl::id<1> idx, auto &sum) { sum += input[idx]; }); + }); + + e.wait(); + + sycl::free(input, q); + sycl::free(output, q); + + std::cout << "done\n"; + + return 0; +} diff --git a/sycl/test/graph/graph-explicit-simple.cpp b/sycl/test/graph/graph-explicit-simple.cpp new file mode 100644 index 0000000000000..1e7bf1fec9afb --- /dev/null +++ b/sycl/test/graph/graph-explicit-simple.cpp @@ -0,0 +1,42 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +#include +#include + +#include + +int main() { + + sycl::property_list properties{ + sycl::property::queue::in_order{}, + sycl::ext::oneapi::property::queue::lazy_execution{}}; + + sycl::queue q{sycl::gpu_selector_v, properties}; + + sycl::ext::oneapi::experimental::command_graph g; + + const size_t n = 10; + float *arr = sycl::malloc_shared(n, q); + + g.add([&](sycl::handler &h) { + h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> idx) { + size_t i = idx; + arr[i] = 1; + }); + }); + + auto result_before_exec1 = arr[0]; + + auto executable_graph = g.finalize(q.get_context()); + + auto result_before_exec2 = arr[0]; + + q.submit([&](sycl::handler &h) { h.exec_graph(executable_graph); }); + + auto result = arr[0]; + + sycl::free(arr, q); + + std::cout << "done.\n"; + + return 0; +}