Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
1acf57e
Inital version of sycl graph prototype
reble Feb 18, 2022
d286c71
Adding initial sycl graph doc
reble Feb 18, 2022
656f5c3
Adding lazy execution property to queue
reble Feb 15, 2022
0bad787
fix merge
reble Feb 22, 2022
a8b5b32
Update pi_level_zero.cpp
reble Feb 22, 2022
2b50af4
update extension proposal started to incorporate feedback
reble Mar 11, 2022
047839b
typo
reble Mar 11, 2022
f957996
fix typos and syntax issues
reble May 3, 2022
0d8a5f4
Apply suggestions from code review
reble Mar 14, 2022
50d49a1
Propagate lazy queue property
julianmi May 3, 2022
9b46c4b
fix formatting issues
reble May 6, 2022
7d81618
fix issue introd. by recent merge
reble May 6, 2022
7917086
fix formatting
reble May 10, 2022
a3164de
update API to recent proposal
reble Oct 12, 2022
8850b18
fix rebase issue
reble Oct 12, 2022
446ac53
revert changes to level-zero plugin
reble Oct 18, 2022
fa7494d
starting to rework lazy execution logic
reble Oct 18, 2022
7581915
bugfix
reble Oct 18, 2022
38da3c6
add basic tests
reble Oct 18, 2022
fa58aa3
renaming macro and bugfix
reble Oct 20, 2022
4478390
clang-format
reble Nov 1, 2022
383459c
Renaming variables
reble Nov 1, 2022
f71ea49
Common changes from record & replay API (#32)
EwanC Nov 21, 2022
df971e5
[SYCL] Minor graph classes refactor (#36)
Bensuo Nov 24, 2022
2cf9d0f
Cosmetic changes
reble Nov 30, 2022
9f127d7
[SYCL] Repro for reduction fail
Nov 18, 2022
578692f
[SYCL] PIMPL refactor
Nov 24, 2022
7bb11ce
[SYCL] Use handler to execute graph
Nov 30, 2022
3073cfc
[SYCL] Clean-up lazy queue PI changes
Dec 2, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion sycl/include/sycl/detail/pi.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,10 @@
// 10.13 Added new PI_EXT_ONEAPI_QUEUE_DISCARD_EVENTS queue property.
// 10.14 Add PI_EXT_INTEL_DEVICE_INFO_FREE_MEMORY as an extension for
// piDeviceGetInfo.
// 10.15 Add new PI_EXT_ONEAPI_QUEUE_LAZY_EXECUTION queue property

#define _PI_H_VERSION_MAJOR 10
#define _PI_H_VERSION_MINOR 14
#define _PI_H_VERSION_MINOR 15

#define _PI_STRING_HELPER(a) #a
#define _PI_CONCAT(a, b) _PI_STRING_HELPER(a.b)
Expand Down Expand Up @@ -569,6 +570,14 @@ constexpr pi_queue_properties PI_QUEUE_PROFILING_ENABLE = (1 << 1);
constexpr pi_queue_properties PI_QUEUE_ON_DEVICE = (1 << 2);
constexpr pi_queue_properties PI_QUEUE_ON_DEVICE_DEFAULT = (1 << 3);
constexpr pi_queue_properties PI_EXT_ONEAPI_QUEUE_DISCARD_EVENTS = (1 << 4);
// In a lazy queue, enqueued commands are not submitted for execution
// immediately, instead they are submitted for execution once the queue is
// flushed.
//
// This is to enable prototyping of the SYCL_EXT_ONEAPI_GRAPH extension,
// before a native command-list interface in PI can be designed and
// implemented.
constexpr pi_queue_properties PI_EXT_ONEAPI_QUEUE_LAZY_EXECUTION = (1 << 5);

using pi_result = _pi_result;
using pi_platform_info = _pi_platform_info;
Expand Down
3 changes: 2 additions & 1 deletion sycl/include/sycl/detail/property_helper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,9 @@ enum DataLessPropKind {
UseDefaultStream = 8,
DiscardEvents = 9,
DeviceReadOnly = 10,
LazyExecution = 11,
// Indicates the last known dataless property.
LastKnownDataLessPropKind = 10,
LastKnownDataLessPropKind = 11,
// Exceeding 32 may cause ABI breaking change on some of OSes.
DataLessPropKindSize = 32
};
Expand Down
106 changes: 106 additions & 0 deletions sycl/include/sycl/ext/oneapi/experimental/graph.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
//==--------- graph.hpp --- SYCL graph extension ---------------------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#pragma once

#include <memory>
#include <sycl/detail/common.hpp>
#include <sycl/detail/defines_elementary.hpp>
#include <vector>

namespace sycl {
__SYCL_INLINE_VER_NAMESPACE(_V1) {

class handler;
namespace ext {
namespace oneapi {
namespace experimental {

namespace detail {
struct node_impl;
struct graph_impl;

using node_ptr = std::shared_ptr<node_impl>;
using graph_ptr = std::shared_ptr<graph_impl>;
} // namespace detail

enum class graph_state {
modifiable,
executable,
};

class __SYCL_EXPORT node {
private:
node(detail::node_ptr Impl) : impl(Impl) {}

template <class Obj>
friend decltype(Obj::impl)
sycl::detail::getSyclObjImpl(const Obj &SyclObject);
template <class T>
friend T sycl::detail::createSyclObjFromImpl(decltype(T::impl) ImplObj);

detail::node_ptr impl;
detail::graph_ptr MGraph;
};

template <graph_state State = graph_state::modifiable>
class __SYCL_EXPORT command_graph {
public:
command_graph();

// Adding empty node with [0..n] predecessors:
node add(const std::vector<node> &dep = {});

// Adding device node:
template <typename T> node add(T cgf, const std::vector<node> &dep = {}) {
return add_impl(cgf, dep);
}

// Adding dependency between two nodes.
void make_edge(node sender, node receiver);

command_graph<graph_state::executable>
finalize(const sycl::context &syclContext) const;

private:
command_graph(detail::graph_ptr Impl) : impl(Impl) {}

// Template-less implementation of add()
node add_impl(std::function<void(handler &)> cgf,
const std::vector<node> &dep);

template <class Obj>
friend decltype(Obj::impl)
sycl::detail::getSyclObjImpl(const Obj &SyclObject);
template <class T>
friend T sycl::detail::createSyclObjFromImpl(decltype(T::impl) ImplObj);

detail::graph_ptr impl;
};

template <> class __SYCL_EXPORT command_graph<graph_state::executable> {
public:
command_graph() = delete;

command_graph(detail::graph_ptr g, const sycl::context &ctx)
: MTag(rand()), MCtx(ctx), impl(g) {}

private:
template <class Obj>
friend decltype(Obj::impl)
sycl::detail::getSyclObjImpl(const Obj &SyclObject);

int MTag;
const sycl::context &MCtx;
detail::graph_ptr impl;
};
} // namespace experimental
} // namespace oneapi
} // namespace ext
} // __SYCL_INLINE_VER_NAMESPACE(_V1)
} // namespace sycl
1 change: 1 addition & 0 deletions sycl/include/sycl/feature_test.hpp.in
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ __SYCL_INLINE_VER_NAMESPACE(_V1) {
#define SYCL_EXT_ONEAPI_FREE_FUNCTION_QUERIES 1
#define SYCL_EXT_ONEAPI_GROUP_ALGORITHMS 1
#define SYCL_EXT_ONEAPI_GROUP_SORT 1
#define SYCL_EXT_ONEAPI_LAZY_QUEUE 1
#define SYCL_EXT_ONEAPI_MAX_WORK_GROUP_QUERY 1
#define SYCL_EXT_ONEAPI_ND_RANGE_REDUCTIONS 1
#define SYCL_EXT_ONEAPI_DEFAULT_CONTEXT 1
Expand Down
9 changes: 9 additions & 0 deletions sycl/include/sycl/handler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@
#include <sycl/sampler.hpp>
#include <sycl/stl.hpp>

#include <sycl/ext/oneapi/experimental/graph.hpp>

#include <functional>
#include <limits>
#include <memory>
Expand Down Expand Up @@ -2516,6 +2518,13 @@ class __SYCL_EXPORT handler {
/// \param Advice is a device-defined advice for the specified allocation.
void mem_advise(const void *Ptr, size_t Length, int Advice);

/// Executes a command_graph.
///
/// \param Graph Executable command_graph to run
void exec_graph(ext::oneapi::experimental::command_graph<
ext::oneapi::experimental::graph_state::executable>
Graph);

private:
std::shared_ptr<detail::handler_impl> MImpl;
std::shared_ptr<detail::queue_impl> MQueue;
Expand Down
8 changes: 8 additions & 0 deletions sycl/include/sycl/properties/queue_properties.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ namespace property {
namespace queue {
class discard_events
: public ::sycl::detail::DataLessProperty<::sycl::detail::DiscardEvents> {};
class lazy_execution
: public ::sycl::detail::DataLessProperty<::sycl::detail::LazyExecution> {};
} // namespace queue
} // namespace property

Expand Down Expand Up @@ -65,6 +67,9 @@ template <>
struct is_property<ext::oneapi::property::queue::discard_events>
: std::true_type {};
template <>
struct is_property<ext::oneapi::property::queue::lazy_execution>
: std::true_type {};
template <>
struct is_property<property::queue::cuda::use_default_stream> : std::true_type {
};
template <>
Expand All @@ -80,6 +85,9 @@ template <>
struct is_property_of<ext::oneapi::property::queue::discard_events, queue>
: std::true_type {};
template <>
struct is_property_of<ext::oneapi::property::queue::lazy_execution, queue>
: std::true_type {};
template <>
struct is_property_of<property::queue::cuda::use_default_stream, queue>
: std::true_type {};
template <>
Expand Down
50 changes: 50 additions & 0 deletions sycl/include/sycl/queue.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include <sycl/property_list.hpp>
#include <sycl/stl.hpp>


// Explicitly request format macros
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS 1
Expand Down Expand Up @@ -1058,6 +1059,55 @@ class __SYCL_EXPORT queue {
// Clean KERNELFUNC macros.
#undef _KERNELFUNCPARAM

/// Shortcut for executing a graph of commands.
///
/// \param Graph the graph of commands to execute
/// \return an event representing graph execution operation.
event exec_graph(ext::oneapi::experimental::command_graph<
ext::oneapi::experimental::graph_state::executable>
Graph) {
const detail::code_location CodeLoc = {};
return submit([&](handler &CGH) { CGH.exec_graph(Graph); }, CodeLoc);
}

/// Shortcut for executing a graph of commands.
///
/// \param Graph the graph of commands to execute
/// \param DepEvent is an event that specifies the graph execution
/// dependencies.
/// \return an event representing graph execution operation.
event exec_graph(ext::oneapi::experimental::command_graph<
ext::oneapi::experimental::graph_state::executable>
Graph,
event DepEvent) {
const detail::code_location CodeLoc = {};
return submit(
[&](handler &CGH) {
CGH.depends_on(DepEvent);
CGH.exec_graph(Graph);
},
CodeLoc);
}

/// Shortcut for executing a graph of commands.
///
/// \param Graph the graph of commands to execute
/// \param DepEvents is a vector of events that specifies the graph
/// execution dependencies.
/// \return an event representing graph execution operation.
event exec_graph(ext::oneapi::experimental::command_graph<
ext::oneapi::experimental::graph_state::executable>
Graph,
const std::vector<event> &DepEvents) {
const detail::code_location CodeLoc = {};
return submit(
[&](handler &CGH) {
CGH.depends_on(DepEvents);
CGH.exec_graph(Graph);
},
CodeLoc);
}

/// Returns whether the queue is in order or OoO
///
/// Equivalent to has_property<property::queue::in_order>()
Expand Down
70 changes: 69 additions & 1 deletion sycl/plugins/level_zero/pi_level_zero.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1306,6 +1306,45 @@ pi_result resetCommandLists(pi_queue Queue) {
pi_result _pi_context::getAvailableCommandList(
pi_queue Queue, pi_command_list_ptr_t &CommandList, bool UseCopyEngine,
bool AllowBatching, ze_command_queue_handle_t *ForcedCmdQueue) {

// This is a hack. TODO: Proper CommandList allocation per Executable Graph.
if( Queue->Properties & PI_EXT_ONEAPI_QUEUE_LAZY_EXECUTION ) {
// TODO: Create new Command List.
if(Queue->LazyCommandListMap.empty()) {
const bool UseCopyEngine = false;
// Adding createCommandList() to LazyCommandListMap
ze_fence_handle_t ZeFence;
ZeStruct<ze_fence_desc_t> ZeFenceDesc;
ze_command_list_handle_t ZeCommandList;

uint32_t QueueGroupOrdinal;
auto &QGroup = Queue->getQueueGroup(UseCopyEngine);
auto &ZeCommandQueue =
//ForcedCmdQueue ? *ForcedCmdQueue :
QGroup.getZeQueue(&QueueGroupOrdinal);
//if (ForcedCmdQueue)
// QueueGroupOrdinal = QGroup.getCmdQueueOrdinal(ZeCommandQueue);

ZeStruct<ze_command_list_desc_t> ZeCommandListDesc;
ZeCommandListDesc.commandQueueGroupOrdinal = QueueGroupOrdinal;

ZE_CALL(zeCommandListCreate, (Queue->Context->ZeContext, Queue->Device->ZeDevice,
&ZeCommandListDesc, &ZeCommandList));

ZE_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence));
std::tie(CommandList, std::ignore) = Queue->LazyCommandListMap.insert(
std::pair<ze_command_list_handle_t, pi_command_list_info_t>(
ZeCommandList, {ZeFence, false, ZeCommandQueue, QueueGroupOrdinal}));

Queue->insertActiveBarriers(CommandList, UseCopyEngine);
//
CommandList->second.ZeFenceInUse = true;
} else {
CommandList = Queue->LazyCommandListMap.begin();
}
return PI_SUCCESS;
}

// Immediate commandlists have been pre-allocated and are always available.
if (Queue->Device->useImmediateCommandLists()) {
CommandList = Queue->getQueueGroup(UseCopyEngine).getImmCmdList();
Expand Down Expand Up @@ -1544,6 +1583,9 @@ void _pi_queue::CaptureIndirectAccesses() {
pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList,
bool IsBlocking,
bool OKToBatchCommand) {
// When executing a Graph, defer execution
if( this->Properties & PI_EXT_ONEAPI_QUEUE_LAZY_EXECUTION ) return PI_SUCCESS;

bool UseCopyEngine = CommandList->second.isCopy(this);

// If the current LastCommandEvent is the nullptr, then it means
Expand Down Expand Up @@ -3509,7 +3551,8 @@ pi_result piQueueCreate(pi_context Context, pi_device Device,
PI_ASSERT(!(Properties & ~(PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
PI_QUEUE_PROFILING_ENABLE | PI_QUEUE_ON_DEVICE |
PI_QUEUE_ON_DEVICE_DEFAULT |
PI_EXT_ONEAPI_QUEUE_DISCARD_EVENTS)),
PI_EXT_ONEAPI_QUEUE_DISCARD_EVENTS |
PI_EXT_ONEAPI_QUEUE_LAZY_EXECUTION)),
PI_ERROR_INVALID_VALUE);

PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT);
Expand Down Expand Up @@ -3783,6 +3826,31 @@ pi_result piQueueFinish(pi_queue Queue) {
// Flushing cross-queue dependencies is covered by createAndRetainPiZeEventList,
// so this can be left as a no-op.
pi_result piQueueFlush(pi_queue Queue) {
if( Queue->Properties & PI_EXT_ONEAPI_QUEUE_LAZY_EXECUTION ) {
pi_command_list_ptr_t CommandList{};
// TODO:
CommandList = Queue->LazyCommandListMap.begin();

auto &ZeCommandQueue = CommandList->second.ZeQueue;
// Scope of the lock must be till the end of the function, otherwise new mem
// allocs can be created between the moment when we made a snapshot and the
// moment when command list is closed and executed. But mutex is locked only
// if indirect access tracking enabled, because std::defer_lock is used.
// unique_lock destructor at the end of the function will unlock the mutex
// if it was locked (which happens only if IndirectAccessTrackingEnabled is
// true).
std::unique_lock<pi_shared_mutex> ContextsLock(
Queue->Device->Platform->ContextsMutex, std::defer_lock);

// Close the command list and have it ready for dispatch.
ZE_CALL(zeCommandListClose, (CommandList->first));

// Offload command list to the GPU for asynchronous execution
auto ZeCommandList = CommandList->first;
auto ZeResult = ZE_CALL_NOCHECK(
zeCommandQueueExecuteCommandLists,
(ZeCommandQueue, 1, &ZeCommandList, CommandList->second.ZeFence));
}
(void)Queue;
return PI_SUCCESS;
}
Expand Down
2 changes: 2 additions & 0 deletions sycl/plugins/level_zero/pi_level_zero.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -937,6 +937,8 @@ struct _pi_queue : _pi_object {

// Map of all command lists used in this queue.
pi_command_list_map_t CommandListMap;
// TODO: Assign Graph related command lists to command_graph object
pi_command_list_map_t LazyCommandListMap;

// Helper data structure to hold all variables related to batching
typedef struct CommandBatch {
Expand Down
1 change: 1 addition & 0 deletions sycl/source/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ set(SYCL_SOURCES
"detail/filter_selector_impl.cpp"
"detail/force_device.cpp"
"detail/global_handler.cpp"
"detail/graph_impl.cpp"
"detail/helpers.cpp"
"detail/handler_proxy.cpp"
"detail/image_accessor_util.cpp"
Expand Down
Loading