From 4b06a17c6729345a9cba5a9de0aa41343284a192 Mon Sep 17 00:00:00 2001 From: Sergey V Maslov Date: Tue, 12 Jul 2022 10:39:49 -0700 Subject: [PATCH 1/3] rebase Signed-off-by: Sergey V Maslov --- sycl/doc/EnvironmentVariables.md | 1 + sycl/plugins/level_zero/pi_level_zero.cpp | 88 ++++++++++++++++++----- sycl/plugins/level_zero/pi_level_zero.hpp | 8 +++ sycl/source/detail/plugin.hpp | 4 +- 4 files changed, 82 insertions(+), 19 deletions(-) diff --git a/sycl/doc/EnvironmentVariables.md b/sycl/doc/EnvironmentVariables.md index 13d98fa6be51e..4aefbbe11457a 100755 --- a/sycl/doc/EnvironmentVariables.md +++ b/sycl/doc/EnvironmentVariables.md @@ -23,6 +23,7 @@ compiler and runtime. | `SYCL_ENABLE_DEFAULT_CONTEXTS` | '1' or '0' | Enable ('1') or disable ('0') creation of default platform contexts in SYCL runtime. The default context for each platform contains all devices in the platform. Refer to [Platform Default Contexts](extensions/supported/sycl_ext_oneapi_default_context.asciidoc) extension to learn more. Enabled by default on Linux and disabled on Windows. | | `SYCL_RT_WARNING_LEVEL` | Positive integer | The higher warning level is used the more warnings and performance hints the runtime library may print. Default value is '0', which means no warning/hint messages from the runtime library are allowed. The value '1' enables performance warnings from device runtime/codegen. The values greater than 1 are reserved for future use. | | `SYCL_USM_HOSTPTR_IMPORT` | Integer | Enable by specifying non-zero value. Buffers created with a host pointer will result in host data promotion to USM, improving data transfer performance. To use this feature, also set SYCL_HOST_UNIFIED_MEMORY=1. | +| `SYCL_EAGER_INIT` | Integer | Enable by specifying non-zero value. Tells the SYCL runtime to do as much initialization as possible at object construction, as opposed to doing lazy initialization on the fly. 
This may mean doing some redundant work at warmup but ensures fastest possible execution on the following hot and reportable paths. It also instructs PI plugins to do the same. Default is "0". | `(*) Note: Any means this environment variable is effective when set to any non-null value.` diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index ff59349bd0f82..6a46e9eecb9af 100755 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -215,6 +215,15 @@ static void zePrint(const char *Format, ...) { } } +// Controls if we should choose doing eager initialization +// to make it happen on warmup paths and have the reportable +// paths be less likely affected. +// +static bool doEagerInit = [] { + const char *EagerInit = std::getenv("SYCL_EAGER_INIT"); + return EagerInit ? std::atoi(EagerInit) != 0 : false; +}(); + // Controls whether device-scope events are used, and how. static const enum EventsScope { // All events are created host-visible. @@ -1230,7 +1239,7 @@ pi_result _pi_context::getAvailableCommandList( // Each command list is paired with an associated fence to track when the // command list is available for reuse. _pi_result pi_result = PI_ERROR_OUT_OF_RESOURCES; - ZeStruct ZeFenceDesc; + // Initally, we need to check if a command list has already been created // on this device that is available for use. If so, then reuse that // Level-Zero Command List and Fence for this PI call. @@ -1270,6 +1279,7 @@ pi_result _pi_context::getAvailableCommandList( QueueGroupOrdinal = QGroup.getCmdQueueOrdinal(ZeCommandQueue); ze_fence_handle_t ZeFence; + ZeStruct ZeFenceDesc; ZE_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence)); CommandList = Queue->CommandListMap @@ -1310,15 +1320,27 @@ pi_result _pi_context::getAvailableCommandList( } } - // If there are no available command lists nor signalled command lists, then - // we must create another command list. 
- // Once created, this command list & fence are added to the command list fence - // map. - ze_command_list_handle_t ZeCommandList; + // If there are no available command lists nor signalled command lists, + // then we must create another command list. + pi_result = Queue->createCommandList(UseCopyEngine, CommandList); + CommandList->second.ZeFenceInUse = true; + return pi_result; +} + +// Helper function to create a new command-list to this queue and associated +// fence tracking its completion. This command list & fence are added to the +// map of command lists in this queue with ZeFenceInUse = false. +// The caller must hold a lock of the queue already. +pi_result _pi_queue::createCommandList(bool UseCopyEngine, + pi_command_list_ptr_t &CommandList, + ze_command_queue_handle_t *ForcedCmdQueue) { + ze_fence_handle_t ZeFence; + ZeStruct ZeFenceDesc; + ze_command_list_handle_t ZeCommandList; - auto &QGroup = Queue->getQueueGroup(UseCopyEngine); uint32_t QueueGroupOrdinal; + auto &QGroup = Queue->getQueueGroup(UseCopyEngine); auto &ZeCommandQueue = ForcedCmdQueue ? 
*ForcedCmdQueue : QGroup.getZeQueue(&QueueGroupOrdinal); if (ForcedCmdQueue) @@ -1327,19 +1349,16 @@ pi_result _pi_context::getAvailableCommandList( ZeStruct ZeCommandListDesc; ZeCommandListDesc.commandQueueGroupOrdinal = QueueGroupOrdinal; - ZE_CALL(zeCommandListCreate, - (Queue->Context->ZeContext, Queue->Device->ZeDevice, - &ZeCommandListDesc, &ZeCommandList)); + ZE_CALL(zeCommandListCreate, (Context->ZeContext, Device->ZeDevice, + &ZeCommandListDesc, &ZeCommandList)); ZE_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence)); - std::tie(CommandList, std::ignore) = Queue->CommandListMap.insert( + std::tie(CommandList, std::ignore) = CommandListMap.insert( std::pair( - ZeCommandList, {ZeFence, true, ZeCommandQueue, QueueGroupOrdinal})); - if (auto Res = Queue->insertActiveBarriers(CommandList, UseCopyEngine)) - return Res; - pi_result = PI_SUCCESS; - - return pi_result; + ZeCommandList, {ZeFence, false, ZeCommandQueue, QueueGroupOrdinal})); + + PI_CALL(Queue->insertActiveBarriers(CommandList, UseCopyEngine)); + return PI_SUCCESS; } void _pi_queue::adjustBatchSizeForFullBatch(bool IsCopy) { @@ -3396,6 +3415,41 @@ pi_result piQueueCreate(pi_context Context, pi_device Device, } catch (...) { return PI_ERROR_UNKNOWN; } + + // Do eager initialization of Level Zero handles on request. + if (doEagerInit) { + pi_queue Q = *Queue; + // Creates said number of command-lists. + auto warmupQueueGroup = [Q](bool UseCopyEngine, + uint32_t RepeatCount) -> pi_result { + pi_command_list_ptr_t CommandList; + while (RepeatCount--) { + if (UseImmediateCommandLists) { + CommandList = Q->getQueueGroup(UseCopyEngine).getImmCmdList(); + } else { + // Heuristically create some number of regular command-list to reuse. + for (int I = 0; I < 10; ++I) { + PI_CALL(Q->createCommandList(UseCopyEngine, CommandList)); + // Immediately return them to the cache of available command-lists. 
+ std::vector EventsUnused; + PI_CALL(Q->resetCommandList(CommandList, true /* MakeAvailable */, + EventsUnused)); + } + } + } + return PI_SUCCESS; + }; + // Create as many command-lists as there are queues in the group. + // With this the underlying round-robin logic would initialize all + // native queues, and create command-lists and their fences. + PI_CALL(warmupQueueGroup(false, Q->ComputeQueueGroup.UpperIndex - + Q->ComputeQueueGroup.LowerIndex + 1)); + if (Q->useCopyEngine()) { + PI_CALL(warmupQueueGroup(true, Q->CopyQueueGroup.UpperIndex - + Q->CopyQueueGroup.LowerIndex + 1)); + } + // TODO: warmup event pools. Both host-visible and device-only. + } return PI_SUCCESS; } diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp index 20d58c08d5380..83889e62d97d0 100755 --- a/sycl/plugins/level_zero/pi_level_zero.hpp +++ b/sycl/plugins/level_zero/pi_level_zero.hpp @@ -954,6 +954,14 @@ struct _pi_queue : _pi_object { // For non-copy commands, IsCopy is set to 'false'. void adjustBatchSizeForPartialBatch(bool IsCopy); + // Helper function to create a new command-list to this queue and associated + // fence tracking its completion. This command list & fence are added to the + // map of command lists in this queue with ZeFenceInUse = false. + // The caller must hold a lock of the queue already. + pi_result createCommandList(bool UseCopyEngine, + pi_command_list_ptr_t &CommandList + ze_command_queue_handle_t *ForcedCmdQueue = nullptr); + // Resets the Command List and Associated fence in the ZeCommandListFenceMap. // If the reset command list should be made available, then MakeAvailable // needs to be set to true. 
The caller must verify that this command list and diff --git a/sycl/source/detail/plugin.hpp b/sycl/source/detail/plugin.hpp index ef67ec261bc6d..14e6273e02f2d 100644 --- a/sycl/source/detail/plugin.hpp +++ b/sycl/source/detail/plugin.hpp @@ -55,7 +55,7 @@ struct array_fill_helper { template struct array_fill_helper { - static void fill(unsigned char *Dst, const T &&Arg, Args &&... Rest) { + static void fill(unsigned char *Dst, const T &&Arg, Args &&...Rest) { using ArgsTuple = typename PiApiArgTuple::type; // C-style cast is required here. auto RealArg = (std::tuple_element_t)(Arg); @@ -71,7 +71,7 @@ constexpr size_t totalSize(const std::tuple &) { } template -auto packCallArguments(ArgsT &&... Args) { +auto packCallArguments(ArgsT &&...Args) { using ArgsTuple = typename PiApiArgTuple::type; constexpr size_t TotalSize = totalSize(ArgsTuple{}); From 037bb52b7f556f84f5e8e4a9a326ff1f35626ff6 Mon Sep 17 00:00:00 2001 From: Sergey V Maslov Date: Tue, 12 Jul 2022 10:47:26 -0700 Subject: [PATCH 2/3] clang-format Signed-off-by: Sergey V Maslov --- sycl/plugins/level_zero/pi_level_zero.cpp | 9 +++++---- sycl/plugins/level_zero/pi_level_zero.hpp | 7 ++++--- 2 files changed, 9 insertions(+), 7 deletions(-) mode change 100755 => 100644 sycl/plugins/level_zero/pi_level_zero.cpp mode change 100755 => 100644 sycl/plugins/level_zero/pi_level_zero.hpp diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp old mode 100755 new mode 100644 index 6a46e9eecb9af..853e061550b55 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -1331,9 +1331,10 @@ pi_result _pi_context::getAvailableCommandList( // fence tracking its completion. This command list & fence are added to the // map of command lists in this queue with ZeFenceInUse = false. // The caller must hold a lock of the queue already. 
-pi_result _pi_queue::createCommandList(bool UseCopyEngine, - pi_command_list_ptr_t &CommandList, - ze_command_queue_handle_t *ForcedCmdQueue) { +pi_result +_pi_queue::createCommandList(bool UseCopyEngine, + pi_command_list_ptr_t &CommandList, + ze_command_queue_handle_t *ForcedCmdQueue) { ze_fence_handle_t ZeFence; ZeStruct ZeFenceDesc; @@ -1356,7 +1357,7 @@ pi_result _pi_queue::createCommandList(bool UseCopyEngine, std::tie(CommandList, std::ignore) = CommandListMap.insert( std::pair( ZeCommandList, {ZeFence, false, ZeCommandQueue, QueueGroupOrdinal})); - + PI_CALL(Queue->insertActiveBarriers(CommandList, UseCopyEngine)); return PI_SUCCESS; } diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp old mode 100755 new mode 100644 index 83889e62d97d0..b456c9f07c899 --- a/sycl/plugins/level_zero/pi_level_zero.hpp +++ b/sycl/plugins/level_zero/pi_level_zero.hpp @@ -958,9 +958,10 @@ struct _pi_queue : _pi_object { // fence tracking its completion. This command list & fence are added to the // map of command lists in this queue with ZeFenceInUse = false. // The caller must hold a lock of the queue already. - pi_result createCommandList(bool UseCopyEngine, - pi_command_list_ptr_t &CommandList - ze_command_queue_handle_t *ForcedCmdQueue = nullptr); + pi_result + createCommandList(bool UseCopyEngine, + pi_command_list_ptr_t &CommandList + ze_command_queue_handle_t *ForcedCmdQueue = nullptr); // Resets the Command List and Associated fence in the ZeCommandListFenceMap. 
// If the reset command list should be made available, then MakeAvailable From e104936a732f6694f71b40e8de0c8d62a24b74d4 Mon Sep 17 00:00:00 2001 From: Sergey V Maslov Date: Tue, 12 Jul 2022 21:52:44 -0700 Subject: [PATCH 3/3] fix build Signed-off-by: Sergey V Maslov --- sycl/plugins/level_zero/pi_level_zero.cpp | 4 ++-- sycl/plugins/level_zero/pi_level_zero.hpp | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) mode change 100644 => 100755 sycl/plugins/level_zero/pi_level_zero.cpp diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp old mode 100644 new mode 100755 index 853e061550b55..60cff281370ea --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -1341,7 +1341,7 @@ _pi_queue::createCommandList(bool UseCopyEngine, ze_command_list_handle_t ZeCommandList; uint32_t QueueGroupOrdinal; - auto &QGroup = Queue->getQueueGroup(UseCopyEngine); + auto &QGroup = getQueueGroup(UseCopyEngine); auto &ZeCommandQueue = ForcedCmdQueue ? *ForcedCmdQueue : QGroup.getZeQueue(&QueueGroupOrdinal); if (ForcedCmdQueue) @@ -1358,7 +1358,7 @@ _pi_queue::createCommandList(bool UseCopyEngine, std::pair( ZeCommandList, {ZeFence, false, ZeCommandQueue, QueueGroupOrdinal})); - PI_CALL(Queue->insertActiveBarriers(CommandList, UseCopyEngine)); + PI_CALL(insertActiveBarriers(CommandList, UseCopyEngine)); return PI_SUCCESS; } diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp index b456c9f07c899..180322904b8a5 100644 --- a/sycl/plugins/level_zero/pi_level_zero.hpp +++ b/sycl/plugins/level_zero/pi_level_zero.hpp @@ -959,9 +959,8 @@ struct _pi_queue : _pi_object { // map of command lists in this queue with ZeFenceInUse = false. // The caller must hold a lock of the queue already. 
pi_result - createCommandList(bool UseCopyEngine, - pi_command_list_ptr_t &CommandList - ze_command_queue_handle_t *ForcedCmdQueue = nullptr); + createCommandList(bool UseCopyEngine, pi_command_list_ptr_t &CommandList, + ze_command_queue_handle_t *ForcedCmdQueue = nullptr); // Resets the Command List and Associated fence in the ZeCommandListFenceMap. // If the reset command list should be made available, then MakeAvailable