diff --git a/sycl/doc/extensions/supported/sycl_ext_intel_queue_immediate_command_list.asciidoc b/sycl/doc/extensions/supported/sycl_ext_intel_queue_immediate_command_list.asciidoc new file mode 100755 index 0000000000000..4c929de9cf55d --- /dev/null +++ b/sycl/doc/extensions/supported/sycl_ext_intel_queue_immediate_command_list.asciidoc @@ -0,0 +1,157 @@ += sycl_ext_intel_queue_immediate_command_list + +:source-highlighter: coderay +:coderay-linenums-mode: table + +// This section needs to be after the document title. +:doctype: book +:toc2: +:toc: left +:encoding: utf-8 +:lang: en +:dpcpp: pass:[DPC++] + +// Set the default source code type in this document to C++, +// for syntax highlighting purposes. This is needed because +// docbook uses c++ and html5 uses cpp. +:language: {basebackend@docbook:c++:cpp} + + +== Notice + +[%hardbreaks] +Copyright (C) 2023-2023 Intel Corporation. All rights reserved. + +Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks +of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. used by +permission by Khronos. + + +== Contact + +To report problems with this extension, please open a new issue at: + +https://github.com/intel/llvm/issues + + +== Dependencies + +This extension is written against the SYCL 2020 revision 7 specification. All +references below to the "core SYCL specification" or to section numbers in the +SYCL specification refer to that revision. + + +== Status + +This extension is implemented and fully supported by {dpcpp}. + +However, the immediate command list feature (which is exposed by this extension) +has been well-tested only on Intel (R) Data Center Max Series GPUs (aka PVC). +Use of this extension to specify immediate command lists is not recommended +for other Intel GPUs. + + +== Backend support status + +The properties added by this extension are a hint, which all backends accept. +However, in the current {dpcpp} implementation, the hint is only meaningful +on the Level Zero backend. + +== Overview + +When commands are submitted to a SYCL queue that uses the Level Zero backend, +those commands can be submitted to the hardware in one of two ways: +either through an immediate command list or through a standard command queue. +Commands submitted through an immediate command list are immediately submitted +to the device while commands submitted through a standard command queue may be +batched with other commands before they are submitted. By default the +implementation chooses a method that works best for most workloads. + +In most cases, applications should rely on the default behavior. +However, sometimes it is advantageous for the application to choose one method +or the other. This extension provides a way for applications to select either +of these two methods via a queue property. + +For example, when kernel runtimes are very short, the submission time on the +host may be as long or longer than the actual runtime of the kernel. In this +case, doing batched submissions may be preferable so that the submission +overhead is amortized over a number of kernel executions. + + +== Specification + +=== Feature test macro + +This extension provides a feature-test macro as described in the core SYCL +specification. An implementation supporting this extension must predefine the +macro `SYCL_EXT_INTEL_QUEUE_IMMEDIATE_COMMAND_LIST` to one of the values +defined in the table below. Applications can test for the existence of this +macro to determine if the implementation supports this feature, or +applications can test the macro's value to determine which of the +extension's features the implementation supports. + +[%header,cols="1,5"] +|=== +|Value +|Description + +|1 +|Initial version of this extension. +|=== + +=== New queue properties +This extension adds the following new properties that can be used when +constructing a queue object. + +```c++ +namespace sycl::ext::intel::property::queue { + +struct immediate_command_list {}; +struct no_immediate_command_list {}; + +} // namespace sycl::ext::intel::property::queue + +``` + + +Both properties are hints, which are ignored unless the backend is Level Zero. + +The property `immediate_command_list` requests that the implementation use an +immediate command list when commands are submitted to this queue. As a result, +these commands are submitted immediately to the device. + +The property `no_immediate_command_list` requests that the implementation use +a standard command queue instead of an immediate command list. As a result, +commands submitted to this queue may be batched with other commands before +being submitted to the device. + +These two properties are mutually exclusive. Constructing a queue with both +properties causes the constructor to throw a synchronous exception with +the `errc::invalid` error code. + +== Example +```c++ +#include + +namespace syclintel = sycl::ext::intel; + +int main() { + // Use immediate command lists + sycl::queue q1{syclintel::property::queue::immediate_command_list{}}; + ... + + // Do not use immediate command lists + sycl::queue q2{syclintel::property::queue::no_immediate_command_list{}}; + ... +} +``` + +== Interaction with the SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS environment variable + +{dpcpp} supports an environment variable named +SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS which also controls +the use of immediate command lists in SYCL queues. When that +environment variable is used in conjunction with the properties in this +extension, the properties take precedence. The environment variable has +no effect on queues constructed with one of these properties, however it +still affects queues that were not constructed with either of these properties. diff --git a/sycl/include/sycl/detail/pi.h b/sycl/include/sycl/detail/pi.h index c029bf3fa0391..08c3f24ef6bf5 100644 --- a/sycl/include/sycl/detail/pi.h +++ b/sycl/include/sycl/detail/pi.h @@ -662,6 +662,8 @@ constexpr pi_queue_properties PI_QUEUE_FLAG_ON_DEVICE_DEFAULT = (1 << 3); constexpr pi_queue_properties PI_EXT_ONEAPI_QUEUE_FLAG_DISCARD_EVENTS = (1 << 4); constexpr pi_queue_properties PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_LOW = (1 << 5); constexpr pi_queue_properties PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_HIGH = (1 << 6); +constexpr pi_queue_properties PI_EXT_QUEUE_FLAG_SUBMISSION_NO_IMMEDIATE = (1 << 7); +constexpr pi_queue_properties PI_EXT_QUEUE_FLAG_SUBMISSION_IMMEDIATE = (1 << 8); // clang-format on typedef enum { diff --git a/sycl/include/sycl/detail/property_helper.hpp b/sycl/include/sycl/detail/property_helper.hpp index 93eebdcb81ffb..1ecd9b15e1773 100644 --- a/sycl/include/sycl/detail/property_helper.hpp +++ b/sycl/include/sycl/detail/property_helper.hpp @@ -44,8 +44,10 @@ enum DataLessPropKind { QueuePriorityLow = 17, QueuePriorityHigh = 18, GraphNoCycleCheck = 19, + QueueSubmissionBatched = 20, + QueueSubmissionImmediate = 21, // Indicates the last known dataless property. - LastKnownDataLessPropKind = 19, + LastKnownDataLessPropKind = 21, // Exceeding 32 may cause ABI breaking change on some of OSes. DataLessPropKindSize = 32 }; diff --git a/sycl/include/sycl/properties/queue_properties.def b/sycl/include/sycl/properties/queue_properties.def index a888c0ffe44aa..6e0f3fd700952 100644 --- a/sycl/include/sycl/properties/queue_properties.def +++ b/sycl/include/sycl/properties/queue_properties.def @@ -17,6 +17,10 @@ __SYCL_DATA_LESS_PROP(ext::oneapi::property::queue, priority_low, QueuePriorityLow) __SYCL_DATA_LESS_PROP(ext::oneapi::property::queue, priority_high, QueuePriorityHigh) +__SYCL_DATA_LESS_PROP(ext::intel::property::queue, no_immediate_command_list, + QueueSubmissionBatched) +__SYCL_DATA_LESS_PROP(ext::intel::property::queue, immediate_command_list, + QueueSubmissionImmediate) __SYCL_DATA_LESS_PROP(ext::oneapi::cuda::property::queue, use_default_stream, UseDefaultStream) diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index 1c86b1b075f5a..c213e5f3c1702 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -1455,7 +1455,9 @@ inline pi_result piextQueueCreate(pi_context Context, pi_device Device, PI_QUEUE_FLAG_ON_DEVICE_DEFAULT | PI_EXT_ONEAPI_QUEUE_FLAG_DISCARD_EVENTS | PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_LOW | - PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_HIGH)), + PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_HIGH | + PI_EXT_QUEUE_FLAG_SUBMISSION_NO_IMMEDIATE | + PI_EXT_QUEUE_FLAG_SUBMISSION_IMMEDIATE)), PI_ERROR_INVALID_VALUE); PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); @@ -1482,6 +1484,10 @@ inline pi_result piextQueueCreate(pi_context Context, pi_device Device, UrProperties.flags |= UR_QUEUE_FLAG_SYNC_WITH_DEFAULT_STREAM; if (Properties[1] & __SYCL_PI_CUDA_USE_DEFAULT_STREAM) UrProperties.flags |= UR_QUEUE_FLAG_USE_DEFAULT_STREAM; + if (Properties[1] & PI_EXT_QUEUE_FLAG_SUBMISSION_NO_IMMEDIATE) + UrProperties.flags |= UR_QUEUE_FLAG_SUBMISSION_BATCHED; + if (Properties[1] & PI_EXT_QUEUE_FLAG_SUBMISSION_IMMEDIATE) + UrProperties.flags |= UR_QUEUE_FLAG_SUBMISSION_IMMEDIATE; ur_queue_index_properties_t IndexProperties{}; IndexProperties.stype = UR_STRUCTURE_TYPE_QUEUE_INDEX_PROPERTIES; diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/context.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/context.cpp index 452980afd8632..0aa279f08c76d 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/context.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/context.cpp @@ -583,7 +583,7 @@ ur_result_t ur_context_handle_t_::getAvailableCommandList( bool UseCopyEngine, bool AllowBatching, ze_command_queue_handle_t *ForcedCmdQueue) { // Immediate commandlists have been pre-allocated and are always available. - if (Queue->Device->ImmCommandListUsed) { + if (Queue->UsingImmCmdLists) { CommandList = Queue->getQueueGroup(UseCopyEngine).getImmCmdList(); if (CommandList->second.EventList.size() > ImmCmdListsEventCleanupThreshold) { diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/device.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/device.cpp index 9b3113647e6f4..18b1e3b4a3ee1 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/device.cpp @@ -971,10 +971,6 @@ ur_result_t ur_device_handle_t_::initialize(int SubSubDeviceOrdinal, ImmCommandListUsed = this->useImmediateCommandLists(); - if (ImmCommandListUsed == ImmCmdlistMode::NotUsed) { - ZeEventsScope = DeviceEventsSetting; - } - uint32_t numQueueGroups = 0; ZE2UR_CALL(zeDeviceGetCommandQueueGroupProperties, (ZeDevice, &numQueueGroups, nullptr)); diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/device.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/device.hpp index 7edb43ab96ddf..c47613c720110 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/device.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/device.hpp @@ -135,11 +135,6 @@ struct ur_device_handle_t_ : _ur_object { // Returns whether immediate command lists are used on this device. ImmCmdlistMode ImmCommandListUsed{}; - // Scope of events used for events on the device - // Can be adjusted with UR_L0_DEVICE_SCOPE_EVENTS - // for non-immediate command lists - EventsScope ZeEventsScope = AllHostVisible; - bool isSubDevice() { return RootDevice != nullptr; } // Is this a Data Center GPU Max series (aka PVC)? diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/event.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/event.cpp index 807afa078b589..57b634368ebfc 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/event.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/event.cpp @@ -115,7 +115,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait( } } - if (!Queue->Device->ImmCommandListUsed) { + if (!Queue->UsingImmCmdLists) { std::unique_lock Lock(Queue->Mutex); resetCommandLists(Queue); } @@ -270,7 +270,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( for (auto &QueueGroup : QueueMap) { bool UseCopyEngine = QueueGroup.second.Type != ur_queue_handle_t_::queue_type::Compute; - if (Queue->Device->ImmCommandListUsed) { + if (Queue->UsingImmCmdLists) { // If immediate command lists are being used, each will act as their own // queue, so we must insert a barrier into each. for (auto &ImmCmdList : QueueGroup.second.ImmCmdLists) @@ -498,7 +498,7 @@ ur_result_t ur_event_handle_t_::getOrCreateHostVisibleEvent( this->Mutex); if (!HostVisibleEvent) { - if (UrQueue->Device->ZeEventsScope != OnDemandHostVisibleProxy) + if (UrQueue->ZeEventsScope != OnDemandHostVisibleProxy) die("getOrCreateHostVisibleEvent: missing host-visible event"); // Submit the command(s) signalling the proxy event to the queue. @@ -538,8 +538,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventWait( ///< events to wait for completion ) { for (uint32_t I = 0; I < NumEvents; I++) { - if (EventWaitList[I]->UrQueue->Device->ZeEventsScope == - OnDemandHostVisibleProxy) { + if (EventWaitList[I]->UrQueue->ZeEventsScope == OnDemandHostVisibleProxy) { // Make sure to add all host-visible "proxy" event signals if needed. // This ensures that all signalling commands are submitted below and // thus proxy events can be waited without a deadlock. @@ -587,7 +586,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventWait( } } if (auto Q = Event->UrQueue) { - if (Q->Device->ImmCommandListUsed && Q->isInOrderQueue()) + if (Q->UsingImmCmdLists && Q->isInOrderQueue()) // Use information about waited event to cleanup completed events in // the in-order queue. CleanupEventsInImmCmdLists( @@ -1029,7 +1028,7 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList( this->UrEventList = nullptr; if (CurQueue->isInOrderQueue() && CurQueue->LastCommandEvent != nullptr) { - if (CurQueue->Device->ImmCommandListUsed) { + if (CurQueue->UsingImmCmdLists) { if (ReuseDiscardedEvents && CurQueue->isDiscardEvents()) { // If queue is in-order with discarded events and if // new command list is different from the last used command list then @@ -1158,7 +1157,7 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList( // // Make sure that event1.wait() will wait for a host-visible // event that is signalled before the command2 is enqueued. - if (CurQueue->Device->ZeEventsScope != AllHostVisible) { + if (CurQueue->ZeEventsScope != AllHostVisible) { CurQueue->executeAllOpenCommandLists(); } } diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/kernel.cpp index bc2576fa0629d..063946bc4936d 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/kernel.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/kernel.cpp @@ -209,7 +209,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( if (IndirectAccessTrackingEnabled) Queue->KernelsToBeSubmitted.push_back(Kernel); - if (Queue->Device->ImmCommandListUsed && IndirectAccessTrackingEnabled) { + if (Queue->UsingImmCmdLists && IndirectAccessTrackingEnabled) { // If using immediate commandlists then gathering of indirect // references and appending to the queue (which means submission) // must be done together. diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/queue.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/queue.cpp index ad5ba4f645ba7..adc9a993a81d0 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/queue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/queue.cpp @@ -29,7 +29,7 @@ ur_result_t CleanupEventsInImmCmdLists(ur_queue_handle_t UrQueue, bool QueueLocked, bool QueueSynced, ur_event_handle_t CompletedEvent) { // Handle only immediate command lists here. - if (!UrQueue || !UrQueue->Device->ImmCommandListUsed) + if (!UrQueue || !UrQueue->UsingImmCmdLists) return UR_RESULT_SUCCESS; ur_event_handle_t_ *UrCompletedEvent = @@ -102,7 +102,7 @@ ur_result_t CleanupEventsInImmCmdLists(ur_queue_handle_t UrQueue, ur_result_t resetCommandLists(ur_queue_handle_t Queue) { // Handle immediate command lists here, they don't need to be reset and we // only need to cleanup events. - if (Queue->Device->ImmCommandListUsed) { + if (Queue->UsingImmCmdLists) { UR_CALL(CleanupEventsInImmCmdLists(Queue, true /*locked*/)); return UR_RESULT_SUCCESS; } @@ -192,7 +192,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetInfo( // because immediate command lists are not associated with level zero // queue. Conservatively return false in this case because last event is // discarded and we can't check its status. - if (Queue->Device->ImmCommandListUsed) + if (Queue->UsingImmCmdLists) return ReturnValue(false); } @@ -207,7 +207,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetInfo( for (const auto &QueueMap : {Queue->ComputeQueueGroupsByTID, Queue->CopyQueueGroupsByTID}) { for (const auto &QueueGroup : QueueMap) { - if (Queue->Device->ImmCommandListUsed) { + if (Queue->UsingImmCmdLists) { // Immediate command lists are not associated with any Level Zero // queue, that's why we have to check status of events in each // immediate command list. Start checking from the end and exit early @@ -342,7 +342,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreate( uint32_t RepeatCount) -> ur_result_t { ur_command_list_ptr_t CommandList; while (RepeatCount--) { - if (Q->Device->ImmCommandListUsed) { + if (Q->UsingImmCmdLists) { CommandList = Q->getQueueGroup(UseCopyEngine).getImmCmdList(); } else { // Heuristically create some number of regular command-list to reuse. @@ -620,7 +620,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreateWithNativeHandle( UR_APIEXPORT ur_result_t UR_APICALL urQueueFinish( ur_queue_handle_t UrQueue ///< [in] handle of the queue to be finished. ) { - if (UrQueue->Device->ImmCommandListUsed) { + if (UrQueue->UsingImmCmdLists) { // Lock automatically releases when this goes out of scope. std::scoped_lock Lock(UrQueue->Mutex); @@ -677,7 +677,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueFinish( // Reset signalled command lists and return them back to the cache of // available command lists. Events in the immediate command lists are cleaned // up in synchronize(). - if (!UrQueue->Device->ImmCommandListUsed) { + if (!UrQueue->UsingImmCmdLists) { std::unique_lock Lock(UrQueue->Mutex); resetCommandLists(UrQueue); } @@ -854,8 +854,21 @@ ur_queue_handle_t_::ur_queue_handle_t_( bool OwnZeCommandQueue, ur_queue_flags_t Properties, int ForceComputeIndex) : Context{Context}, Device{Device}, OwnZeCommandQueue{OwnZeCommandQueue}, Properties(Properties) { - // Set the type of commandlists the queue will use. - UsingImmCmdLists = Device->useImmediateCommandLists(); + // Set the type of commandlists the queue will use when user-selected + // submission mode. Otherwise use env var setting and if unset, use default. + if (isBatchedSubmission()) + UsingImmCmdLists = false; + else if (isImmediateSubmission()) + UsingImmCmdLists = true; + else + UsingImmCmdLists = Device->useImmediateCommandLists(); + + // Set events scope for this queue. Non-immediate can be controlled by env + // var. Immediate always uses AllHostVisible. + if (!UsingImmCmdLists) { + ZeEventsScope = DeviceEventsSetting; + } + // Compute group initialization. // First, see if the queue's device allows for round-robin or it is // fixed to one particular compute CCS (it is so for sub-sub-devices). @@ -925,7 +938,7 @@ ur_queue_handle_t_::ur_queue_handle_t_( CopyQueueGroup.NextIndex = CopyQueueGroup.LowerIndex; // Create space to hold immediate commandlists corresponding to the // ZeQueues - if (Device->ImmCommandListUsed) { + if (UsingImmCmdLists) { CopyQueueGroup.ImmCmdLists = std::vector( CopyQueueGroup.ZeQueues.size(), CommandListMap.end()); } @@ -1033,7 +1046,7 @@ ur_queue_handle_t_::executeCommandList(ur_command_list_ptr_t CommandList, this->LastUsedCommandList = CommandList; - if (!Device->ImmCommandListUsed) { + if (!UsingImmCmdLists) { // Batch if allowed to, but don't batch if we know there are no kernels // from this queue that are currently executing. This is intended to get // kernels started as soon as possible when there are no kernels from this @@ -1086,7 +1099,7 @@ ur_queue_handle_t_::executeCommandList(ur_command_list_ptr_t CommandList, CaptureIndirectAccesses(); } - if (!Device->ImmCommandListUsed) { + if (!UsingImmCmdLists) { // In this mode all inner-batch events have device visibility only, // and we want the last command in the batch to signal a host-visible // event that anybody waiting for any event in the batch will @@ -1095,7 +1108,7 @@ ur_queue_handle_t_::executeCommandList(ur_command_list_ptr_t CommandList, // in the command list is not empty, otherwise we are going to just create // and remove proxy event right away and dereference deleted object // afterwards. - if (Device->ZeEventsScope == LastCommandInBatchHostVisible && + if (ZeEventsScope == LastCommandInBatchHostVisible && !CommandList->second.EventList.empty()) { // If there are only internal events in the command list then we don't // need to create host proxy event. @@ -1197,7 +1210,7 @@ ur_queue_handle_t_::executeCommandList(ur_command_list_ptr_t CommandList, // Check global control to make every command blocking for debugging. if (IsBlocking || (UrL0Serialize & UrL0SerializeBlock) != 0) { - if (Device->ImmCommandListUsed) { + if (UsingImmCmdLists) { synchronize(); } else { // Wait until command lists attached to the command queue are executed. @@ -1315,6 +1328,14 @@ bool ur_queue_handle_t_::isPriorityHigh() const { return ((this->Properties & UR_QUEUE_FLAG_PRIORITY_HIGH) != 0); } +bool ur_queue_handle_t_::isBatchedSubmission() const { + return ((this->Properties & UR_QUEUE_FLAG_SUBMISSION_BATCHED) != 0); +} + +bool ur_queue_handle_t_::isImmediateSubmission() const { + return ((this->Properties & UR_QUEUE_FLAG_SUBMISSION_IMMEDIATE) != 0); +} + bool ur_queue_handle_t_::isInOrderQueue() const { // If out-of-order queue property is not set, then this is a in-order queue. return ((this->Properties & UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE) == @@ -1389,7 +1410,7 @@ ur_result_t ur_queue_handle_t_::synchronize() { // so they can be reused later for (auto &QueueMap : {ComputeQueueGroupsByTID, CopyQueueGroupsByTID}) { for (auto &QueueGroup : QueueMap) { - if (Device->ImmCommandListUsed) { + if (UsingImmCmdLists) { for (auto &ImmCmdList : QueueGroup.second.ImmCmdLists) { if (ImmCmdList == this->CommandListMap.end()) continue; @@ -1405,7 +1426,7 @@ ur_result_t ur_queue_handle_t_::synchronize() { // Otherwise sync all L0 queues/immediate command-lists. for (auto &QueueMap : {ComputeQueueGroupsByTID, CopyQueueGroupsByTID}) { for (auto &QueueGroup : QueueMap) { - if (Device->ImmCommandListUsed) { + if (UsingImmCmdLists) { for (auto &ImmCmdList : QueueGroup.second.ImmCmdLists) syncImmCmdList(this, ImmCmdList); } else { @@ -1464,8 +1485,7 @@ ur_result_t createEventAndAssociateQueue(ur_queue_handle_t Queue, if (!HostVisible.has_value()) { // Internal/discarded events do not need host-scope visibility. - HostVisible = - IsInternal ? false : Queue->Device->ZeEventsScope == AllHostVisible; + HostVisible = IsInternal ? false : Queue->ZeEventsScope == AllHostVisible; } // If event is discarded then try to get event from the queue cache. @@ -1679,7 +1699,7 @@ ur_command_list_ptr_t ur_queue_handle_t_::eventOpenCommandList(ur_event_handle_t Event) { using IsCopy = bool; - if (Device->ImmCommandListUsed) { + if (UsingImmCmdLists) { // When using immediate commandlists there are no open command lists. return CommandListMap.end(); } diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/queue.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/queue.hpp index 692b1fd4a23ba..5485bfd173e61 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/queue.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/queue.hpp @@ -212,6 +212,11 @@ struct ur_queue_handle_t_ : _ur_object { // constructed, the caller chooses the type of commandlists to use. bool UsingImmCmdLists = false; + // Scope of events used for events on the queue + // Can be adjusted with UR_L0_DEVICE_SCOPE_EVENTS + // for non-immediate command lists + EventsScope ZeEventsScope = AllHostVisible; + // Keeps track of the event associated with the last enqueued command into // this queue. this is used to add dependency with the last command to add // in-order semantics and updated with the latest event each time a new @@ -394,6 +399,10 @@ struct ur_queue_handle_t_ : _ur_object { bool isPriorityLow() const; bool isPriorityHigh() const; + // Returns true if the queue has an explicitly selected submission mode. + bool isBatchedSubmission() const; + bool isImmediateSubmission() const; + // Wait for all commandlists associated with this Queue to finish operations. ur_result_t synchronize(); diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index bfbf42b775542..fb9c7e455dc46 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -483,6 +483,23 @@ class queue_impl { } CreationFlags |= PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_HIGH; } + // Track that submission modes do not conflict. + bool SubmissionSeen = false; + if (PropList.has_property< + ext::intel::property::queue::no_immediate_command_list>()) { + SubmissionSeen = true; + CreationFlags |= PI_EXT_QUEUE_FLAG_SUBMISSION_NO_IMMEDIATE; + } + if (PropList.has_property< + ext::intel::property::queue::immediate_command_list>()) { + if (SubmissionSeen) { + throw sycl::exception( + make_error_code(errc::invalid), + "Queue cannot be constructed with different submission modes."); + } + SubmissionSeen = true; + CreationFlags |= PI_EXT_QUEUE_FLAG_SUBMISSION_IMMEDIATE; + } return CreationFlags; } diff --git a/sycl/source/feature_test.hpp.in b/sycl/source/feature_test.hpp.in index c3766cd57beeb..6752728771626 100644 --- a/sycl/source/feature_test.hpp.in +++ b/sycl/source/feature_test.hpp.in @@ -83,6 +83,7 @@ __SYCL_INLINE_VER_NAMESPACE(_V1) { #define SYCL_EXT_INTEL_CACHE_CONFIG 1 #define SYCL_EXT_CODEPLAY_MAX_REGISTERS_PER_WORK_GROUP_QUERY 1 #define SYCL_EXT_ONEAPI_DEVICE_GLOBAL 1 +#define SYCL_EXT_INTEL_QUEUE_IMMEDIATE_COMMAND_LIST 1 #ifndef __has_include #define __has_include(x) 0 diff --git a/sycl/test-e2e/Plugin/queue_submit_mode.cpp b/sycl/test-e2e/Plugin/queue_submit_mode.cpp new file mode 100755 index 0000000000000..f89854d7df2ce --- /dev/null +++ b/sycl/test-e2e/Plugin/queue_submit_mode.cpp @@ -0,0 +1,39 @@ +// REQUIRES: gpu, level_zero +// RUN: %{build} %level_zero_options -o %t.out +// RUN: env ZE_DEBUG=4 SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{run} %t.out 0 2>&1 | FileCheck %s --check-prefixes=CHECK-STD +// RUN: env ZE_DEBUG=4 SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{run} %t.out 1 2>&1 | FileCheck %s --check-prefixes=CHECK-IMM +// +// Check that queue submission mode is honored when creating queue. +// +#include +#include + +using namespace sycl; + +void queue_submit(queue &Q) { + Q.submit([&](handler &cgh) { + cgh.single_task([=]() { + // [kernel code] + }); + }).wait(); +} + +// Command argument is 0 / 1 to select standard / immediate command lists. +int main(int argc, char *argv[]) { + bool Immediate = false; + if (argc > 1) { + Immediate = std::stoi(argv[1]) != 0; + } + property_list P; + if (Immediate) + P = ext::intel::property::queue::immediate_command_list(); + else + P = ext::intel::property::queue::no_immediate_command_list(); + + // CHECK-STD: zeCommandListCreateImmediate = 1 + // CHECK-IMM: zeCommandListCreateImmediate = 2 + queue Q1{P}; + queue_submit(Q1); + + return 0; +} diff --git a/sycl/test/abi/pi_level_zero_symbol_check.dump b/sycl/test/abi/pi_level_zero_symbol_check.dump index daeefdedb3066..2efc1eaa6fb3d 100644 --- a/sycl/test/abi/pi_level_zero_symbol_check.dump +++ b/sycl/test/abi/pi_level_zero_symbol_check.dump @@ -101,6 +101,8 @@ piextContextSetExtendedDeleter piextDeviceCreateWithNativeHandle piextDeviceGetNativeHandle piextDeviceSelectBinary +piextDisablePeerAccess +piextEnablePeerAccess piextEnqueueCommandBuffer piextEnqueueReadHostPipe piextEnqueueWriteHostPipe @@ -115,6 +117,7 @@ piextKernelSetArgSampler piextMemCreateWithNativeHandle piextMemGetNativeHandle piextMemImageCreateWithNativeHandle +piextPeerAccessGetInfo piextPlatformCreateWithNativeHandle piextPlatformGetNativeHandle piextPluginGetOpaqueData @@ -136,6 +139,3 @@ piextUSMFree piextUSMGetMemAllocInfo piextUSMHostAlloc piextUSMSharedAlloc -piextEnablePeerAccess -piextDisablePeerAccess -piextPeerAccessGetInfo diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index 5b36f93430d31..8c291e1786d40 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3948,6 +3948,9 @@ _ZN4sycl3_V16detail9join_implERKSt6vectorISt10shared_ptrINS1_18kernel_bundle_imp _ZN4sycl3_V16detail9link_implERKSt6vectorINS0_13kernel_bundleILNS0_12bundle_stateE1EEESaIS5_EERKS2_INS0_6deviceESaISA_EERKNS0_13property_listE _ZN4sycl3_V16device11get_devicesENS0_4info11device_typeE _ZN4sycl3_V16device26ext_oneapi_architecture_isENS0_3ext6oneapi12experimental12architectureE +_ZN4sycl3_V16device26ext_oneapi_can_access_peerERKS1_NS0_3ext6oneapi11peer_accessE +_ZN4sycl3_V16device29ext_oneapi_enable_peer_accessERKS1_ +_ZN4sycl3_V16device30ext_oneapi_disable_peer_accessERKS1_ _ZN4sycl3_V16deviceC1EP13_cl_device_id _ZN4sycl3_V16deviceC1ERKNS0_15device_selectorE _ZN4sycl3_V16deviceC1Ev @@ -4120,6 +4123,8 @@ _ZNK4sycl3_V15queue11get_backendEv _ZNK4sycl3_V15queue11get_contextEv _ZNK4sycl3_V15queue11is_in_orderEv _ZNK4sycl3_V15queue12get_propertyINS0_3ext5intel8property5queue13compute_indexEEET_v +_ZNK4sycl3_V15queue12get_propertyINS0_3ext5intel8property5queue22immediate_command_listEEET_v +_ZNK4sycl3_V15queue12get_propertyINS0_3ext5intel8property5queue25no_immediate_command_listEEET_v _ZNK4sycl3_V15queue12get_propertyINS0_3ext6oneapi4cuda8property5queue18use_default_streamEEET_v _ZNK4sycl3_V15queue12get_propertyINS0_3ext6oneapi8property5queue12priority_lowEEET_v _ZNK4sycl3_V15queue12get_propertyINS0_3ext6oneapi8property5queue13priority_highEEET_v @@ -4129,6 +4134,8 @@ _ZNK4sycl3_V15queue12get_propertyINS0_8property5queue16enable_profilingEEET_v _ZNK4sycl3_V15queue12get_propertyINS0_8property5queue4cuda18use_default_streamEEET_v _ZNK4sycl3_V15queue12get_propertyINS0_8property5queue8in_orderEEET_v _ZNK4sycl3_V15queue12has_propertyINS0_3ext5intel8property5queue13compute_indexEEEbv +_ZNK4sycl3_V15queue12has_propertyINS0_3ext5intel8property5queue22immediate_command_listEEEbv +_ZNK4sycl3_V15queue12has_propertyINS0_3ext5intel8property5queue25no_immediate_command_listEEEbv _ZNK4sycl3_V15queue12has_propertyINS0_3ext6oneapi4cuda8property5queue18use_default_streamEEEbv _ZNK4sycl3_V15queue12has_propertyINS0_3ext6oneapi8property5queue12priority_lowEEEbv _ZNK4sycl3_V15queue12has_propertyINS0_3ext6oneapi8property5queue13priority_highEEEbv @@ -4456,9 +4463,6 @@ _ZNK4sycl3_V16device8get_infoINS0_4info6device8atomic64EEENS0_6detail19is_device _ZNK4sycl3_V16device8get_infoINS0_4info6device8platformEEENS0_6detail19is_device_info_descIT_E11return_typeEv _ZNK4sycl3_V16device8get_infoINS0_4info6device9vendor_idEEENS0_6detail19is_device_info_descIT_E11return_typeEv _ZNK4sycl3_V16device9getNativeEv -_ZN4sycl3_V16device26ext_oneapi_can_access_peerERKS1_NS0_3ext6oneapi11peer_accessE -_ZN4sycl3_V16device29ext_oneapi_enable_peer_accessERKS1_ -_ZN4sycl3_V16device30ext_oneapi_disable_peer_accessERKS1_ _ZNK4sycl3_V16kernel11get_backendEv _ZNK4sycl3_V16kernel11get_contextEv _ZNK4sycl3_V16kernel13getNativeImplEv diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index 3e4d2f843dc95..74cafa3beef4d 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -219,6 +219,7 @@ ??$get_property@Vcontext_bound@image@property@_V1@sycl@@@stream@_V1@sycl@@QEBA?AVcontext_bound@image@property@12@XZ ??$get_property@Vdiscard_events@queue@property@oneapi@ext@_V1@sycl@@@queue@_V1@sycl@@QEBA?AVdiscard_events@0property@oneapi@ext@12@XZ ??$get_property@Venable_profiling@queue@property@_V1@sycl@@@queue@_V1@sycl@@QEBA?AVenable_profiling@0property@12@XZ +??$get_property@Vimmediate_command_list@queue@property@intel@ext@_V1@sycl@@@queue@_V1@sycl@@QEBA?AVimmediate_command_list@0property@intel@ext@12@XZ ??$get_property@Vin_order@queue@property@_V1@sycl@@@buffer_plain@detail@_V1@sycl@@IEBA?AVin_order@queue@property@23@XZ ??$get_property@Vin_order@queue@property@_V1@sycl@@@context@_V1@sycl@@QEBA?AVin_order@queue@property@12@XZ ??$get_property@Vin_order@queue@property@_V1@sycl@@@image_plain@detail@_V1@sycl@@IEBA?AVin_order@queue@property@23@XZ @@ -235,6 +236,7 @@ ??$get_property@Vmem_channel@buffer@property@_V1@sycl@@@image_plain@detail@_V1@sycl@@IEBA?AVmem_channel@buffer@property@23@XZ ??$get_property@Vmem_channel@buffer@property@_V1@sycl@@@sampler@_V1@sycl@@QEBA?AVmem_channel@buffer@property@12@XZ ??$get_property@Vmem_channel@buffer@property@_V1@sycl@@@stream@_V1@sycl@@QEBA?AVmem_channel@buffer@property@12@XZ +??$get_property@Vno_immediate_command_list@queue@property@intel@ext@_V1@sycl@@@queue@_V1@sycl@@QEBA?AVno_immediate_command_list@0property@intel@ext@12@XZ ??$get_property@Vno_init@property@_V1@sycl@@@buffer_plain@detail@_V1@sycl@@IEBA?AVno_init@property@23@XZ ??$get_property@Vno_init@property@_V1@sycl@@@context@_V1@sycl@@QEBA?AVno_init@property@12@XZ ??$get_property@Vno_init@property@_V1@sycl@@@image_plain@detail@_V1@sycl@@IEBA?AVno_init@property@23@XZ @@ -313,6 +315,7 @@ ??$has_property@Vcontext_bound@image@property@_V1@sycl@@@stream@_V1@sycl@@QEBA_NXZ ??$has_property@Vdiscard_events@queue@property@oneapi@ext@_V1@sycl@@@queue@_V1@sycl@@QEBA_NXZ ??$has_property@Venable_profiling@queue@property@_V1@sycl@@@queue@_V1@sycl@@QEBA_NXZ +??$has_property@Vimmediate_command_list@queue@property@intel@ext@_V1@sycl@@@queue@_V1@sycl@@QEBA_NXZ ??$has_property@Vin_order@queue@property@_V1@sycl@@@buffer_plain@detail@_V1@sycl@@IEBA_NXZ ??$has_property@Vin_order@queue@property@_V1@sycl@@@context@_V1@sycl@@QEBA_NXZ ??$has_property@Vin_order@queue@property@_V1@sycl@@@image_plain@detail@_V1@sycl@@IEBA_NXZ @@ -329,6 +332,7 @@ ??$has_property@Vmem_channel@buffer@property@_V1@sycl@@@image_plain@detail@_V1@sycl@@IEBA_NXZ ??$has_property@Vmem_channel@buffer@property@_V1@sycl@@@sampler@_V1@sycl@@QEBA_NXZ ??$has_property@Vmem_channel@buffer@property@_V1@sycl@@@stream@_V1@sycl@@QEBA_NXZ +??$has_property@Vno_immediate_command_list@queue@property@intel@ext@_V1@sycl@@@queue@_V1@sycl@@QEBA_NXZ ??$has_property@Vno_init@property@_V1@sycl@@@buffer_plain@detail@_V1@sycl@@IEBA_NXZ ??$has_property@Vno_init@property@_V1@sycl@@@context@_V1@sycl@@QEBA_NXZ ??$has_property@Vno_init@property@_V1@sycl@@@image_plain@detail@_V1@sycl@@IEBA_NXZ