diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.cpp b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.cpp
index 83f7181f004ed..18be59dec9ed0 100644
--- a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.cpp
+++ b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.cpp
@@ -98,7 +98,7 @@ ur_result_t ur_queue_immediate_out_of_order_t::queueGetInfo(
 ur_result_t ur_queue_immediate_out_of_order_t::queueGetNativeHandle(
     ur_queue_native_desc_t *pDesc, ur_native_handle_t *phNativeQueue) {
   *phNativeQueue = reinterpret_cast<ur_native_handle_t>(
-      (*commandListManagers.get_no_lock())[getNextCommandListId()]
+      (*commandListManagers.get_no_lock())[getNextCommandListId(false)]
          .getZeCommandList());
   if (pDesc && pDesc->pNativeData) {
     // pNativeData == isImmediateQueue
@@ -112,10 +112,16 @@ ur_result_t ur_queue_immediate_out_of_order_t::queueFinish() {
 
   auto commandListManagersLocked = commandListManagers.lock();
 
+  // Only synchronize command lists that have been used to avoid unnecessary
+  // synchronization overhead.
+  uint32_t usedMask =
+      usedCommandListsMask.exchange(0, std::memory_order_relaxed);
   for (size_t i = 0; i < numCommandLists; i++) {
-    ZE2UR_CALL(zeCommandListHostSynchronize,
-               (commandListManagersLocked[i].getZeCommandList(), UINT64_MAX));
-    UR_CALL(commandListManagersLocked[i].releaseSubmittedKernels());
+    if (usedMask & (1u << i)) {
+      ZE2UR_CALL(zeCommandListHostSynchronize,
+                 (commandListManagersLocked[i].getZeCommandList(), UINT64_MAX));
+      UR_CALL(commandListManagersLocked[i].releaseSubmittedKernels());
+    }
   }
 
   hContext->getAsyncPool()->cleanupPoolsForQueue(this);
@@ -164,6 +170,11 @@ ur_result_t ur_queue_immediate_out_of_order_t::enqueueEventsWaitWithBarrier(
 
   auto commandListManagersLocked = commandListManagers.lock();
 
+  // The barrier is appended to every command list, so mark them all as used;
+  // queueFinish() must then synchronize each of them.
+  usedCommandListsMask.fetch_or((1u << numCommandLists) - 1,
+                                std::memory_order_relaxed);
+
   // Enqueue wait for the user-provided events on the first command list.
   UR_CALL(commandListManagersLocked[0].appendEventsWait(waitListView,
                                                         barrierEvents[0]));
diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.hpp b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.hpp
index 06b16b9f640e0..bc51b572cfb5e 100644
--- a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.hpp
+++ b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.hpp
@@ -45,20 +45,33 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ {
 
   lockable<std::array<ur_command_list_manager, numCommandLists>>
       commandListManagers;
 
+  // Track which command lists have pending work to avoid unnecessary
+  // synchronization in queueFinish(). Each bit represents one command list.
+  std::atomic<uint32_t> usedCommandListsMask = 0;
+
   ur_queue_flags_t flags;
 
   std::array<ur_event_handle_t, numCommandLists> barrierEvents;
 
-  uint32_t getNextCommandListId() {
+  uint32_t getNextCommandListId(bool markUsed = true) {
     bool isGraphCaptureActive;
     auto &cmdListManager =
         (*commandListManagers.get_no_lock())[captureCmdListManagerIdx];
     cmdListManager.isGraphCaptureActive(&isGraphCaptureActive);
-    return isGraphCaptureActive
-               ? captureCmdListManagerIdx
-               : commandListIndex.fetch_add(1, std::memory_order_relaxed) %
-                     numCommandLists;
+    uint32_t id =
+        isGraphCaptureActive
+            ? captureCmdListManagerIdx
+            : commandListIndex.fetch_add(1, std::memory_order_relaxed) %
+                  numCommandLists;
+
+    if (markUsed) {
+      // Mark this command list as used so queueFinish() synchronizes only
+      // lists that actually carried work.
+      usedCommandListsMask.fetch_or(1u << id, std::memory_order_relaxed);
+    }
+
+    return id;
   }
 
 public:
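The pattern this diff introduces is small enough to stand alone: a per-queue atomic bitmask records which round-robin command lists ever received work, finish synchronizes only those lists, and the mask is read and cleared in a single atomic step. Below is a minimal C++17 sketch of that pattern, not the adapter code itself; `RoundRobinQueue` and `synchronizeList` are hypothetical stand-ins (the real queueFinish() calls zeCommandListHostSynchronize and releaseSubmittedKernels on each ur_command_list_manager, and the real ID selection also handles graph capture).

```cpp
#include <atomic>
#include <cstdint>
#include <cstdio>

// Hypothetical stand-in for the per-list work done in queueFinish().
static void synchronizeList(uint32_t i) {
  std::printf("sync list %u\n", static_cast<unsigned>(i));
}

class RoundRobinQueue {
  static constexpr uint32_t numCommandLists = 4; // must be <= 32 for a u32 mask
  std::atomic<uint32_t> commandListIndex{0};
  std::atomic<uint32_t> usedCommandListsMask{0};

public:
  // Hand out lists round-robin; optionally record the pick as "used" so
  // finish() knows it may hold pending work. Callers that only peek at a
  // list (e.g. to expose a native handle) pass markUsed = false.
  uint32_t getNextCommandListId(bool markUsed = true) {
    uint32_t id = commandListIndex.fetch_add(1, std::memory_order_relaxed) %
                  numCommandLists;
    if (markUsed)
      usedCommandListsMask.fetch_or(1u << id, std::memory_order_relaxed);
    return id;
  }

  // Synchronize only the lists whose bit is set. exchange(0) reads and
  // clears the mask in one atomic step, so two concurrent finishes cannot
  // both observe (and release) the same list.
  void finish() {
    uint32_t usedMask =
        usedCommandListsMask.exchange(0, std::memory_order_relaxed);
    for (uint32_t i = 0; i < numCommandLists; i++)
      if (usedMask & (1u << i))
        synchronizeList(i);
  }
};

int main() {
  RoundRobinQueue q;
  q.getNextCommandListId();      // marks list 0 as used
  q.getNextCommandListId(false); // peeks at list 1 without marking it
  q.finish();                    // prints "sync list 0" only
}
```

Relaxed ordering suffices here for the same reason it does in the diff: the mask is only a hint about which lists might hold work, and the host-synchronize call on each selected list provides the actual completion guarantee.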