From e4ecd60c1c48cab1308e361923a893dac99d060a Mon Sep 17 00:00:00 2001 From: Piotr Balcer Date: Thu, 25 Apr 2024 09:24:10 +0000 Subject: [PATCH] fix queue locking behavior when creating event lists This fixes a race when creating queue events without acquiring the appropriate locks, and also fixes a deadlock when acquiring multiple queue locks. The deadlock scenario is roughly this: queue1 = ...; queue2 = ...; for (;;) { queue1_event = urEnqueueKernelLaunch(queue1, ...); // lock order: queue2->Mutex, queue1->Mutex urEnqueueEventsWait(queue2, 1, [queue1_event], nullptr); } T2: for (;;) { queue2_event = urEnqueueKernelLaunch(queue2, ...); // lock order: queue1->Mutex, queue2->Mutex urEnqueueEventsWait(queue1, 1, [queue2_event], nullptr); } The solution in this patch is the simplest possible fix I managed to figure out, but I strongly believe this code requires a substantial refactor to better structure the synchronization logic. Right now it's a minefield. --- source/adapters/level_zero/event.cpp | 36 +++++++++++++++++++--------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp index 97ffe2f19e..36305ea0fd 100644 --- a/source/adapters/level_zero/event.cpp +++ b/source/adapters/level_zero/event.cpp @@ -1261,16 +1261,26 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList( } auto Queue = EventList[I]->UrQueue; - if (Queue) { - // The caller of createAndRetainUrZeEventList must already hold - // a lock of the CurQueue. Additionally lock the Queue if it - // is different from CurQueue. - // TODO: rework this to avoid deadlock when another thread is - // locking the same queues but in a different order. - auto Lock = ((Queue == CurQueue) - ? 
std::unique_lock<ur_shared_mutex>() - : std::unique_lock<ur_shared_mutex>(Queue->Mutex)); + auto CurQueueDevice = CurQueue->Device; + std::optional<std::unique_lock<ur_shared_mutex>> QueueLock = + std::nullopt; + // The caller of createAndRetainUrZeEventList must already hold + // a lock of the CurQueue. However, if the CurQueue is different + // from the Event's Queue, we need to drop that lock and + // acquire the Event's Queue lock. This is done to avoid a lock + // ordering issue. + // For the rest of this scope, CurQueue cannot be accessed. + // TODO: This solution is very error-prone. This requires a refactor + // to either have fine-granularity locks inside of the queues or + // to move any operations on queues other than CurQueue out + // of this scope. + if (Queue && Queue != CurQueue) { + CurQueue->Mutex.unlock(); + QueueLock = std::unique_lock<ur_shared_mutex>(Queue->Mutex); + } + + if (Queue) { // If the event that is going to be waited is in an open batch // different from where this next command is going to be added, // then we have to force execute of that open command-list @@ -1313,7 +1323,7 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList( } ur_command_list_ptr_t CommandList; - if (Queue && Queue->Device != CurQueue->Device) { + if (Queue && Queue->Device != CurQueueDevice) { // Get a command list prior to acquiring an event lock. // This prevents a potential deadlock with recursive // event locks. @@ -1323,7 +1333,7 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList( std::shared_lock Lock(EventList[I]->Mutex); - if (Queue && Queue->Device != CurQueue->Device && + if (Queue && Queue->Device != CurQueueDevice && !EventList[I]->IsMultiDevice) { ze_event_handle_t MultiDeviceZeEvent = nullptr; ur_event_handle_t MultiDeviceEvent; @@ -1357,6 +1367,10 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList( this->UrEventList[TmpListLength]->RefCount.increment(); } + if (QueueLock.has_value()) { + QueueLock.reset(); + CurQueue->Mutex.lock(); + } TmpListLength += 1; } }