From ab7b949c0a396f572782f45b1a7f95400fc25e4e Mon Sep 17 00:00:00 2001 From: Kevin B Smith Date: Thu, 29 Oct 2020 15:49:19 -0700 Subject: [PATCH 1/3] [SYCL][PI][L0] Fixes to make sure kernels and programs cannot be destroyed before the have finished execution. --- sycl/plugins/level_zero/pi_level_zero.cpp | 81 ++++++++++++++++++----- sycl/plugins/level_zero/pi_level_zero.hpp | 4 +- 2 files changed, 66 insertions(+), 19 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index c4463473a2759..0534758f95cdf 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -572,8 +572,9 @@ bool _pi_queue::isBatchingAllowed() { } pi_result _pi_queue::batchCommandList(ze_command_list_handle_t ZeCommandList, - ze_fence_handle_t ZeFence) { - if (this->isBatchingAllowed()) { + ze_fence_handle_t ZeFence, + bool OKToBatchKernel) { + if (OKToBatchKernel && this->isBatchingAllowed()) { assert(this->ZeOpenCommandList == nullptr || this->ZeOpenCommandList == ZeCommandList); @@ -2759,12 +2760,16 @@ pi_result piextProgramCreateWithNativeHandle(pi_native_handle NativeHandle, } _pi_program::~_pi_program() { - if (ZeModule) { - ZE_CALL_NOCHECK(zeModuleDestroy(ZeModule)); - } + // According to Level Zero Specification, all kernels and build logs + // must be destroyed before the Module can be destroyed. So, be sure + // to destroy build log before destroying the module. if (ZeBuildLog) { ZE_CALL_NOCHECK(zeModuleBuildLogDestroy(ZeBuildLog)); } + + if (ZeModule) { + ZE_CALL_NOCHECK(zeModuleDestroy(ZeModule)); + } } _pi_program::LinkedReleaser::~LinkedReleaser() { @@ -2902,6 +2907,10 @@ pi_result piKernelCreate(pi_program Program, const char *KernelName, } catch (...) { return PI_ERROR_UNKNOWN; } + + // Update the refcount of the program to show its use by this kernel. + piProgramRetain(Program); + return PI_SUCCESS; } @@ -3091,16 +3100,24 @@ pi_result piKernelRetain(pi_kernel Kernel) { assert(Kernel); ++(Kernel->RefCount); + // When retaining a kernel, you are also retaining the program it is part of. + piProgramRetain(Kernel->Program); return PI_SUCCESS; } pi_result piKernelRelease(pi_kernel Kernel) { assert(Kernel); + auto KernelProgram = Kernel->Program; + if (--(Kernel->RefCount) == 0) { zeKernelDestroy(Kernel->ZeKernel); delete Kernel; } + + // do a release on the program this kernel was part of + piProgramRelease(KernelProgram); + return PI_SUCCESS; } @@ -3186,11 +3203,19 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, // Lock automatically releases when this goes out of scope. std::lock_guard lock(Queue->PiQueueMutex); + // It is only OK to batch this kernel if there is an event that can be used + // to release the kernel once it has been executed. So we will not allow + // these to be batched, only executed immediately. + // Should this also force a synchronize immediately? That would also + // ensure that the kernel had completed execution before it could be + // released, but that may really hurt performance. + bool BatchingThisKernelOK = (Event); + // Get a new command list to be used on this call ze_command_list_handle_t ZeCommandList = nullptr; ze_fence_handle_t ZeFence = nullptr; - if (auto Res = Queue->Device->getAvailableCommandList(Queue, &ZeCommandList, - &ZeFence, true)) + if (auto Res = Queue->Device->getAvailableCommandList( + Queue, &ZeCommandList, &ZeFence, BatchingThisKernelOK)) return Res; ze_event_handle_t ZeEvent = nullptr; @@ -3202,6 +3227,13 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, (*Event)->Queue = Queue; (*Event)->CommandType = PI_COMMAND_TYPE_NDRANGE_KERNEL; (*Event)->ZeCommandList = ZeCommandList; + (*Event)->CommandData = (void *)Kernel; + + // Use piKernelRetain to increment the reference count and indicate + // that the Kernel is in use. Once the event has been signaled, the + // code in cleanupAfterEvent will do a piReleaseKernel to update + // the reference count on the kernel. + piKernelRetain(Kernel); ZeEvent = (*Event)->ZeEvent; } @@ -3227,7 +3259,8 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, // Execute command list asynchronously, as the event will be used // to track down its completion. - if (auto Res = Queue->batchCommandList(ZeCommandList, ZeFence)) + if (auto Res = + Queue->batchCommandList(ZeCommandList, ZeFence, BatchingThisKernelOK)) return Res; _pi_event::deleteZeEventList(ZeEventWaitList); @@ -3356,21 +3389,26 @@ pi_result piEventGetProfilingInfo(pi_event Event, pi_profiling_info ParamName, return PI_SUCCESS; } -// Recycle the command list associated with this event. -static void recycleEventCommandList(pi_event Event) { +// Perform any necessary cleanup after an event has been signalled. +// This currently recycles the associate command list, and also makes +// sure to release any kernel that may have been used by the event. +static void cleanupAfterEvent(pi_event Event) { // The implementation of this is slightly tricky. The same event // can be referred to by multiple threads, so it is possible to - // have a race condition between the read of ZeCommandList and - // it being reset to nullptr in another thread. - // But, since the ZeCommandList is uniquely associated with the queue + // have a race condition between the read of fields of the event, + // and resetiing those fields in some other thread. + // But, since the event is uniquely associated with the queue // for the event, we use the locking that we already have to do on the // queue to also serve as the thread safety mechanism for the - // Event's ZeCommandList. + // any of the Event's data members that need to be read/reset as + // part of the cleanup operations. auto Queue = Event->Queue; // Lock automatically releases when this goes out of scope. std::lock_guard lock(Queue->PiQueueMutex); + // Cleanup the command list associated with the event if it hasn't + // been cleaned up already. auto EventCommandList = Event->ZeCommandList; if (EventCommandList) { @@ -3386,6 +3424,13 @@ static void recycleEventCommandList(pi_event Event) { } } } + + // Release the kernel associated with this event if there is one. + if (Event->CommandType == PI_COMMAND_TYPE_NDRANGE_KERNEL && + Event->CommandData) { + piKernelRelease((pi_kernel)(Event->CommandData)); + Event->CommandData = nullptr; + } } pi_result piEventsWait(pi_uint32 NumEvents, const pi_event *EventList) { @@ -3412,9 +3457,9 @@ pi_result piEventsWait(pi_uint32 NumEvents, const pi_event *EventList) { zePrint("ZeEvent = %lx\n", pi_cast(ZeEvent)); ZE_CALL(zeEventHostSynchronize(ZeEvent, UINT32_MAX)); - // NOTE: we are destroying associated command lists here to free - // resources sooner in case RT is not calling piEventRelease soon enough. - recycleEventCommandList(EventList[I]); + // NOTE: we are cleaning up after the event here to free resources + // sooner in case run-time is not calling piEventRelease soon enough. + cleanupAfterEvent(EventList[I]); } return PI_SUCCESS; } @@ -3441,7 +3486,7 @@ pi_result piEventRetain(pi_event Event) { pi_result piEventRelease(pi_event Event) { assert(Event); if (--(Event->RefCount) == 0) { - recycleEventCommandList(Event); + cleanupAfterEvent(Event); if (Event->CommandType == PI_COMMAND_TYPE_MEM_BUFFER_UNMAP && Event->CommandData) { diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp index fe9566e172146..3bf442d72099d 100644 --- a/sycl/plugins/level_zero/pi_level_zero.hpp +++ b/sycl/plugins/level_zero/pi_level_zero.hpp @@ -328,8 +328,10 @@ struct _pi_queue : _pi_object { // Attach a command list to this queue and allow it to remain open // and used for further batching. It may be executed immediately, // or it may be left open for other future command to be batched into. + // OKToBatchKernel indicates whether for this particular kernel in the + // command list whether batching is allowed. pi_result batchCommandList(ze_command_list_handle_t ZeCommandList, - ze_fence_handle_t ZeFence); + ze_fence_handle_t ZeFence, bool OKToBatchKernel); // Attach a command list to this queue, close, and execute it. // Note that this command list cannot be appended to after this. From 48afabc58c194eaa2929fa57179054926e3d4370 Mon Sep 17 00:00:00 2001 From: Kevin B Smith Date: Fri, 30 Oct 2020 13:35:40 -0700 Subject: [PATCH 2/3] Address comments from initial code review. --- sycl/plugins/level_zero/pi_level_zero.cpp | 67 +++++++++++------------ sycl/plugins/level_zero/pi_level_zero.hpp | 18 +++--- 2 files changed, 41 insertions(+), 44 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 0534758f95cdf..4a9e47260a3e0 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -426,7 +426,7 @@ pi_result _pi_device::initialize() { pi_result _pi_queue::resetCommandListFenceEntry(ze_command_list_handle_t ZeCommandList, bool MakeAvailable) { - // Event has been signaled: If the fence for the associated command list + // Event has been signalled: If the fence for the associated command list // is signalled, then reset the fence and command list and add them to the // available list for ruse in PI calls. ZE_CALL(zeFenceReset(this->ZeCommandListFenceMap[ZeCommandList])); @@ -552,29 +552,9 @@ pi_result _pi_device::getAvailableCommandList( pi_result _pi_queue::executeCommandList(ze_command_list_handle_t ZeCommandList, ze_fence_handle_t ZeFence, - bool IsBlocking) { - // Close the command list and have it ready for dispatch. - ZE_CALL(zeCommandListClose(ZeCommandList)); - // Offload command list to the GPU for asynchronous execution - ZE_CALL(zeCommandQueueExecuteCommandLists(ZeCommandQueue, 1, &ZeCommandList, - ZeFence)); - - // Check global control to make every command blocking for debugging. - if (IsBlocking || (ZeSerialize & ZeSerializeBlock) != 0) { - // Wait until command lists attached to the command queue are executed. - ZE_CALL(zeCommandQueueSynchronize(ZeCommandQueue, UINT32_MAX)); - } - return PI_SUCCESS; -} - -bool _pi_queue::isBatchingAllowed() { - return (this->QueueBatchSize > 1 && ((ZeSerialize & ZeSerializeBlock) == 0)); -} - -pi_result _pi_queue::batchCommandList(ze_command_list_handle_t ZeCommandList, - ze_fence_handle_t ZeFence, - bool OKToBatchKernel) { - if (OKToBatchKernel && this->isBatchingAllowed()) { + bool IsBlocking, + bool OKToBatchCommand) { + if (OKToBatchCommand && this->isBatchingAllowed()) { assert(this->ZeOpenCommandList == nullptr || this->ZeOpenCommandList == ZeCommandList); @@ -597,7 +577,22 @@ pi_result _pi_queue::batchCommandList(ze_command_list_handle_t ZeCommandList, this->ZeOpenCommandListSize = 0; } - return executeCommandList(ZeCommandList, ZeFence); + // Close the command list and have it ready for dispatch. + ZE_CALL(zeCommandListClose(ZeCommandList)); + // Offload command list to the GPU for asynchronous execution + ZE_CALL(zeCommandQueueExecuteCommandLists(ZeCommandQueue, 1, &ZeCommandList, + ZeFence)); + + // Check global control to make every command blocking for debugging. + if (IsBlocking || (ZeSerialize & ZeSerializeBlock) != 0) { + // Wait until command lists attached to the command queue are executed. + ZE_CALL(zeCommandQueueSynchronize(ZeCommandQueue, UINT32_MAX)); + } + return PI_SUCCESS; +} + +bool _pi_queue::isBatchingAllowed() { + return (this->QueueBatchSize > 1 && ((ZeSerialize & ZeSerializeBlock) == 0)); } pi_result _pi_queue::executeOpenCommandList() { @@ -3209,13 +3204,13 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, // Should this also force a synchronize immediately? That would also // ensure that the kernel had completed execution before it could be // released, but that may really hurt performance. - bool BatchingThisKernelOK = (Event); + bool BatchingThisCommandOK = (Event); // Get a new command list to be used on this call ze_command_list_handle_t ZeCommandList = nullptr; ze_fence_handle_t ZeFence = nullptr; if (auto Res = Queue->Device->getAvailableCommandList( - Queue, &ZeCommandList, &ZeFence, BatchingThisKernelOK)) + Queue, &ZeCommandList, &ZeFence, BatchingThisCommandOK)) return Res; ze_event_handle_t ZeEvent = nullptr; @@ -3227,12 +3222,16 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, (*Event)->Queue = Queue; (*Event)->CommandType = PI_COMMAND_TYPE_NDRANGE_KERNEL; (*Event)->ZeCommandList = ZeCommandList; + + // Save the kernel in the event, so that when the event is signalled + // the code can do a piKernelRelease on this kernel. (*Event)->CommandData = (void *)Kernel; // Use piKernelRetain to increment the reference count and indicate - // that the Kernel is in use. Once the event has been signaled, the + // that the Kernel is in use. Once the event has been signalled, the // code in cleanupAfterEvent will do a piReleaseKernel to update - // the reference count on the kernel. + // the reference count on the kernel, using the kernel saved + // in CommandData. piKernelRetain(Kernel); ZeEvent = (*Event)->ZeEvent; @@ -3259,8 +3258,8 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, // Execute command list asynchronously, as the event will be used // to track down its completion. - if (auto Res = - Queue->batchCommandList(ZeCommandList, ZeFence, BatchingThisKernelOK)) + if (auto Res = Queue->executeCommandList(ZeCommandList, ZeFence, false, + BatchingThisCommandOK)) return Res; _pi_event::deleteZeEventList(ZeEventWaitList); @@ -3396,7 +3395,7 @@ static void cleanupAfterEvent(pi_event Event) { // The implementation of this is slightly tricky. The same event // can be referred to by multiple threads, so it is possible to // have a race condition between the read of fields of the event, - // and resetiing those fields in some other thread. + // and reseting those fields in some other thread. // But, since the event is uniquely associated with the queue // for the event, we use the locking that we already have to do on the // queue to also serve as the thread safety mechanism for the @@ -3412,7 +3411,7 @@ static void cleanupAfterEvent(pi_event Event) { auto EventCommandList = Event->ZeCommandList; if (EventCommandList) { - // Event has been signaled: If the fence for the associated command list + // Event has been signalled: If the fence for the associated command list // is signalled, then reset the fence and command list and add them to the // available list for reuse in PI calls. if (Queue->RefCount > 0) { @@ -3428,7 +3427,7 @@ static void cleanupAfterEvent(pi_event Event) { // Release the kernel associated with this event if there is one. if (Event->CommandType == PI_COMMAND_TYPE_NDRANGE_KERNEL && Event->CommandData) { - piKernelRelease((pi_kernel)(Event->CommandData)); + piKernelRelease(pi_cast(Event->CommandData)); Event->CommandData = nullptr; } } diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp index 3bf442d72099d..c552df2556fb3 100644 --- a/sycl/plugins/level_zero/pi_level_zero.hpp +++ b/sycl/plugins/level_zero/pi_level_zero.hpp @@ -325,22 +325,20 @@ struct _pi_queue : _pi_object { pi_result resetCommandListFenceEntry(ze_command_list_handle_t ZeCommandList, bool MakeAvailable); - // Attach a command list to this queue and allow it to remain open - // and used for further batching. It may be executed immediately, - // or it may be left open for other future command to be batched into. - // OKToBatchKernel indicates whether for this particular kernel in the - // command list whether batching is allowed. - pi_result batchCommandList(ze_command_list_handle_t ZeCommandList, - ze_fence_handle_t ZeFence, bool OKToBatchKernel); - // Attach a command list to this queue, close, and execute it. // Note that this command list cannot be appended to after this. - // The "IsBlocking" tells if the wait for completion is requested. + // The "IsBlocking" tells if the wait for completion is required. // The "ZeFence" passed is used to track when the command list passed // has completed execution on the device and can be reused. + // If OKToBatchCommand is true, then this command list may be executed + // immediately, or it may be left open for other future command to be + // batched into. + // If IsBlocking is true, then batching will not be allowed regardless + // of the value of OKToBatchCommand pi_result executeCommandList(ze_command_list_handle_t ZeCommandList, ze_fence_handle_t ZeFence, - bool IsBlocking = false); + bool IsBlocking = false, + bool OKToBatchCommand = false); // If there is an open command list associated with this queue, // close it, exceute it, and reset ZeOpenCommandList, ZeCommandListFence, From f6145c38e1b0503aac693ef5a87d69ffc8899f64 Mon Sep 17 00:00:00 2001 From: Kevin B Smith Date: Mon, 2 Nov 2020 20:59:03 -0800 Subject: [PATCH 3/3] More changes based on code review feedback. --- sycl/plugins/level_zero/pi_level_zero.cpp | 50 +++++++++-------------- 1 file changed, 20 insertions(+), 30 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 4a9e47260a3e0..ca77f6e271285 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -3124,6 +3124,7 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, const pi_event *EventWaitList, pi_event *Event) { assert(Kernel); assert(Queue); + assert(Event); assert((WorkDim > 0) && (WorkDim < 4)); if (GlobalWorkOffset != NULL) { for (pi_uint32 i = 0; i < WorkDim; i++) { @@ -3198,44 +3199,34 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, // Lock automatically releases when this goes out of scope. std::lock_guard lock(Queue->PiQueueMutex); - // It is only OK to batch this kernel if there is an event that can be used - // to release the kernel once it has been executed. So we will not allow - // these to be batched, only executed immediately. - // Should this also force a synchronize immediately? That would also - // ensure that the kernel had completed execution before it could be - // released, but that may really hurt performance. - bool BatchingThisCommandOK = (Event); - // Get a new command list to be used on this call ze_command_list_handle_t ZeCommandList = nullptr; ze_fence_handle_t ZeFence = nullptr; - if (auto Res = Queue->Device->getAvailableCommandList( - Queue, &ZeCommandList, &ZeFence, BatchingThisCommandOK)) + if (auto Res = Queue->Device->getAvailableCommandList(Queue, &ZeCommandList, + &ZeFence, true)) return Res; ze_event_handle_t ZeEvent = nullptr; - if (Event) { - auto Res = piEventCreate(Kernel->Program->Context, Event); - if (Res != PI_SUCCESS) - return Res; + auto Res = piEventCreate(Kernel->Program->Context, Event); + if (Res != PI_SUCCESS) + return Res; - (*Event)->Queue = Queue; - (*Event)->CommandType = PI_COMMAND_TYPE_NDRANGE_KERNEL; - (*Event)->ZeCommandList = ZeCommandList; + (*Event)->Queue = Queue; + (*Event)->CommandType = PI_COMMAND_TYPE_NDRANGE_KERNEL; + (*Event)->ZeCommandList = ZeCommandList; - // Save the kernel in the event, so that when the event is signalled - // the code can do a piKernelRelease on this kernel. - (*Event)->CommandData = (void *)Kernel; + // Save the kernel in the event, so that when the event is signalled + // the code can do a piKernelRelease on this kernel. + (*Event)->CommandData = (void *)Kernel; - // Use piKernelRetain to increment the reference count and indicate - // that the Kernel is in use. Once the event has been signalled, the - // code in cleanupAfterEvent will do a piReleaseKernel to update - // the reference count on the kernel, using the kernel saved - // in CommandData. - piKernelRetain(Kernel); + // Use piKernelRetain to increment the reference count and indicate + // that the Kernel is in use. Once the event has been signalled, the + // code in cleanupAfterEvent will do a piReleaseKernel to update + // the reference count on the kernel, using the kernel saved + // in CommandData. + piKernelRetain(Kernel); - ZeEvent = (*Event)->ZeEvent; - } + ZeEvent = (*Event)->ZeEvent; ze_event_handle_t *ZeEventWaitList = _pi_event::createZeEventList(NumEventsInWaitList, EventWaitList); @@ -3258,8 +3249,7 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, // Execute command list asynchronously, as the event will be used // to track down its completion. - if (auto Res = Queue->executeCommandList(ZeCommandList, ZeFence, false, - BatchingThisCommandOK)) + if (auto Res = Queue->executeCommandList(ZeCommandList, ZeFence, false, true)) return Res; _pi_event::deleteZeEventList(ZeEventWaitList);