@@ -426,7 +426,7 @@ pi_result _pi_device::initialize() {
426426pi_result
427427_pi_queue::resetCommandListFenceEntry (ze_command_list_handle_t ZeCommandList,
428428 bool MakeAvailable) {
429- // Event has been signaled : If the fence for the associated command list
429+ // Event has been signalled : If the fence for the associated command list
430430 // is signalled, then reset the fence and command list and add them to the
431431 // available list for ruse in PI calls.
432432 ZE_CALL (zeFenceReset (this ->ZeCommandListFenceMap [ZeCommandList]));
@@ -552,28 +552,9 @@ pi_result _pi_device::getAvailableCommandList(
552552
553553pi_result _pi_queue::executeCommandList (ze_command_list_handle_t ZeCommandList,
554554 ze_fence_handle_t ZeFence,
555- bool IsBlocking) {
556- // Close the command list and have it ready for dispatch.
557- ZE_CALL (zeCommandListClose (ZeCommandList));
558- // Offload command list to the GPU for asynchronous execution
559- ZE_CALL (zeCommandQueueExecuteCommandLists (ZeCommandQueue, 1 , &ZeCommandList,
560- ZeFence));
561-
562- // Check global control to make every command blocking for debugging.
563- if (IsBlocking || (ZeSerialize & ZeSerializeBlock) != 0 ) {
564- // Wait until command lists attached to the command queue are executed.
565- ZE_CALL (zeCommandQueueSynchronize (ZeCommandQueue, UINT32_MAX));
566- }
567- return PI_SUCCESS;
568- }
569-
570- bool _pi_queue::isBatchingAllowed () {
571- return (this ->QueueBatchSize > 1 && ((ZeSerialize & ZeSerializeBlock) == 0 ));
572- }
573-
574- pi_result _pi_queue::batchCommandList (ze_command_list_handle_t ZeCommandList,
575- ze_fence_handle_t ZeFence) {
576- if (this ->isBatchingAllowed ()) {
555+ bool IsBlocking,
556+ bool OKToBatchCommand) {
557+ if (OKToBatchCommand && this ->isBatchingAllowed ()) {
577558 assert (this ->ZeOpenCommandList == nullptr ||
578559 this ->ZeOpenCommandList == ZeCommandList);
579560
@@ -596,7 +577,22 @@ pi_result _pi_queue::batchCommandList(ze_command_list_handle_t ZeCommandList,
596577 this ->ZeOpenCommandListSize = 0 ;
597578 }
598579
599- return executeCommandList (ZeCommandList, ZeFence);
580+ // Close the command list and have it ready for dispatch.
581+ ZE_CALL (zeCommandListClose (ZeCommandList));
582+ // Offload command list to the GPU for asynchronous execution
583+ ZE_CALL (zeCommandQueueExecuteCommandLists (ZeCommandQueue, 1 , &ZeCommandList,
584+ ZeFence));
585+
586+ // Check global control to make every command blocking for debugging.
587+ if (IsBlocking || (ZeSerialize & ZeSerializeBlock) != 0 ) {
588+ // Wait until command lists attached to the command queue are executed.
589+ ZE_CALL (zeCommandQueueSynchronize (ZeCommandQueue, UINT32_MAX));
590+ }
591+ return PI_SUCCESS;
592+ }
593+
594+ bool _pi_queue::isBatchingAllowed () {
595+ return (this ->QueueBatchSize > 1 && ((ZeSerialize & ZeSerializeBlock) == 0 ));
600596}
601597
602598pi_result _pi_queue::executeOpenCommandList () {
@@ -2759,12 +2755,16 @@ pi_result piextProgramCreateWithNativeHandle(pi_native_handle NativeHandle,
27592755}
27602756
27612757_pi_program::~_pi_program () {
2762- if (ZeModule) {
2763- ZE_CALL_NOCHECK ( zeModuleDestroy (ZeModule));
2764- }
2758+ // According to Level Zero Specification, all kernels and build logs
2759+ // must be destroyed before the Module can be destroyed. So, be sure
2760+ // to destroy build log before destroying the module.
27652761 if (ZeBuildLog) {
27662762 ZE_CALL_NOCHECK (zeModuleBuildLogDestroy (ZeBuildLog));
27672763 }
2764+
2765+ if (ZeModule) {
2766+ ZE_CALL_NOCHECK (zeModuleDestroy (ZeModule));
2767+ }
27682768}
27692769
27702770_pi_program::LinkedReleaser::~LinkedReleaser () {
@@ -2902,6 +2902,10 @@ pi_result piKernelCreate(pi_program Program, const char *KernelName,
29022902 } catch (...) {
29032903 return PI_ERROR_UNKNOWN;
29042904 }
2905+
2906+ // Update the refcount of the program to show its use by this kernel.
2907+ piProgramRetain (Program);
2908+
29052909 return PI_SUCCESS;
29062910}
29072911
@@ -3091,16 +3095,24 @@ pi_result piKernelRetain(pi_kernel Kernel) {
30913095
30923096 assert (Kernel);
30933097 ++(Kernel->RefCount );
3098+ // When retaining a kernel, you are also retaining the program it is part of.
3099+ piProgramRetain (Kernel->Program );
30943100 return PI_SUCCESS;
30953101}
30963102
30973103pi_result piKernelRelease (pi_kernel Kernel) {
30983104
30993105 assert (Kernel);
3106+ auto KernelProgram = Kernel->Program ;
3107+
31003108 if (--(Kernel->RefCount ) == 0 ) {
31013109 zeKernelDestroy (Kernel->ZeKernel );
31023110 delete Kernel;
31033111 }
3112+
3113+ // do a release on the program this kernel was part of
3114+ piProgramRelease (KernelProgram);
3115+
31043116 return PI_SUCCESS;
31053117}
31063118
@@ -3112,6 +3124,7 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
31123124 const pi_event *EventWaitList, pi_event *Event) {
31133125 assert (Kernel);
31143126 assert (Queue);
3127+ assert (Event);
31153128 assert ((WorkDim > 0 ) && (WorkDim < 4 ));
31163129 if (GlobalWorkOffset != NULL ) {
31173130 for (pi_uint32 i = 0 ; i < WorkDim; i++) {
@@ -3194,17 +3207,26 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
31943207 return Res;
31953208
31963209 ze_event_handle_t ZeEvent = nullptr ;
3197- if (Event) {
3198- auto Res = piEventCreate (Kernel->Program ->Context , Event);
3199- if (Res != PI_SUCCESS)
3200- return Res;
3210+ auto Res = piEventCreate (Kernel->Program ->Context , Event);
3211+ if (Res != PI_SUCCESS)
3212+ return Res;
32013213
3202- (*Event)->Queue = Queue;
3203- (*Event)->CommandType = PI_COMMAND_TYPE_NDRANGE_KERNEL;
3204- (*Event)->ZeCommandList = ZeCommandList;
3214+ (*Event)->Queue = Queue;
3215+ (*Event)->CommandType = PI_COMMAND_TYPE_NDRANGE_KERNEL;
3216+ (*Event)->ZeCommandList = ZeCommandList;
32053217
3206- ZeEvent = (*Event)->ZeEvent ;
3207- }
3218+ // Save the kernel in the event, so that when the event is signalled
3219+ // the code can do a piKernelRelease on this kernel.
3220+ (*Event)->CommandData = (void *)Kernel;
3221+
3222+ // Use piKernelRetain to increment the reference count and indicate
3223+ // that the Kernel is in use. Once the event has been signalled, the
3224+ // code in cleanupAfterEvent will do a piReleaseKernel to update
3225+ // the reference count on the kernel, using the kernel saved
3226+ // in CommandData.
3227+ piKernelRetain (Kernel);
3228+
3229+ ZeEvent = (*Event)->ZeEvent ;
32083230
32093231 ze_event_handle_t *ZeEventWaitList =
32103232 _pi_event::createZeEventList (NumEventsInWaitList, EventWaitList);
@@ -3227,7 +3249,7 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
32273249
32283250 // Execute command list asynchronously, as the event will be used
32293251 // to track down its completion.
3230- if (auto Res = Queue->batchCommandList (ZeCommandList, ZeFence))
3252+ if (auto Res = Queue->executeCommandList (ZeCommandList, ZeFence, false , true ))
32313253 return Res;
32323254
32333255 _pi_event::deleteZeEventList (ZeEventWaitList);
@@ -3356,25 +3378,30 @@ pi_result piEventGetProfilingInfo(pi_event Event, pi_profiling_info ParamName,
33563378 return PI_SUCCESS;
33573379}
33583380
3359- // Recycle the command list associated with this event.
3360- static void recycleEventCommandList (pi_event Event) {
3381+ // Perform any necessary cleanup after an event has been signalled.
3382+ // This currently recycles the associate command list, and also makes
3383+ // sure to release any kernel that may have been used by the event.
3384+ static void cleanupAfterEvent (pi_event Event) {
33613385 // The implementation of this is slightly tricky. The same event
33623386 // can be referred to by multiple threads, so it is possible to
3363- // have a race condition between the read of ZeCommandList and
3364- // it being reset to nullptr in another thread.
3365- // But, since the ZeCommandList is uniquely associated with the queue
3387+ // have a race condition between the read of fields of the event,
3388+ // and reseting those fields in some other thread.
3389+ // But, since the event is uniquely associated with the queue
33663390 // for the event, we use the locking that we already have to do on the
33673391 // queue to also serve as the thread safety mechanism for the
3368- // Event's ZeCommandList.
3392+ // any of the Event's data members that need to be read/reset as
3393+ // part of the cleanup operations.
33693394 auto Queue = Event->Queue ;
33703395
33713396 // Lock automatically releases when this goes out of scope.
33723397 std::lock_guard<std::mutex> lock (Queue->PiQueueMutex );
33733398
3399+ // Cleanup the command list associated with the event if it hasn't
3400+ // been cleaned up already.
33743401 auto EventCommandList = Event->ZeCommandList ;
33753402
33763403 if (EventCommandList) {
3377- // Event has been signaled : If the fence for the associated command list
3404+ // Event has been signalled : If the fence for the associated command list
33783405 // is signalled, then reset the fence and command list and add them to the
33793406 // available list for reuse in PI calls.
33803407 if (Queue->RefCount > 0 ) {
@@ -3386,6 +3413,13 @@ static void recycleEventCommandList(pi_event Event) {
33863413 }
33873414 }
33883415 }
3416+
3417+ // Release the kernel associated with this event if there is one.
3418+ if (Event->CommandType == PI_COMMAND_TYPE_NDRANGE_KERNEL &&
3419+ Event->CommandData ) {
3420+ piKernelRelease (pi_cast<pi_kernel>(Event->CommandData ));
3421+ Event->CommandData = nullptr ;
3422+ }
33893423}
33903424
33913425pi_result piEventsWait (pi_uint32 NumEvents, const pi_event *EventList) {
@@ -3412,9 +3446,9 @@ pi_result piEventsWait(pi_uint32 NumEvents, const pi_event *EventList) {
34123446 zePrint (" ZeEvent = %lx\n " , pi_cast<std::uintptr_t >(ZeEvent));
34133447 ZE_CALL (zeEventHostSynchronize (ZeEvent, UINT32_MAX));
34143448
3415- // NOTE: we are destroying associated command lists here to free
3416- // resources sooner in case RT is not calling piEventRelease soon enough.
3417- recycleEventCommandList (EventList[I]);
3449+ // NOTE: we are cleaning up after the event here to free resources
3450+ // sooner in case run-time is not calling piEventRelease soon enough.
3451+ cleanupAfterEvent (EventList[I]);
34183452 }
34193453 return PI_SUCCESS;
34203454}
@@ -3441,7 +3475,7 @@ pi_result piEventRetain(pi_event Event) {
34413475pi_result piEventRelease (pi_event Event) {
34423476 assert (Event);
34433477 if (--(Event->RefCount ) == 0 ) {
3444- recycleEventCommandList (Event);
3478+ cleanupAfterEvent (Event);
34453479
34463480 if (Event->CommandType == PI_COMMAND_TYPE_MEM_BUFFER_UNMAP &&
34473481 Event->CommandData ) {
0 commit comments