Skip to content
130 changes: 82 additions & 48 deletions sycl/plugins/level_zero/pi_level_zero.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -426,7 +426,7 @@ pi_result _pi_device::initialize() {
pi_result
_pi_queue::resetCommandListFenceEntry(ze_command_list_handle_t ZeCommandList,
bool MakeAvailable) {
// Event has been signaled: If the fence for the associated command list
// Event has been signalled: If the fence for the associated command list
// is signalled, then reset the fence and command list and add them to the
// available list for reuse in PI calls.
ZE_CALL(zeFenceReset(this->ZeCommandListFenceMap[ZeCommandList]));
Expand Down Expand Up @@ -552,28 +552,9 @@ pi_result _pi_device::getAvailableCommandList(

pi_result _pi_queue::executeCommandList(ze_command_list_handle_t ZeCommandList,
ze_fence_handle_t ZeFence,
bool IsBlocking) {
// Close the command list and have it ready for dispatch.
ZE_CALL(zeCommandListClose(ZeCommandList));
// Offload command list to the GPU for asynchronous execution
ZE_CALL(zeCommandQueueExecuteCommandLists(ZeCommandQueue, 1, &ZeCommandList,
ZeFence));

// Check global control to make every command blocking for debugging.
if (IsBlocking || (ZeSerialize & ZeSerializeBlock) != 0) {
// Wait until command lists attached to the command queue are executed.
ZE_CALL(zeCommandQueueSynchronize(ZeCommandQueue, UINT32_MAX));
}
return PI_SUCCESS;
}

bool _pi_queue::isBatchingAllowed() {
return (this->QueueBatchSize > 1 && ((ZeSerialize & ZeSerializeBlock) == 0));
}

pi_result _pi_queue::batchCommandList(ze_command_list_handle_t ZeCommandList,
ze_fence_handle_t ZeFence) {
if (this->isBatchingAllowed()) {
bool IsBlocking,
bool OKToBatchCommand) {
if (OKToBatchCommand && this->isBatchingAllowed()) {
assert(this->ZeOpenCommandList == nullptr ||
this->ZeOpenCommandList == ZeCommandList);

Expand All @@ -596,7 +577,22 @@ pi_result _pi_queue::batchCommandList(ze_command_list_handle_t ZeCommandList,
this->ZeOpenCommandListSize = 0;
}

return executeCommandList(ZeCommandList, ZeFence);
// Close the command list and have it ready for dispatch.
ZE_CALL(zeCommandListClose(ZeCommandList));
// Offload command list to the GPU for asynchronous execution
ZE_CALL(zeCommandQueueExecuteCommandLists(ZeCommandQueue, 1, &ZeCommandList,
ZeFence));

// Check global control to make every command blocking for debugging.
if (IsBlocking || (ZeSerialize & ZeSerializeBlock) != 0) {
// Wait until command lists attached to the command queue are executed.
ZE_CALL(zeCommandQueueSynchronize(ZeCommandQueue, UINT32_MAX));
}
return PI_SUCCESS;
}

bool _pi_queue::isBatchingAllowed() {
return (this->QueueBatchSize > 1 && ((ZeSerialize & ZeSerializeBlock) == 0));
}

pi_result _pi_queue::executeOpenCommandList() {
Expand Down Expand Up @@ -2759,12 +2755,16 @@ pi_result piextProgramCreateWithNativeHandle(pi_native_handle NativeHandle,
}

_pi_program::~_pi_program() {
  // According to the Level Zero Specification, all kernels and build logs
  // must be destroyed before the module can be destroyed.  So be sure
  // to destroy the build log before destroying the module.
  // (The previous text destroyed ZeModule twice -- once before and once
  // after the build log -- which is both a double-destroy and a violation
  // of the required destruction order.)
  if (ZeBuildLog) {
    ZE_CALL_NOCHECK(zeModuleBuildLogDestroy(ZeBuildLog));
  }

  if (ZeModule) {
    ZE_CALL_NOCHECK(zeModuleDestroy(ZeModule));
  }
}

_pi_program::LinkedReleaser::~LinkedReleaser() {
Expand Down Expand Up @@ -2902,6 +2902,10 @@ pi_result piKernelCreate(pi_program Program, const char *KernelName,
} catch (...) {
return PI_ERROR_UNKNOWN;
}

// Update the refcount of the program to show its use by this kernel.
piProgramRetain(Program);

return PI_SUCCESS;
}

Expand Down Expand Up @@ -3091,16 +3095,24 @@ pi_result piKernelRetain(pi_kernel Kernel) {

assert(Kernel);
++(Kernel->RefCount);
// When retaining a kernel, you are also retaining the program it is part of.
piProgramRetain(Kernel->Program);
return PI_SUCCESS;
}

pi_result piKernelRelease(pi_kernel Kernel) {

  assert(Kernel);
  // Capture the program before the kernel may be deleted below; the
  // paired piProgramRelease must happen even when this was the last
  // reference to the kernel (piKernelCreate/piKernelRetain each did a
  // matching piProgramRetain).
  auto KernelProgram = Kernel->Program;

  if (--(Kernel->RefCount) == 0) {
    // Use ZE_CALL_NOCHECK for consistency with other destroy paths in this
    // file (e.g. ~_pi_program): we are releasing and cannot meaningfully
    // propagate a destroy failure, but the result should not be silently
    // discarded by a bare call.
    ZE_CALL_NOCHECK(zeKernelDestroy(Kernel->ZeKernel));
    delete Kernel;
  }

  // Drop the reference on the program this kernel was part of.
  piProgramRelease(KernelProgram);

  return PI_SUCCESS;
}

Expand All @@ -3112,6 +3124,7 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
const pi_event *EventWaitList, pi_event *Event) {
assert(Kernel);
assert(Queue);
assert(Event);
assert((WorkDim > 0) && (WorkDim < 4));
if (GlobalWorkOffset != NULL) {
for (pi_uint32 i = 0; i < WorkDim; i++) {
Expand Down Expand Up @@ -3194,17 +3207,26 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
return Res;

ze_event_handle_t ZeEvent = nullptr;
if (Event) {
auto Res = piEventCreate(Kernel->Program->Context, Event);
if (Res != PI_SUCCESS)
return Res;
auto Res = piEventCreate(Kernel->Program->Context, Event);
if (Res != PI_SUCCESS)
return Res;

(*Event)->Queue = Queue;
(*Event)->CommandType = PI_COMMAND_TYPE_NDRANGE_KERNEL;
(*Event)->ZeCommandList = ZeCommandList;
(*Event)->Queue = Queue;
(*Event)->CommandType = PI_COMMAND_TYPE_NDRANGE_KERNEL;
(*Event)->ZeCommandList = ZeCommandList;

ZeEvent = (*Event)->ZeEvent;
}
// Save the kernel in the event, so that when the event is signalled
// the code can do a piKernelRelease on this kernel.
(*Event)->CommandData = (void *)Kernel;

// Use piKernelRetain to increment the reference count and indicate
// that the Kernel is in use. Once the event has been signalled, the
// code in cleanupAfterEvent will do a piKernelRelease to update
// the reference count on the kernel, using the kernel saved
// in CommandData.
piKernelRetain(Kernel);

ZeEvent = (*Event)->ZeEvent;

ze_event_handle_t *ZeEventWaitList =
_pi_event::createZeEventList(NumEventsInWaitList, EventWaitList);
Expand All @@ -3227,7 +3249,7 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,

// Execute command list asynchronously, as the event will be used
// to track down its completion.
if (auto Res = Queue->batchCommandList(ZeCommandList, ZeFence))
if (auto Res = Queue->executeCommandList(ZeCommandList, ZeFence, false, true))
return Res;

_pi_event::deleteZeEventList(ZeEventWaitList);
Expand Down Expand Up @@ -3356,25 +3378,30 @@ pi_result piEventGetProfilingInfo(pi_event Event, pi_profiling_info ParamName,
return PI_SUCCESS;
}

// Recycle the command list associated with this event.
static void recycleEventCommandList(pi_event Event) {
// Perform any necessary cleanup after an event has been signalled.
// This currently recycles the associated command list, and also makes
// sure to release any kernel that may have been used by the event.
static void cleanupAfterEvent(pi_event Event) {
// The implementation of this is slightly tricky. The same event
// can be referred to by multiple threads, so it is possible to
// have a race condition between the read of ZeCommandList and
// it being reset to nullptr in another thread.
// But, since the ZeCommandList is uniquely associated with the queue
// have a race condition between the read of fields of the event,
// and resetting those fields in some other thread.
// But, since the event is uniquely associated with the queue
// for the event, we use the locking that we already have to do on the
// queue to also serve as the thread safety mechanism for the
// Event's ZeCommandList.
// any of the Event's data members that need to be read/reset as
// part of the cleanup operations.
auto Queue = Event->Queue;

// Lock automatically releases when this goes out of scope.
std::lock_guard<std::mutex> lock(Queue->PiQueueMutex);

// Cleanup the command list associated with the event if it hasn't
// been cleaned up already.
auto EventCommandList = Event->ZeCommandList;

if (EventCommandList) {
// Event has been signaled: If the fence for the associated command list
// Event has been signalled: If the fence for the associated command list
// is signalled, then reset the fence and command list and add them to the
// available list for reuse in PI calls.
if (Queue->RefCount > 0) {
Expand All @@ -3386,6 +3413,13 @@ static void recycleEventCommandList(pi_event Event) {
}
}
}

// Release the kernel associated with this event if there is one.
if (Event->CommandType == PI_COMMAND_TYPE_NDRANGE_KERNEL &&
Event->CommandData) {
piKernelRelease(pi_cast<pi_kernel>(Event->CommandData));
Event->CommandData = nullptr;
}
}

pi_result piEventsWait(pi_uint32 NumEvents, const pi_event *EventList) {
Expand All @@ -3412,9 +3446,9 @@ pi_result piEventsWait(pi_uint32 NumEvents, const pi_event *EventList) {
zePrint("ZeEvent = %lx\n", pi_cast<std::uintptr_t>(ZeEvent));
ZE_CALL(zeEventHostSynchronize(ZeEvent, UINT32_MAX));

// NOTE: we are destroying associated command lists here to free
// resources sooner in case RT is not calling piEventRelease soon enough.
recycleEventCommandList(EventList[I]);
// NOTE: we are cleaning up after the event here to free resources
// sooner in case run-time is not calling piEventRelease soon enough.
cleanupAfterEvent(EventList[I]);
}
return PI_SUCCESS;
}
Expand All @@ -3441,7 +3475,7 @@ pi_result piEventRetain(pi_event Event) {
pi_result piEventRelease(pi_event Event) {
assert(Event);
if (--(Event->RefCount) == 0) {
recycleEventCommandList(Event);
cleanupAfterEvent(Event);

if (Event->CommandType == PI_COMMAND_TYPE_MEM_BUFFER_UNMAP &&
Event->CommandData) {
Expand Down
16 changes: 8 additions & 8 deletions sycl/plugins/level_zero/pi_level_zero.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -325,20 +325,20 @@ struct _pi_queue : _pi_object {
pi_result resetCommandListFenceEntry(ze_command_list_handle_t ZeCommandList,
bool MakeAvailable);

// Attach a command list to this queue and allow it to remain open
// and used for further batching. It may be executed immediately,
// or it may be left open for other future commands to be batched into.
pi_result batchCommandList(ze_command_list_handle_t ZeCommandList,
ze_fence_handle_t ZeFence);

// Attach a command list to this queue, close, and execute it.
// Note that this command list cannot be appended to after this.
// The "IsBlocking" tells if the wait for completion is requested.
// The "IsBlocking" tells if the wait for completion is required.
// The "ZeFence" passed is used to track when the command list passed
// has completed execution on the device and can be reused.
// If OKToBatchCommand is true, then this command list may be executed
// immediately, or it may be left open for other future commands to be
// batched into.
// If IsBlocking is true, then batching will not be allowed regardless
// of the value of OKToBatchCommand
pi_result executeCommandList(ze_command_list_handle_t ZeCommandList,
ze_fence_handle_t ZeFence,
bool IsBlocking = false);
bool IsBlocking = false,
bool OKToBatchCommand = false);

// If there is an open command list associated with this queue,
// close it, execute it, and reset ZeOpenCommandList, ZeCommandListFence,
Expand Down