Skip to content

[UR][L0] Fix L0 teardown checks for stability #17818

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Apr 8, 2025
2 changes: 1 addition & 1 deletion unified-runtime/cmake/FetchLevelZero.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ if (NOT DEFINED LEVEL_ZERO_LIBRARY OR NOT DEFINED LEVEL_ZERO_INCLUDE_DIR)
set(UR_LEVEL_ZERO_LOADER_REPO "https://github.com/oneapi-src/level-zero.git")
endif()
if (UR_LEVEL_ZERO_LOADER_TAG STREQUAL "")
set(UR_LEVEL_ZERO_LOADER_TAG v1.21.1)
set(UR_LEVEL_ZERO_LOADER_TAG ecfe375b30cc04265b20ac1b7996a85d0910f3ed)
endif()

# Disable due to a bug https://github.com/oneapi-src/level-zero/issues/104
Expand Down
10 changes: 6 additions & 4 deletions unified-runtime/source/adapters/level_zero/command_buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -445,16 +445,16 @@ void ur_exp_command_buffer_handle_t_::cleanupCommandBufferResources() {

// Release the memory allocated to the CommandList stored in the
// command_buffer
if (ZeComputeCommandList) {
if (ZeComputeCommandList && checkL0LoaderTeardown()) {
ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeComputeCommandList));
}
if (useCopyEngine() && ZeCopyCommandList) {
if (useCopyEngine() && ZeCopyCommandList && checkL0LoaderTeardown()) {
ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCopyCommandList));
}

// Release the memory allocated to the CommandListResetEvents stored in the
// command_buffer
if (ZeCommandListResetEvents) {
if (ZeCommandListResetEvents && checkL0LoaderTeardown()) {
ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCommandListResetEvents));
}

Expand Down Expand Up @@ -502,7 +502,9 @@ void ur_exp_command_buffer_handle_t_::cleanupCommandBufferResources() {
// Release fences allocated to command-buffer
for (auto &ZeFencePair : ZeFencesMap) {
auto &ZeFence = ZeFencePair.second;
ZE_CALL_NOCHECK(zeFenceDestroy, (ZeFence));
if (checkL0LoaderTeardown()) {
ZE_CALL_NOCHECK(zeFenceDestroy, (ZeFence));
}
}

auto ReleaseIndirectMem = [](ur_kernel_handle_t Kernel) {
Expand Down
68 changes: 8 additions & 60 deletions unified-runtime/source/adapters/level_zero/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include <unistd.h>
#endif

#include <loader/ze_loader.h>
#include <ur/ur.hpp>
#include <ur_ddi.h>
#include <ze_api.h>
Expand All @@ -38,65 +39,15 @@
struct _ur_platform_handle_t;

/// Reports whether it is still safe to issue Level Zero API calls.
///
/// Note the polarity: despite the name, this returns `true` when the loader is
/// NOT being torn down, and `false` when it is (or when the query itself
/// failed). Call sites in this adapter use it as a guard in front of
/// `ze*Destroy` calls, e.g. `if (Handle && checkL0LoaderTeardown()) { ... }`.
///
/// The state is obtained from the loader via `zelCheckIsLoaderInTearDown()`
/// (declared in <loader/ze_loader.h>).
///
/// @return `true` if `zelCheckIsLoaderInTearDown()` reports that the loader is
///         not in teardown; `false` if it reports teardown or if the query
///         threw.
[[maybe_unused]] static bool checkL0LoaderTeardown() {
  try {
    if (!zelCheckIsLoaderInTearDown()) {
      return true;
    }
  } catch (...) {
    // Intentionally swallowed: a throwing query is treated the same as
    // "loader in teardown", so fall through to the log + `false` below.
  }
  logger::debug(
      "ZE ---> checkL0LoaderTeardown: Loader is in teardown or is unstable");
  return false;
}

// Controls UR L0 calls tracing.
Expand Down Expand Up @@ -329,9 +280,6 @@ struct _ur_object {
// Indicates if we own the native handle or it came from interop that
// asked to not transfer the ownership to SYCL RT.
bool OwnNativeHandle = false;

// Indicates if this object is an interop handle.
bool IsInteropNativeHandle = false;
};

// Record for a memory allocation. This structure is used to keep information
Expand Down
69 changes: 46 additions & 23 deletions unified-runtime/source/adapters/level_zero/context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,6 @@ ur_result_t urContextCreateWithNativeHandle(
ur_context_handle_t_ *UrContext = new ur_context_handle_t_(
ZeContext, NumDevices, Devices, OwnNativeHandle);
UrContext->initialize();
UrContext->IsInteropNativeHandle = true;
*Context = reinterpret_cast<ur_context_handle_t>(UrContext);
} catch (const std::bad_alloc &) {
return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
Expand Down Expand Up @@ -264,11 +263,8 @@ ur_result_t ContextReleaseHelper(ur_context_handle_t Context) {
Contexts.erase(It);
}
ze_context_handle_t DestroyZeContext =
((Context->OwnNativeHandle && !Context->IsInteropNativeHandle) ||
(Context->OwnNativeHandle && Context->IsInteropNativeHandle &&
checkL0LoaderTeardown()))
? Context->ZeContext
: nullptr;
(Context->OwnNativeHandle && checkL0LoaderTeardown()) ? Context->ZeContext
: nullptr;

// Clean up any live memory associated with Context
ur_result_t Result = Context->finalize();
Expand All @@ -285,8 +281,12 @@ ur_result_t ContextReleaseHelper(ur_context_handle_t Context) {
if (DestroyZeContext) {
auto ZeResult = ZE_CALL_NOCHECK(zeContextDestroy, (DestroyZeContext));
// Gracefully handle the case that L0 was already unloaded.
if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
if (ZeResult && (ZeResult != ZE_RESULT_ERROR_UNINITIALIZED ||
ZeResult != ZE_RESULT_ERROR_UNKNOWN))
return ze2urResult(ZeResult);
if (ZeResult == ZE_RESULT_ERROR_UNKNOWN) {
ZeResult = ZE_RESULT_ERROR_UNINITIALIZED;
}
}

return Result;
Expand All @@ -307,12 +307,15 @@ ur_result_t ur_context_handle_t_::finalize() {
std::scoped_lock<ur_mutex> Lock(EventCacheMutex);
for (auto &EventCache : EventCaches) {
for (auto &Event : EventCache) {
if (!Event->IsInteropNativeHandle ||
(Event->IsInteropNativeHandle && checkL0LoaderTeardown())) {
if (checkL0LoaderTeardown()) {
auto ZeResult = ZE_CALL_NOCHECK(zeEventDestroy, (Event->ZeEvent));
// Gracefully handle the case that L0 was already unloaded.
if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
if (ZeResult && (ZeResult != ZE_RESULT_ERROR_UNINITIALIZED ||
ZeResult != ZE_RESULT_ERROR_UNKNOWN))
return ze2urResult(ZeResult);
if (ZeResult == ZE_RESULT_ERROR_UNKNOWN) {
ZeResult = ZE_RESULT_ERROR_UNINITIALIZED;
}
}
Event->ZeEvent = nullptr;
delete Event;
Expand All @@ -324,41 +327,61 @@ ur_result_t ur_context_handle_t_::finalize() {
std::scoped_lock<ur_mutex> Lock(ZeEventPoolCacheMutex);
for (auto &ZePoolCache : ZeEventPoolCache) {
for (auto &ZePool : ZePoolCache) {
auto ZeResult = ZE_CALL_NOCHECK(zeEventPoolDestroy, (ZePool));
// Gracefully handle the case that L0 was already unloaded.
if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
return ze2urResult(ZeResult);
if (checkL0LoaderTeardown()) {
auto ZeResult = ZE_CALL_NOCHECK(zeEventPoolDestroy, (ZePool));
// Gracefully handle the case that L0 was already unloaded.
if (ZeResult && (ZeResult != ZE_RESULT_ERROR_UNINITIALIZED ||
ZeResult != ZE_RESULT_ERROR_UNKNOWN))
return ze2urResult(ZeResult);
if (ZeResult == ZE_RESULT_ERROR_UNKNOWN) {
ZeResult = ZE_RESULT_ERROR_UNINITIALIZED;
}
}
}
ZePoolCache.clear();
}
}

// Destroy the command list used for initializations
auto ZeResult = ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCommandListInit));
// Gracefully handle the case that L0 was already unloaded.
if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
return ze2urResult(ZeResult);
if (checkL0LoaderTeardown()) {
// Destroy the command list used for initializations
auto ZeResult = ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCommandListInit));
// Gracefully handle the case that L0 was already unloaded.
if (ZeResult && (ZeResult != ZE_RESULT_ERROR_UNINITIALIZED ||
ZeResult != ZE_RESULT_ERROR_UNKNOWN))
return ze2urResult(ZeResult);
if (ZeResult == ZE_RESULT_ERROR_UNKNOWN) {
ZeResult = ZE_RESULT_ERROR_UNINITIALIZED;
}
}

std::scoped_lock<ur_mutex> Lock(ZeCommandListCacheMutex);
for (auto &List : ZeComputeCommandListCache) {
for (auto &Item : List.second) {
ze_command_list_handle_t ZeCommandList = Item.first;
if (ZeCommandList) {
if (ZeCommandList && checkL0LoaderTeardown()) {
auto ZeResult = ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCommandList));
// Gracefully handle the case that L0 was already unloaded.
if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
if (ZeResult && (ZeResult != ZE_RESULT_ERROR_UNINITIALIZED ||
ZeResult != ZE_RESULT_ERROR_UNKNOWN))
return ze2urResult(ZeResult);
if (ZeResult == ZE_RESULT_ERROR_UNKNOWN) {
ZeResult = ZE_RESULT_ERROR_UNINITIALIZED;
}
}
}
}
for (auto &List : ZeCopyCommandListCache) {
for (auto &Item : List.second) {
ze_command_list_handle_t ZeCommandList = Item.first;
if (ZeCommandList) {
if (ZeCommandList && checkL0LoaderTeardown()) {
auto ZeResult = ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCommandList));
// Gracefully handle the case that L0 was already unloaded.
if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
if (ZeResult && (ZeResult != ZE_RESULT_ERROR_UNINITIALIZED ||
ZeResult != ZE_RESULT_ERROR_UNKNOWN))
return ze2urResult(ZeResult);
if (ZeResult == ZE_RESULT_ERROR_UNKNOWN) {
ZeResult = ZE_RESULT_ERROR_UNINITIALIZED;
}
}
}
}
Expand Down
1 change: 0 additions & 1 deletion unified-runtime/source/adapters/level_zero/device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1536,7 +1536,6 @@ ur_result_t urDeviceCreateWithNativeHandle(
if (Dev == nullptr)
return UR_RESULT_ERROR_INVALID_VALUE;

Dev->IsInteropNativeHandle = true;
*Device = Dev;
return UR_RESULT_SUCCESS;
}
Expand Down
12 changes: 7 additions & 5 deletions unified-runtime/source/adapters/level_zero/event.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1001,7 +1001,6 @@ ur_result_t urEventCreateWithNativeHandle(
UREvent->CleanedUp = true;

*Event = reinterpret_cast<ur_event_handle_t>(UREvent);
UREvent->IsInteropNativeHandle = true;

return UR_RESULT_SUCCESS;
}
Expand Down Expand Up @@ -1090,7 +1089,7 @@ ur_result_t ur_event_handle_t_::getOrCreateHostVisibleEvent(
* leaks or resource mismanagement.
*/
ur_event_handle_t_::~ur_event_handle_t_() {
if (this->ZeEvent && this->Completed) {
if (this->ZeEvent && this->Completed && checkL0LoaderTeardown()) {
if (this->UrQueue && !this->UrQueue->isDiscardEvents())
ZE_CALL_NOCHECK(zeEventDestroy, (this->ZeEvent));
}
Expand Down Expand Up @@ -1121,12 +1120,15 @@ ur_result_t urEventReleaseInternal(ur_event_handle_t Event) {
}
if (Event->OwnNativeHandle) {
if (DisableEventsCaching) {
if (!Event->IsInteropNativeHandle ||
(Event->IsInteropNativeHandle && checkL0LoaderTeardown())) {
if (checkL0LoaderTeardown()) {
auto ZeResult = ZE_CALL_NOCHECK(zeEventDestroy, (Event->ZeEvent));
// Gracefully handle the case that L0 was already unloaded.
if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
if (ZeResult && (ZeResult != ZE_RESULT_ERROR_UNINITIALIZED ||
ZeResult != ZE_RESULT_ERROR_UNKNOWN))
return ze2urResult(ZeResult);
if (ZeResult == ZE_RESULT_ERROR_UNKNOWN) {
ZeResult = ZE_RESULT_ERROR_UNINITIALIZED;
}
}
Event->ZeEvent = nullptr;
auto Context = Event->Context;
Expand Down
4 changes: 3 additions & 1 deletion unified-runtime/source/adapters/level_zero/image.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -313,7 +313,9 @@ ur_result_t urBindlessImagesUnsampledImageHandleDestroyExp(
auto item = hDevice->ZeOffsetToImageHandleMap.find(hImage);

if (item != hDevice->ZeOffsetToImageHandleMap.end()) {
ZE2UR_CALL(zeImageDestroy, (item->second));
if (checkL0LoaderTeardown()) {
ZE2UR_CALL(zeImageDestroy, (item->second));
}
hDevice->ZeOffsetToImageHandleMap.erase(item);
} else {
return UR_RESULT_ERROR_INVALID_NULL_HANDLE;
Expand Down
10 changes: 6 additions & 4 deletions unified-runtime/source/adapters/level_zero/kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -940,12 +940,15 @@ ur_result_t urKernelRelease(
auto KernelProgram = Kernel->Program;
if (Kernel->OwnNativeHandle) {
for (auto &ZeKernel : Kernel->ZeKernels) {
if (!Kernel->IsInteropNativeHandle ||
(Kernel->IsInteropNativeHandle && checkL0LoaderTeardown())) {
if (checkL0LoaderTeardown()) {
auto ZeResult = ZE_CALL_NOCHECK(zeKernelDestroy, (ZeKernel));
// Gracefully handle the case that L0 was already unloaded.
if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
if (ZeResult && (ZeResult != ZE_RESULT_ERROR_UNINITIALIZED ||
ZeResult != ZE_RESULT_ERROR_UNKNOWN))
return ze2urResult(ZeResult);
if (ZeResult == ZE_RESULT_ERROR_UNKNOWN) {
ZeResult = ZE_RESULT_ERROR_UNINITIALIZED;
}
}
}
}
Expand Down Expand Up @@ -1157,7 +1160,6 @@ ur_result_t urKernelCreateWithNativeHandle(
}

Kernel->Program = Program;
Kernel->IsInteropNativeHandle = true;

UR_CALL(Kernel->initialize());

Expand Down
11 changes: 6 additions & 5 deletions unified-runtime/source/adapters/level_zero/memory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1563,7 +1563,6 @@ ur_result_t urMemImageCreateWithNativeHandle(
auto OwnNativeHandle = Properties ? Properties->isNativeHandleOwned : false;
UR_CALL(createUrMemFromZeImage(Context, ZeHImage, OwnNativeHandle,
ZeImageDesc, Mem));
(*Mem)->IsInteropNativeHandle = true;

return UR_RESULT_SUCCESS;
}
Expand Down Expand Up @@ -1663,13 +1662,16 @@ ur_result_t urMemRelease(
if (Image->OwnNativeHandle) {
UR_CALL(Mem->getZeHandle(ZeHandleImage, ur_mem_handle_t_::write_only,
nullptr, nullptr, 0u));
if (!Image->IsInteropNativeHandle ||
(Image->IsInteropNativeHandle && checkL0LoaderTeardown())) {
if (checkL0LoaderTeardown()) {
auto ZeResult = ZE_CALL_NOCHECK(
zeImageDestroy, (ur_cast<ze_image_handle_t>(ZeHandleImage)));
// Gracefully handle the case that L0 was already unloaded.
if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
if (ZeResult && (ZeResult != ZE_RESULT_ERROR_UNINITIALIZED ||
ZeResult != ZE_RESULT_ERROR_UNKNOWN))
return ze2urResult(ZeResult);
if (ZeResult == ZE_RESULT_ERROR_UNKNOWN) {
ZeResult = ZE_RESULT_ERROR_UNINITIALIZED;
}
}
}
delete Image;
Expand Down Expand Up @@ -1776,7 +1778,6 @@ ur_result_t urMemBufferCreateWithNativeHandle(
Buffer = new _ur_buffer(Context, Size, Device, ur_cast<char *>(NativeMem),
OwnNativeHandle);
*Mem = reinterpret_cast<ur_mem_handle_t>(Buffer);
(*Mem)->IsInteropNativeHandle = true;
} catch (const std::bad_alloc &) {
return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
} catch (...) {
Expand Down
6 changes: 4 additions & 2 deletions unified-runtime/source/adapters/level_zero/physical_mem.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,10 @@ ur_result_t urPhysicalMemRelease(ur_physical_mem_handle_t hPhysicalMem) {
if (!hPhysicalMem->RefCount.decrementAndTest())
return UR_RESULT_SUCCESS;

ZE2UR_CALL(zePhysicalMemDestroy, (hPhysicalMem->Context->getZeHandle(),
hPhysicalMem->ZePhysicalMem));
if (checkL0LoaderTeardown()) {
ZE2UR_CALL(zePhysicalMemDestroy, (hPhysicalMem->Context->getZeHandle(),
hPhysicalMem->ZePhysicalMem));
}
delete hPhysicalMem;

return UR_RESULT_SUCCESS;
Expand Down
Loading