diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7564724176..d04d8d5dae 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -4,7 +4,7 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 cmake_minimum_required(VERSION 3.20.0 FATAL_ERROR)
-project(unified-runtime VERSION 0.10.8)
+project(unified-runtime VERSION 0.10.9)
 
 # Check if unified runtime is built as a standalone project.
 if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR OR UR_STANDALONE_BUILD)
diff --git a/source/adapters/cuda/command_buffer.cpp b/source/adapters/cuda/command_buffer.cpp
index 2fdb6b08a3..444972dac2 100644
--- a/source/adapters/cuda/command_buffer.cpp
+++ b/source/adapters/cuda/command_buffer.cpp
@@ -354,14 +354,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
   UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
   UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
 
-  CUgraphNode GraphNode;
+  try {
+    CUgraphNode GraphNode;
 
-  std::vector<CUgraphNode> DepsList;
-  UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList,
-                                        pSyncPointWaitList, DepsList));
+    std::vector<CUgraphNode> DepsList;
+    UR_CHECK_ERROR(getNodesFromSyncPoints(
+        hCommandBuffer, numSyncPointsInWaitList, pSyncPointWaitList, DepsList));
 
-  if (*pGlobalWorkSize == 0) {
-    try {
+    if (*pGlobalWorkSize == 0) {
       // Create an empty node if the kernel workload size is zero
       UR_CHECK_ERROR(cuGraphAddEmptyNode(&GraphNode, hCommandBuffer->CudaGraph,
                                          DepsList.data(), DepsList.size()));
@@ -371,25 +371,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
       if (pSyncPoint) {
         *pSyncPoint = SyncPoint;
       }
-    } catch (ur_result_t Err) {
-      return Err;
+      return UR_RESULT_SUCCESS;
     }
-    return UR_RESULT_SUCCESS;
-  }
 
-  // Set the number of threads per block to the number of threads per warp
-  // by default unless user has provided a better number
-  size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
-  size_t BlocksPerGrid[3] = {1u, 1u, 1u};
+    // Set the number of threads per block to the number of threads per warp
+    // by default unless user has provided a better number
+    size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
+    size_t BlocksPerGrid[3] = {1u, 1u, 1u};
 
-  uint32_t LocalSize = hKernel->getLocalSize();
-  CUfunction CuFunc = hKernel->get();
-  UR_CHECK_ERROR(
-      setKernelParams(hCommandBuffer->Context, hCommandBuffer->Device, workDim,
-                      pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize,
-                      hKernel, CuFunc, ThreadsPerBlock, BlocksPerGrid));
+    uint32_t LocalSize = hKernel->getLocalSize();
+    CUfunction CuFunc = hKernel->get();
+    UR_CHECK_ERROR(setKernelParams(
+        hCommandBuffer->Context, hCommandBuffer->Device, workDim,
+        pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, hKernel, CuFunc,
+        ThreadsPerBlock, BlocksPerGrid));
 
-  try {
     // Set node param structure with the kernel related data
     auto &ArgIndices = hKernel->getArgIndices();
     CUDA_KERNEL_NODE_PARAMS NodeParams = {};
diff --git a/source/adapters/hip/command_buffer.cpp b/source/adapters/hip/command_buffer.cpp
index 4ff38626af..80064a0d80 100644
--- a/source/adapters/hip/command_buffer.cpp
+++ b/source/adapters/hip/command_buffer.cpp
@@ -324,14 +324,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
   UR_ASSERT(!(pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0),
             UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST);
 
-  hipGraphNode_t GraphNode;
-  std::vector<hipGraphNode_t> DepsList;
+  try {
+    hipGraphNode_t GraphNode;
+    std::vector<hipGraphNode_t> DepsList;
 
-  UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList,
-                                        pSyncPointWaitList, DepsList));
+    UR_CHECK_ERROR(getNodesFromSyncPoints(
+        hCommandBuffer, numSyncPointsInWaitList, pSyncPointWaitList, DepsList));
 
-  if (*pGlobalWorkSize == 0) {
-    try {
+    if (*pGlobalWorkSize == 0) {
       // Create an empty node if the kernel workload size is zero
       UR_CHECK_ERROR(hipGraphAddEmptyNode(&GraphNode, hCommandBuffer->HIPGraph,
                                           DepsList.data(), DepsList.size()));
@@ -341,24 +341,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
       if (pSyncPoint) {
         *pSyncPoint = SyncPoint;
       }
-    } catch (ur_result_t Err) {
-      return Err;
+      return UR_RESULT_SUCCESS;
     }
-    return UR_RESULT_SUCCESS;
-  }
 
-  // Set the number of threads per block to the number of threads per warp
-  // by default unless user has provided a better number
-  size_t ThreadsPerBlock[3] = {64u, 1u, 1u};
-  size_t BlocksPerGrid[3] = {1u, 1u, 1u};
+    // Set the number of threads per block to the number of threads per warp
+    // by default unless user has provided a better number
+    size_t ThreadsPerBlock[3] = {64u, 1u, 1u};
+    size_t BlocksPerGrid[3] = {1u, 1u, 1u};
 
-  uint32_t LocalSize = hKernel->getLocalSize();
-  hipFunction_t HIPFunc = hKernel->get();
-  UR_CHECK_ERROR(setKernelParams(
-      hCommandBuffer->Device, workDim, pGlobalWorkOffset, pGlobalWorkSize,
-      pLocalWorkSize, hKernel, HIPFunc, ThreadsPerBlock, BlocksPerGrid));
+    uint32_t LocalSize = hKernel->getLocalSize();
+    hipFunction_t HIPFunc = hKernel->get();
+    UR_CHECK_ERROR(setKernelParams(
+        hCommandBuffer->Device, workDim, pGlobalWorkOffset, pGlobalWorkSize,
+        pLocalWorkSize, hKernel, HIPFunc, ThreadsPerBlock, BlocksPerGrid));
 
-  try {
     // Set node param structure with the kernel related data
     auto &ArgIndices = hKernel->getArgIndices();
     hipKernelNodeParams NodeParams;
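The two command-buffer hunks above make the same structural change in both adapters: the whole body of urCommandBufferAppendKernelLaunchExp now sits inside a single try block, and the zero-size workload path returns success early from within it instead of carrying its own nested try/catch. A minimal, self-contained sketch of that control flow, with stand-in types replacing the UR and CUDA/HIP machinery (nothing below is the real API):

```cpp
#include <cstddef>
#include <vector>

enum class Result { Success, Error }; // stands in for ur_result_t

static void checkError(bool failed) {
  if (failed)
    throw Result::Error; // models UR_CHECK_ERROR throwing on failure
}

Result appendKernelLaunch(std::size_t globalWorkSize) {
  try {
    std::vector<int> depsList;  // models DepsList (unused in this sketch)
    checkError(false);          // models getNodesFromSyncPoints(...)

    if (globalWorkSize == 0) {
      checkError(false);        // models cuGraphAddEmptyNode(...)
      return Result::Success;   // early exit, still inside the try
    }

    checkError(false);          // models setKernelParams(...)
    // ... build and append the kernel node ...
  } catch (Result err) {
    return err;                 // single error funnel for every path
  }
  return Result::Success;
}
```

The payoff is that getNodesFromSyncPoints and setKernelParams, which previously ran outside any try block, can no longer leak a thrown ur_result_t out of the entry point.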
diff --git a/source/adapters/level_zero/adapter.cpp b/source/adapters/level_zero/adapter.cpp
index 5ae1d52e7b..d35d999d7b 100644
--- a/source/adapters/level_zero/adapter.cpp
+++ b/source/adapters/level_zero/adapter.cpp
@@ -35,7 +35,32 @@ class ur_legacy_sink : public logger::Sink {
   ~ur_legacy_sink() = default;
 };
 
-ur_result_t initPlatforms(PlatformVec &platforms) noexcept try {
+// Find the corresponding ZesDevice Handle for a given ZeDevice
+ur_result_t getZesDeviceHandle(zes_uuid_t coreDeviceUuid,
+                               zes_device_handle_t *ZesDevice,
+                               uint32_t *SubDeviceId, ze_bool_t *SubDevice) {
+  uint32_t ZesDriverCount = 0;
+  std::vector<zes_driver_handle_t> ZesDrivers;
+  std::vector<zes_device_handle_t> ZesDevices;
+  ze_result_t ZesResult = ZE_RESULT_ERROR_INVALID_ARGUMENT;
+  ZE2UR_CALL(GlobalAdapter->getSysManDriversFunctionPtr,
+             (&ZesDriverCount, nullptr));
+  ZesDrivers.resize(ZesDriverCount);
+  ZE2UR_CALL(GlobalAdapter->getSysManDriversFunctionPtr,
+             (&ZesDriverCount, ZesDrivers.data()));
+  for (uint32_t I = 0; I < ZesDriverCount; ++I) {
+    ZesResult = ZE_CALL_NOCHECK(
+        GlobalAdapter->getDeviceByUUIdFunctionPtr,
+        (ZesDrivers[I], coreDeviceUuid, ZesDevice, SubDevice, SubDeviceId));
+    if (ZesResult == ZE_RESULT_SUCCESS) {
+      return UR_RESULT_SUCCESS;
+    }
+  }
+  return UR_RESULT_ERROR_INVALID_ARGUMENT;
+}
+
+ur_result_t initPlatforms(PlatformVec &platforms,
+                          ze_result_t ZesResult) noexcept try {
   uint32_t ZeDriverCount = 0;
   ZE2UR_CALL(zeDriverGet, (&ZeDriverCount, nullptr));
   if (ZeDriverCount == 0) {
@@ -48,24 +73,43 @@ ur_result_t initPlatforms(PlatformVec &platforms) noexcept try {
   ZE2UR_CALL(zeDriverGet, (&ZeDriverCount, ZeDrivers.data()));
 
   for (uint32_t I = 0; I < ZeDriverCount; ++I) {
+    // Keep track of the first platform init for this Driver
+    bool DriverPlatformInit = false;
     ze_device_properties_t device_properties{};
     device_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
     uint32_t ZeDeviceCount = 0;
     ZE2UR_CALL(zeDeviceGet, (ZeDrivers[I], &ZeDeviceCount, nullptr));
     ZeDevices.resize(ZeDeviceCount);
     ZE2UR_CALL(zeDeviceGet, (ZeDrivers[I], &ZeDeviceCount, ZeDevices.data()));
+    auto platform = std::make_unique<ur_platform_handle_t_>(ZeDrivers[I]);
     // Check if this driver has GPU Devices
     for (uint32_t D = 0; D < ZeDeviceCount; ++D) {
       ZE2UR_CALL(zeDeviceGetProperties, (ZeDevices[D], &device_properties));
-
       if (ZE_DEVICE_TYPE_GPU == device_properties.type) {
-        // If this Driver is a GPU, save it as a usable platform.
-        auto platform = std::make_unique<ur_platform_handle_t_>(ZeDrivers[I]);
-        UR_CALL(platform->initialize());
-
-        // Save a copy in the cache for future uses.
-        platforms.push_back(std::move(platform));
-        break;
+        // Check if this driver's platform has already been init.
+        if (!DriverPlatformInit) {
+          // If this Driver is a GPU, save it as a usable platform.
+          UR_CALL(platform->initialize());
+
+          // Save a copy in the cache for future uses.
+          platforms.push_back(std::move(platform));
+          // Mark this driver's platform as init to prevent additional platforms
+          // from being created per driver.
+          DriverPlatformInit = true;
+        }
+        if (ZesResult == ZE_RESULT_SUCCESS) {
+          // Populate the Zes/Ze device mapping for this Ze Device into the last
+          // added platform which represents the current driver being queried.
+          ur_zes_device_handle_data_t ZesDeviceData;
+          zes_uuid_t ZesUUID;
+          std::memcpy(&ZesUUID, &device_properties.uuid, sizeof(zes_uuid_t));
+          if (getZesDeviceHandle(
+                  ZesUUID, &ZesDeviceData.ZesDevice, &ZesDeviceData.SubDeviceId,
+                  &ZesDeviceData.SubDevice) == UR_RESULT_SUCCESS) {
+            platforms.back()->ZedeviceToZesDeviceMap.insert(
+                std::make_pair(ZeDevices[D], std::move(ZesDeviceData)));
+          }
+        }
       }
     }
   }
@@ -147,8 +191,36 @@ ur_adapter_handle_t_::ur_adapter_handle_t_()
       return;
     }
 
+    // Dynamically load the new L0 SysMan separate init and new EXP apis
+    // separately. This must be done to avoid attempting to use symbols that do
+    // not exist in older loader runtimes.
+#ifdef _WIN32
+    HMODULE processHandle = GetModuleHandle(NULL);
+#else
+    HMODULE processHandle = nullptr;
+#endif
+    GlobalAdapter->getDeviceByUUIdFunctionPtr =
+        (zes_pfnDriverGetDeviceByUuidExp_t)ur_loader::LibLoader::getFunctionPtr(
+            processHandle, "zesDriverGetDeviceByUuidExp");
+    GlobalAdapter->getSysManDriversFunctionPtr =
+        (zes_pfnDriverGet_t)ur_loader::LibLoader::getFunctionPtr(
+            processHandle, "zesDriverGet");
+    GlobalAdapter->sysManInitFunctionPtr =
+        (zes_pfnInit_t)ur_loader::LibLoader::getFunctionPtr(processHandle,
+                                                            "zesInit");
+    if (GlobalAdapter->getDeviceByUUIdFunctionPtr &&
+        GlobalAdapter->getSysManDriversFunctionPtr &&
+        GlobalAdapter->sysManInitFunctionPtr) {
+      ze_init_flags_t L0ZesInitFlags = 0;
+      logger::debug("\nzesInit with flags value of {}\n",
+                    static_cast<int>(L0ZesInitFlags));
+      GlobalAdapter->ZesResult = ZE_CALL_NOCHECK(
+          GlobalAdapter->sysManInitFunctionPtr, (L0ZesInitFlags));
+    } else {
+      GlobalAdapter->ZesResult = ZE_RESULT_ERROR_UNINITIALIZED;
+    }
 
-    ur_result_t err = initPlatforms(platforms);
+    ur_result_t err = initPlatforms(platforms, *GlobalAdapter->ZesResult);
     if (err == UR_RESULT_SUCCESS) {
       result = std::move(platforms);
     } else {
diff --git a/source/adapters/level_zero/adapter.hpp b/source/adapters/level_zero/adapter.hpp
index 273cdb4193..53a58793e5 100644
--- a/source/adapters/level_zero/adapter.hpp
+++ b/source/adapters/level_zero/adapter.hpp
@@ -11,11 +11,13 @@
 #include "logger/ur_logger.hpp"
 
 #include <atomic>
+#include <optional>
 #include <map>
 #include <mutex>
 #include <unordered_map>
 #include <ur/ur.hpp>
+#include <zes_api.h>
 
 using PlatformVec = std::vector<std::unique_ptr<ur_platform_handle_t_>>;
 
@@ -26,7 +28,12 @@ struct ur_adapter_handle_t_ {
   std::atomic<uint32_t> RefCount = 0;
   std::mutex Mutex;
 
+  zes_pfnDriverGetDeviceByUuidExp_t getDeviceByUUIdFunctionPtr = nullptr;
+  zes_pfnDriverGet_t getSysManDriversFunctionPtr = nullptr;
+  zes_pfnInit_t sysManInitFunctionPtr = nullptr;
+
   std::optional<ze_result_t> ZeResult;
+  std::optional<ze_result_t> ZesResult;
   ZeCache<Result<PlatformVec>> PlatformCache;
   logger::Logger &logger;
 };
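The adapter changes above resolve zesDriverGetDeviceByUuidExp, zesDriverGet, and zesInit at runtime through ur_loader::LibLoader::getFunctionPtr, so the adapter keeps loading against older Level Zero loaders where those symbols are absent. A condensed, POSIX-only sketch of the same guarded-lookup idea using dlsym; the function-pointer alias is an assumption standing in for zes_pfnInit_t, and ZE_RESULT_SUCCESS is relied on being 0:

```cpp
#include <cstdint>
#include <dlfcn.h>

// Assumed stand-in for zes_pfnInit_t from zes_api.h.
using zes_init_fn = int32_t (*)(uint32_t flags);

bool tryInitSysMan() {
  // RTLD_DEFAULT searches symbols already mapped into this process,
  // analogous to the GetModuleHandle(NULL)/nullptr handle in the patch.
  auto zesInitPtr =
      reinterpret_cast<zes_init_fn>(dlsym(RTLD_DEFAULT, "zesInit"));
  if (!zesInitPtr)
    return false; // old loader: report "uninitialized" instead of crashing
  return zesInitPtr(0) == 0; // flags = 0, as in the patch
}
```

The outcome of this probe is cached once in GlobalAdapter->ZesResult and threaded into initPlatforms, so the per-device Zes mapping is only attempted when zesInit actually succeeded.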
diff --git a/source/adapters/level_zero/command_buffer.cpp b/source/adapters/level_zero/command_buffer.cpp
index ff4f0b56bc..61fd89d8e3 100644
--- a/source/adapters/level_zero/command_buffer.cpp
+++ b/source/adapters/level_zero/command_buffer.cpp
@@ -797,7 +797,7 @@ setKernelPendingArguments(ur_exp_command_buffer_handle_t CommandBuffer,
     char **ZeHandlePtr = nullptr;
     if (Arg.Value) {
       UR_CALL(Arg.Value->getZeHandlePtr(ZeHandlePtr, Arg.AccessMode,
-                                        CommandBuffer->Device));
+                                        CommandBuffer->Device, nullptr, 0u));
     }
     ZE2UR_CALL(zeKernelSetArgumentValue,
                (Kernel->ZeKernel, Arg.Index, Arg.Size, ZeHandlePtr));
@@ -950,10 +950,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp(
 
   char *ZeHandleSrc;
   UR_CALL(SrcBuffer->getZeHandle(ZeHandleSrc, ur_mem_handle_t_::read_only,
-                                 CommandBuffer->Device));
+                                 CommandBuffer->Device, nullptr, 0u));
   char *ZeHandleDst;
   UR_CALL(DstBuffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only,
-                                 CommandBuffer->Device));
+                                 CommandBuffer->Device, nullptr, 0u));
 
   bool PreferCopyEngine = (SrcBuffer->OnHost || DstBuffer->OnHost);
 
@@ -982,10 +982,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp(
 
   char *ZeHandleSrc;
   UR_CALL(SrcBuffer->getZeHandle(ZeHandleSrc, ur_mem_handle_t_::read_only,
-                                 CommandBuffer->Device));
+                                 CommandBuffer->Device, nullptr, 0u));
   char *ZeHandleDst;
   UR_CALL(DstBuffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only,
-                                 CommandBuffer->Device));
+                                 CommandBuffer->Device, nullptr, 0u));
 
   bool PreferCopyEngine = (SrcBuffer->OnHost || DstBuffer->OnHost);
 
@@ -1008,7 +1008,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp(
 
   char *ZeHandleDst = nullptr;
   UR_CALL(Buffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only,
-                              CommandBuffer->Device));
+                              CommandBuffer->Device, nullptr, 0u));
 
   // Always prefer copy engine for writes
   bool PreferCopyEngine = true;
@@ -1032,7 +1032,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp(
 
   char *ZeHandleDst = nullptr;
   UR_CALL(Buffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only,
-                              CommandBuffer->Device));
+                              CommandBuffer->Device, nullptr, 0u));
 
   // Always prefer copy engine for writes
   bool PreferCopyEngine = true;
@@ -1054,7 +1054,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadExp(
 
   char *ZeHandleSrc = nullptr;
   UR_CALL(Buffer->getZeHandle(ZeHandleSrc, ur_mem_handle_t_::read_only,
-                              CommandBuffer->Device));
+                              CommandBuffer->Device, nullptr, 0u));
 
   // Always prefer copy engine for reads
   bool PreferCopyEngine = true;
@@ -1077,7 +1077,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp(
 
   char *ZeHandleSrc;
   UR_CALL(Buffer->getZeHandle(ZeHandleSrc, ur_mem_handle_t_::read_only,
-                              CommandBuffer->Device));
+                              CommandBuffer->Device, nullptr, 0u));
 
   // Always prefer copy engine for reads
   bool PreferCopyEngine = true;
@@ -1202,7 +1202,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp(
   char *ZeHandleDst = nullptr;
   _ur_buffer *UrBuffer = reinterpret_cast<_ur_buffer *>(Buffer);
   UR_CALL(UrBuffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only,
-                                CommandBuffer->Device));
+                                CommandBuffer->Device, nullptr, 0u));
 
   return enqueueCommandBufferFillHelper(
       UR_COMMAND_MEM_BUFFER_FILL, CommandBuffer, ZeHandleDst + Offset,
@@ -1654,7 +1654,7 @@ ur_result_t updateKernelCommand(
     char **ZeHandlePtr = nullptr;
     if (NewMemObjArg) {
       UR_CALL(NewMemObjArg->getZeHandlePtr(ZeHandlePtr, UrAccessMode,
-                                           CommandBuffer->Device));
+                                           CommandBuffer->Device, nullptr, 0u));
     }
 
     auto ZeMutableArgDesc =
diff --git a/source/adapters/level_zero/device.cpp b/source/adapters/level_zero/device.cpp
index 08f13268eb..b3a60e78c5 100644
--- a/source/adapters/level_zero/device.cpp
+++ b/source/adapters/level_zero/device.cpp
@@ -701,11 +701,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(
   }
 
   case UR_DEVICE_INFO_GLOBAL_MEM_FREE: {
-    if (getenv("ZES_ENABLE_SYSMAN") == nullptr) {
-      setErrorMessage("Set ZES_ENABLE_SYSMAN=1 to obtain free memory",
-                      UR_RESULT_ERROR_UNINITIALIZED,
-                      static_cast<int32_t>(ZE_RESULT_ERROR_UNINITIALIZED));
-      return UR_RESULT_ERROR_ADAPTER_SPECIFIC;
+    bool SysManEnv = getenv_tobool("ZES_ENABLE_SYSMAN", false);
+    if ((Device->Platform->ZedeviceToZesDeviceMap.size() == 0) && !SysManEnv) {
+      logger::error("SysMan support is unavailable on this system. Please "
+                    "check your level zero driver installation.");
+      return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;
     }
     // Calculate the global memory size as the max limit that can be reported as
     // "free" memory for the user to allocate.
@@ -714,30 +714,57 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(
     // Currently this is only the one enumerated with ordinal 0.
     uint64_t FreeMemory = 0;
     uint32_t MemCount = 0;
-    ZE2UR_CALL(zesDeviceEnumMemoryModules, (ZeDevice, &MemCount, nullptr));
+
+    zes_device_handle_t ZesDevice = Device->ZeDevice;
+    struct ur_zes_device_handle_data_t ZesDeviceData = {};
+    // If legacy sysman is enabled thru the environment variable, then zesInit
+    // will fail, but sysman is still usable so go the legacy route.
+    if (!SysManEnv) {
+      auto It = Device->Platform->ZedeviceToZesDeviceMap.find(Device->ZeDevice);
+      if (It == Device->Platform->ZedeviceToZesDeviceMap.end()) {
+        // no matching device
+        return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;
+      } else {
+        ZesDeviceData =
+            Device->Platform->ZedeviceToZesDeviceMap[Device->ZeDevice];
+        ZesDevice = ZesDeviceData.ZesDevice;
+      }
+    }
+
+    ZE2UR_CALL(zesDeviceEnumMemoryModules, (ZesDevice, &MemCount, nullptr));
     if (MemCount != 0) {
       std::vector<zes_mem_handle_t> ZesMemHandles(MemCount);
       ZE2UR_CALL(zesDeviceEnumMemoryModules,
-                 (ZeDevice, &MemCount, ZesMemHandles.data()));
+                 (ZesDevice, &MemCount, ZesMemHandles.data()));
       for (auto &ZesMemHandle : ZesMemHandles) {
         ZesStruct<zes_mem_properties_t> ZesMemProperties;
         ZE2UR_CALL(zesMemoryGetProperties, (ZesMemHandle, &ZesMemProperties));
         // For root-device report memory from all memory modules since that
         // is what totally available in the default implicit scaling mode.
         // For sub-devices only report memory local to them.
-        if (!Device->isSubDevice() || Device->ZeDeviceProperties->subdeviceId ==
-                                          ZesMemProperties.subdeviceId) {
-
-          ZesStruct<zes_mem_state_t> ZesMemState;
-          ZE2UR_CALL(zesMemoryGetState, (ZesMemHandle, &ZesMemState));
-          FreeMemory += ZesMemState.free;
+        if (SysManEnv) {
+          if (!Device->isSubDevice() ||
+              Device->ZeDeviceProperties->subdeviceId ==
+                  ZesMemProperties.subdeviceId) {
+
+            ZesStruct<zes_mem_state_t> ZesMemState;
+            ZE2UR_CALL(zesMemoryGetState, (ZesMemHandle, &ZesMemState));
+            FreeMemory += ZesMemState.free;
+          }
+        } else {
+          if (ZesDeviceData.SubDeviceId == ZesMemProperties.subdeviceId ||
+              !ZesDeviceData.SubDevice) {
+            ZesStruct<zes_mem_state_t> ZesMemState;
+            ZE2UR_CALL(zesMemoryGetState, (ZesMemHandle, &ZesMemState));
+            FreeMemory += ZesMemState.free;
+          }
         }
       }
     }
     if (MemCount > 0) {
       return ReturnValue(std::min(GlobalMemSize, FreeMemory));
     } else {
-      return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+      return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;
     }
   }
   case UR_DEVICE_INFO_MEMORY_CLOCK_RATE: {
diff --git a/source/adapters/level_zero/kernel.cpp b/source/adapters/level_zero/kernel.cpp
index 44079d1c3d..0d66bd7a50 100644
--- a/source/adapters/level_zero/kernel.cpp
+++ b/source/adapters/level_zero/kernel.cpp
@@ -158,7 +158,8 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueKernelLaunch(
     char **ZeHandlePtr = nullptr;
     if (Arg.Value) {
       UR_CALL(Arg.Value->getZeHandlePtr(ZeHandlePtr, Arg.AccessMode,
-                                        Queue->Device));
+                                        Queue->Device, EventWaitList,
+                                        NumEventsInWaitList));
     }
     ZE2UR_CALL(zeKernelSetArgumentValue,
                (ZeKernel, Arg.Index, Arg.Size, ZeHandlePtr));
@@ -377,7 +378,8 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueCooperativeKernelLaunchExp(
     char **ZeHandlePtr = nullptr;
     if (Arg.Value) {
       UR_CALL(Arg.Value->getZeHandlePtr(ZeHandlePtr, Arg.AccessMode,
-                                        Queue->Device));
+                                        Queue->Device, EventWaitList,
+                                        NumEventsInWaitList));
     }
     ZE2UR_CALL(zeKernelSetArgumentValue,
                (ZeKernel, Arg.Index, Arg.Size, ZeHandlePtr));
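The UR_DEVICE_INFO_GLOBAL_MEM_FREE rework in device.cpp above reduces to: pick the SysMan device (from the new per-platform map unless the legacy ZES_ENABLE_SYSMAN route is active), then sum the free bytes of every memory module belonging to the queried device or sub-device, and clamp to the reported global size. A self-contained sketch of that aggregation, with plain structs standing in for the zes_mem_* handles and property queries:

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

struct MemModule {
  uint32_t subdeviceId; // models zes_mem_properties_t::subdeviceId
  uint64_t freeBytes;   // models zes_mem_state_t::free
};

uint64_t freeMemory(const std::vector<MemModule> &modules,
                    uint64_t globalMemSize, bool isSubDevice,
                    uint32_t subdeviceId) {
  uint64_t free = 0;
  for (const auto &m : modules) {
    // Root device: count every module (default implicit scaling);
    // sub-device: count only the modules local to it.
    if (!isSubDevice || m.subdeviceId == subdeviceId)
      free += m.freeBytes;
  }
  // Never report more free memory than the device advertises in total.
  return std::min(globalMemSize, free);
}
```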
diff --git a/source/adapters/level_zero/memory.cpp b/source/adapters/level_zero/memory.cpp
index 585a10ef4f..b23f0eb960 100644
--- a/source/adapters/level_zero/memory.cpp
+++ b/source/adapters/level_zero/memory.cpp
@@ -406,7 +406,8 @@ static ur_result_t enqueueMemImageCommandHelper(
 
     char *ZeHandleSrc = nullptr;
     UR_CALL(SrcMem->getZeHandle(ZeHandleSrc, ur_mem_handle_t_::read_only,
-                                Queue->Device));
+                                Queue->Device, EventWaitList,
+                                NumEventsInWaitList));
     ZE2UR_CALL(zeCommandListAppendImageCopyToMemory,
                (ZeCommandList, Dst, ur_cast<ze_image_handle_t>(ZeHandleSrc),
                 &ZeSrcRegion, ZeEvent, WaitList.Length, WaitList.ZeEventList));
@@ -439,7 +440,8 @@ static ur_result_t enqueueMemImageCommandHelper(
 
     char *ZeHandleDst = nullptr;
     UR_CALL(DstMem->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only,
-                                Queue->Device));
+                                Queue->Device, EventWaitList,
+                                NumEventsInWaitList));
     ZE2UR_CALL(zeCommandListAppendImageCopyFromMemory,
                (ZeCommandList, ur_cast<ze_image_handle_t>(ZeHandleDst), Src,
                 &ZeDstRegion, ZeEvent, WaitList.Length, WaitList.ZeEventList));
@@ -457,9 +459,11 @@ static ur_result_t enqueueMemImageCommandHelper(
     char *ZeHandleSrc = nullptr;
     char *ZeHandleDst = nullptr;
     UR_CALL(SrcImage->getZeHandle(ZeHandleSrc, ur_mem_handle_t_::read_only,
-                                  Queue->Device));
+                                  Queue->Device, EventWaitList,
+                                  NumEventsInWaitList));
     UR_CALL(DstImage->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only,
-                                  Queue->Device));
+                                  Queue->Device, EventWaitList,
+                                  NumEventsInWaitList));
     ZE2UR_CALL(zeCommandListAppendImageCopyRegion,
                (ZeCommandList, ur_cast<ze_image_handle_t>(ZeHandleDst),
                 ur_cast<ze_image_handle_t>(ZeHandleSrc), &ZeDstRegion,
@@ -501,7 +505,8 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueMemBufferRead(
 
   char *ZeHandleSrc = nullptr;
   UR_CALL(Src->getZeHandle(ZeHandleSrc, ur_mem_handle_t_::read_only,
-                           Queue->Device));
+                           Queue->Device, phEventWaitList,
+                           numEventsInWaitList));
   return enqueueMemCopyHelper(UR_COMMAND_MEM_BUFFER_READ, Queue, pDst,
                               blockingRead, size, ZeHandleSrc + offset,
                               numEventsInWaitList, phEventWaitList, phEvent,
@@ -536,7 +541,8 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueMemBufferWrite(
 
   char *ZeHandleDst = nullptr;
   UR_CALL(Buffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only,
-                              Queue->Device));
+                              Queue->Device, phEventWaitList,
+                              numEventsInWaitList));
   return enqueueMemCopyHelper(UR_COMMAND_MEM_BUFFER_WRITE, Queue,
                               ZeHandleDst + offset, // dst
                               blockingWrite, size,
@@ -582,7 +588,8 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueMemBufferReadRect(
 
   char *ZeHandleSrc;
   UR_CALL(Buffer->getZeHandle(ZeHandleSrc, ur_mem_handle_t_::read_only,
-                              Queue->Device));
+                              Queue->Device, phEventWaitList,
+                              numEventsInWaitList));
   return enqueueMemCopyRectHelper(
       UR_COMMAND_MEM_BUFFER_READ_RECT, Queue, ZeHandleSrc, pDst, bufferOffset,
       hostOffset, region, bufferRowPitch, hostRowPitch, bufferSlicePitch,
@@ -628,7 +635,8 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueMemBufferWriteRect(
 
   char *ZeHandleDst = nullptr;
   UR_CALL(Buffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only,
-                              Queue->Device));
+                              Queue->Device, phEventWaitList,
+                              numEventsInWaitList));
   return enqueueMemCopyRectHelper(
       UR_COMMAND_MEM_BUFFER_WRITE_RECT, Queue,
       const_cast<char *>(static_cast<const char *>(pSrc)), ZeHandleDst,
@@ -676,10 +684,12 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueMemBufferCopy(
 
   char *ZeHandleSrc = nullptr;
   UR_CALL(SrcBuffer->getZeHandle(ZeHandleSrc, ur_mem_handle_t_::read_only,
-                                 Queue->Device));
+                                 Queue->Device, EventWaitList,
+                                 NumEventsInWaitList));
   char *ZeHandleDst = nullptr;
   UR_CALL(DstBuffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only,
-                                 Queue->Device));
+                                 Queue->Device, EventWaitList,
+                                 NumEventsInWaitList));
 
   return enqueueMemCopyHelper(
       UR_COMMAND_MEM_BUFFER_COPY, Queue, ZeHandleDst + DstOffset,
@@ -735,10 +745,12 @@ ur_queue_handle_legacy_t_::enqueueMemBufferCopyRect( ///< [in] handle of the
 
   char *ZeHandleSrc = nullptr;
   UR_CALL(SrcBuffer->getZeHandle(ZeHandleSrc, ur_mem_handle_t_::read_only,
-                                 Queue->Device));
+                                 Queue->Device, EventWaitList,
+                                 NumEventsInWaitList));
   char *ZeHandleDst = nullptr;
   UR_CALL(DstBuffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only,
-                                 Queue->Device));
+                                 Queue->Device, EventWaitList,
+                                 NumEventsInWaitList));
 
   return enqueueMemCopyRectHelper(
       UR_COMMAND_MEM_BUFFER_COPY_RECT, Queue, ZeHandleSrc, ZeHandleDst,
@@ -773,7 +785,8 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueMemBufferFill(
   char *ZeHandleDst = nullptr;
   _ur_buffer *UrBuffer = reinterpret_cast<_ur_buffer *>(Buffer);
   UR_CALL(UrBuffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only,
-                                Queue->Device));
+                                Queue->Device, EventWaitList,
+                                NumEventsInWaitList));
 
   return enqueueMemFillHelper(
       UR_COMMAND_MEM_BUFFER_FILL, Queue, ZeHandleDst + Offset,
       Pattern, // It will be interpreted as an 8-bit value,
@@ -973,7 +986,8 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueMemBufferMap(
     std::scoped_lock<ur_shared_mutex> Guard(Buffer->Mutex);
 
     char *ZeHandleSrc;
-    UR_CALL(Buffer->getZeHandle(ZeHandleSrc, AccessMode, Queue->Device));
+    UR_CALL(Buffer->getZeHandle(ZeHandleSrc, AccessMode, Queue->Device,
+                                EventWaitList, NumEventsInWaitList));
 
     if (Buffer->MapHostPtr) {
       *RetMap = Buffer->MapHostPtr + Offset;
@@ -1030,7 +1044,8 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueMemBufferMap(
   const auto &WaitList = (*Event)->WaitList;
 
   char *ZeHandleSrc;
-  UR_CALL(Buffer->getZeHandle(ZeHandleSrc, AccessMode, Queue->Device));
+  UR_CALL(Buffer->getZeHandle(ZeHandleSrc, AccessMode, Queue->Device,
+                              EventWaitList, NumEventsInWaitList));
 
   UR_CALL(setSignalEvent(Queue, UseCopyEngine, &ZeEvent, Event,
                          NumEventsInWaitList, EventWaitList,
@@ -1127,7 +1142,8 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueMemUnmap(
 
   char *ZeHandleDst;
   UR_CALL(Buffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only,
-                              Queue->Device));
+                              Queue->Device, EventWaitList,
+                              NumEventsInWaitList));
 
   std::scoped_lock<ur_shared_mutex> Guard(Buffer->Mutex);
   if (Buffer->MapHostPtr)
@@ -1162,7 +1178,8 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueMemUnmap(
 
   char *ZeHandleDst;
   UR_CALL(Buffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only,
-                              Queue->Device));
+                              Queue->Device, EventWaitList,
+                              NumEventsInWaitList));
 
   UR_CALL(setSignalEvent(Queue, UseCopyEngine, &ZeEvent, Event,
                          NumEventsInWaitList, EventWaitList,
@@ -1646,7 +1663,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate(
     // allocation.
     char *ZeHandleDst;
     UR_CALL(Buffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only,
-                                Context->Devices[0]));
+                                Context->Devices[0], nullptr, 0u));
     if (Buffer->OnHost) {
       // Do a host to host copy.
       // For an imported HostPtr the copy is unneeded.
@@ -1688,7 +1705,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemRelease(
     char *ZeHandleImage;
     auto Image = static_cast<_ur_image *>(Mem);
     if (Image->OwnNativeHandle) {
-      UR_CALL(Mem->getZeHandle(ZeHandleImage, ur_mem_handle_t_::write_only));
+      UR_CALL(Mem->getZeHandle(ZeHandleImage, ur_mem_handle_t_::write_only,
+                               nullptr, nullptr, 0u));
       auto ZeResult = ZE_CALL_NOCHECK(
           zeImageDestroy, (ur_cast<ze_image_handle_t>(ZeHandleImage)));
       // Gracefully handle the case that L0 was already unloaded.
@@ -1748,7 +1766,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemGetNativeHandle(
 ) {
   std::shared_lock<ur_shared_mutex> Guard(Mem->Mutex);
   char *ZeHandle = nullptr;
-  UR_CALL(Mem->getZeHandle(ZeHandle, ur_mem_handle_t_::read_write));
+  UR_CALL(Mem->getZeHandle(ZeHandle, ur_mem_handle_t_::read_write, nullptr,
+                           nullptr, 0u));
   *NativeMem = ur_cast<ur_native_handle_t>(ZeHandle);
 
   return UR_RESULT_SUCCESS;
@@ -1839,8 +1858,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreateWithNativeHandle(
     // represent the buffer in this context) copy the data to a newly
     // created device allocation.
     char *ZeHandleDst;
-    UR_CALL(
-        Buffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, Device));
+    UR_CALL(Buffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only,
+                                Device, nullptr, 0u));
 
     // Indicate that this buffer has the device buffer mapped to a native buffer
     // and track the native pointer such that the memory is synced later at
@@ -1954,7 +1973,9 @@ static ur_result_t ZeDeviceMemAllocHelper(void **ResultPtr,
 }
 
 ur_result_t _ur_buffer::getZeHandle(char *&ZeHandle, access_mode_t AccessMode,
-                                    ur_device_handle_t Device) {
+                                    ur_device_handle_t Device,
+                                    const ur_event_handle_t *phWaitEvents,
+                                    uint32_t numWaitEvents) {
 
   // NOTE: There might be no valid allocation at all yet and we get
   // here from piEnqueueKernelLaunch that would be doing the buffer
@@ -1971,7 +1992,8 @@ ur_result_t _ur_buffer::getZeHandle(char *&ZeHandle, access_mode_t AccessMode,
 
   // Sub-buffers don't maintain own allocations but rely on parent buffer.
   if (SubBuffer) {
-    UR_CALL(SubBuffer->Parent->getZeHandle(ZeHandle, AccessMode, Device));
+    UR_CALL(SubBuffer->Parent->getZeHandle(ZeHandle, AccessMode, Device,
+                                           phWaitEvents, numWaitEvents));
     ZeHandle += SubBuffer->Origin;
     // Still store the allocation info in the PI sub-buffer for
     // getZeHandlePtr to work. At least zeKernelSetArgumentValue needs to
@@ -2043,7 +2065,8 @@ ur_result_t _ur_buffer::getZeHandle(char *&ZeHandle, access_mode_t AccessMode,
       // TODO: we can probably generalize this and share root-device
       // allocations by its own sub-devices even if not all other
      // devices in the context have the same root.
-      UR_CALL(getZeHandle(ZeHandle, AccessMode, UrContext->SingleRootDevice));
+      UR_CALL(getZeHandle(ZeHandle, AccessMode, UrContext->SingleRootDevice,
+                          phWaitEvents, numWaitEvents));
       Allocation.ReleaseAction = allocation_t::keep;
       Allocation.ZeHandle = ZeHandle;
       Allocation.Valid = true;
@@ -2085,7 +2108,8 @@ ur_result_t _ur_buffer::getZeHandle(char *&ZeHandle, access_mode_t AccessMode,
     char *ZeHandleSrc = nullptr;
     if (NeedCopy) {
       UR_CALL(getZeHandle(ZeHandleSrc, ur_mem_handle_t_::read_only,
-                          LastDeviceWithValidAllocation));
+                          LastDeviceWithValidAllocation, phWaitEvents,
+                          numWaitEvents));
       // It's possible with the single root-device contexts that
       // the buffer is represented by the single root-device
       // allocation and then skip the copy to itself.
@@ -2094,6 +2118,33 @@ ur_result_t _ur_buffer::getZeHandle(char *&ZeHandle, access_mode_t AccessMode,
     }
 
     if (NeedCopy) {
+      // Wait on all dependency events passed in to ensure that the memory which
+      // is being init is updated correctly.
+      _ur_ze_event_list_t waitlist;
+      waitlist.ZeEventList = nullptr;
+      waitlist.Length = 0;
+      uint32_t EventListIndex = 0;
+      for (unsigned i = 0; i < numWaitEvents; ++i) {
+        if (phWaitEvents[i]->HostVisibleEvent) {
+          ZE2UR_CALL(zeEventHostSynchronize,
+                     (phWaitEvents[i]->ZeEvent, UINT64_MAX));
+        } else {
+          // Generate the waitlist for the Copy calls based on the passed in
+          // dependencies, if they exist for device only.
+          if (waitlist.ZeEventList == nullptr) {
+            waitlist.ZeEventList = new ze_event_handle_t[numWaitEvents];
+          }
+          waitlist.ZeEventList[EventListIndex] = phWaitEvents[i]->ZeEvent;
+          waitlist.Length++;
+          EventListIndex++;
+        }
+      }
+      if (waitlist.Length > 0) {
+        ZE2UR_CALL(zeCommandListAppendWaitOnEvents,
+                   (UrContext->ZeCommandListInit, waitlist.Length,
+                    waitlist.ZeEventList));
+      }
+      // Copy valid buffer data to this allocation.
       // TODO: see if we should better use peer's device allocation used
       // directly, if that capability is reported with zeDeviceCanAccessPeer,
@@ -2131,7 +2182,7 @@ ur_result_t _ur_buffer::getZeHandle(char *&ZeHandle, access_mode_t AccessMode,
         if (!HostAllocation.Valid) {
           ZE2UR_CALL(zeCommandListAppendMemoryCopy,
                      (UrContext->ZeCommandListInit, HostAllocation.ZeHandle,
-                      ZeHandleSrc, Size, nullptr, 0, nullptr));
+                      ZeHandleSrc, Size, nullptr, 0u, nullptr));
           // Mark the host allocation data as valid so it can be reused.
           // It will be invalidated below if the current access is not
           // read-only.
@@ -2139,13 +2190,16 @@ ur_result_t _ur_buffer::getZeHandle(char *&ZeHandle, access_mode_t AccessMode,
         }
         ZE2UR_CALL(zeCommandListAppendMemoryCopy,
                    (UrContext->ZeCommandListInit, ZeHandle,
-                    HostAllocation.ZeHandle, Size, nullptr, 0, nullptr));
+                    HostAllocation.ZeHandle, Size, nullptr, 0u, nullptr));
       } else {
         // Perform P2P copy.
         std::scoped_lock<ur_mutex> Lock(UrContext->ImmediateCommandListMutex);
         ZE2UR_CALL(zeCommandListAppendMemoryCopy,
                    (UrContext->ZeCommandListInit, ZeHandle, ZeHandleSrc, Size,
-                    nullptr, 0, nullptr));
+                    nullptr, 0u, nullptr));
+      }
+      if (waitlist.ZeEventList) {
+        delete waitlist.ZeEventList;
       }
     }
     Allocation.Valid = true;
@@ -2274,9 +2328,12 @@ _ur_buffer::_ur_buffer(ur_context_handle_t Context, size_t Size,
 
 ur_result_t _ur_buffer::getZeHandlePtr(char **&ZeHandlePtr,
                                        access_mode_t AccessMode,
-                                       ur_device_handle_t Device) {
+                                       ur_device_handle_t Device,
+                                       const ur_event_handle_t *phWaitEvents,
+                                       uint32_t numWaitEvents) {
   char *ZeHandle;
-  UR_CALL(getZeHandle(ZeHandle, AccessMode, Device));
+  UR_CALL(
+      getZeHandle(ZeHandle, AccessMode, Device, phWaitEvents, numWaitEvents));
   ZeHandlePtr = &Allocations[Device].ZeHandle;
   return UR_RESULT_SUCCESS;
 }
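One detail of the waitlist block added to _ur_buffer::getZeHandle above: it allocates with new ze_event_handle_t[numWaitEvents] but frees with a plain delete, where the matching form would be delete[]. A std::vector expresses the same host-visible/device-only split without manual ownership at all; a sketch under that assumption, with int standing in for ze_event_handle_t and a (hostVisible, event) pair standing in for ur_event_handle_t_:

```cpp
#include <utility>
#include <vector>

std::vector<int> buildDeviceWaitList(const std::pair<bool, int> *events,
                                     unsigned count) {
  std::vector<int> deviceEvents;
  deviceEvents.reserve(count);
  for (unsigned i = 0; i < count; ++i) {
    if (events[i].first) {
      // Host-visible event: the patch blocks on it via zeEventHostSynchronize.
    } else {
      // Device-only event: collected for zeCommandListAppendWaitOnEvents.
      deviceEvents.push_back(events[i].second);
    }
  }
  return deviceEvents; // storage released automatically, no delete[] needed
}
```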
diff --git a/source/adapters/level_zero/memory.hpp b/source/adapters/level_zero/memory.hpp
index 43d548f16b..d4a0376eae 100644
--- a/source/adapters/level_zero/memory.hpp
+++ b/source/adapters/level_zero/memory.hpp
@@ -78,11 +78,15 @@ struct ur_mem_handle_t_ : _ur_object {
 
   // Get the Level Zero handle of the current memory object
   virtual ur_result_t getZeHandle(char *&ZeHandle, access_mode_t,
-                                  ur_device_handle_t Device = nullptr) = 0;
+                                  ur_device_handle_t Device,
+                                  const ur_event_handle_t *phWaitEvents,
+                                  uint32_t numWaitEvents) = 0;
 
   // Get a pointer to the Level Zero handle of the current memory object
   virtual ur_result_t getZeHandlePtr(char **&ZeHandlePtr, access_mode_t,
-                                     ur_device_handle_t Device = nullptr) = 0;
+                                     ur_device_handle_t Device,
+                                     const ur_event_handle_t *phWaitEvents,
+                                     uint32_t numWaitEvents) = 0;
 
   // Method to get type of the derived object (image or buffer)
   virtual bool isImage() const = 0;
@@ -122,10 +126,13 @@ struct _ur_buffer final : ur_mem_handle_t_ {
   // the hood.
   //
   virtual ur_result_t getZeHandle(char *&ZeHandle, access_mode_t,
-                                  ur_device_handle_t Device = nullptr) override;
-  virtual ur_result_t
-  getZeHandlePtr(char **&ZeHandlePtr, access_mode_t,
-                 ur_device_handle_t Device = nullptr) override;
+                                  ur_device_handle_t Device,
+                                  const ur_event_handle_t *phWaitEvents,
+                                  uint32_t numWaitEvents) override;
+  virtual ur_result_t getZeHandlePtr(char **&ZeHandlePtr, access_mode_t,
+                                     ur_device_handle_t Device,
+                                     const ur_event_handle_t *phWaitEvents,
+                                     uint32_t numWaitEvents) override;
 
   bool isImage() const override { return false; }
   bool isSubBuffer() const { return SubBuffer != std::nullopt; }
@@ -203,12 +210,20 @@ struct _ur_image final : ur_mem_handle_t_ {
   }
 
   virtual ur_result_t getZeHandle(char *&ZeHandle, access_mode_t,
-                                  ur_device_handle_t = nullptr) override {
+                                  ur_device_handle_t,
+                                  const ur_event_handle_t *phWaitEvents,
+                                  uint32_t numWaitEvents) override {
+    std::ignore = phWaitEvents;
+    std::ignore = numWaitEvents;
     ZeHandle = reinterpret_cast<char *>(ZeImage);
     return UR_RESULT_SUCCESS;
   }
   virtual ur_result_t getZeHandlePtr(char **&ZeHandlePtr, access_mode_t,
-                                     ur_device_handle_t = nullptr) override {
+                                     ur_device_handle_t,
+                                     const ur_event_handle_t *phWaitEvents,
+                                     uint32_t numWaitEvents) override {
+    std::ignore = phWaitEvents;
+    std::ignore = numWaitEvents;
     ZeHandlePtr = reinterpret_cast<char **>(&ZeImage);
     return UR_RESULT_SUCCESS;
   }
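platform.hpp below introduces ur_zes_device_handle_data_t and a per-platform unordered_map keyed by the core ze_device_handle_t. The map is filled once during initPlatforms and consulted on every GLOBAL_MEM_FREE query, so no SysMan driver enumeration happens on the query path. The lookup amounts to the following sketch (stand-in types, with std::optional in place of the adapter's error codes):

```cpp
#include <cstdint>
#include <optional>
#include <unordered_map>

using ZeDevice = void *; // stands in for ze_device_handle_t
struct ZesData {         // stands in for ur_zes_device_handle_data_t
  void *zesDevice;
  uint32_t subDeviceId;
  bool subDevice;
};

std::optional<ZesData>
lookupZes(const std::unordered_map<ZeDevice, ZesData> &cache, ZeDevice dev) {
  auto it = cache.find(dev);
  if (it == cache.end())
    return std::nullopt; // no SysMan twin: the query reports unsupported
  return it->second;
}
```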
diff --git a/source/adapters/level_zero/platform.hpp b/source/adapters/level_zero/platform.hpp
index fa15c88bdf..a498b1c75b 100644
--- a/source/adapters/level_zero/platform.hpp
+++ b/source/adapters/level_zero/platform.hpp
@@ -12,11 +12,18 @@
 #include "common.hpp"
 #include "ur_api.h"
 #include "ze_api.h"
+#include "zes_api.h"
 
 struct ur_device_handle_t_;
 
 typedef size_t DeviceId;
 
+struct ur_zes_device_handle_data_t {
+  zes_device_handle_t ZesDevice;
+  uint32_t SubDeviceId;
+  ze_bool_t SubDevice = false;
+};
+
 struct ur_platform_handle_t_ : public _ur_platform {
   ur_platform_handle_t_(ze_driver_handle_t Driver)
       : ZeDriver{Driver}, ZeApiVersion{ZE_API_VERSION_CURRENT} {}
@@ -27,6 +34,11 @@ struct ur_platform_handle_t_ : public _ur_platform {
   // a pretty good fit to keep here.
   ze_driver_handle_t ZeDriver;
 
+  // Cache of the ZesDevices mapped to the ZeDevices for use in zes apis calls
+  // based on a ze device handle.
+  std::unordered_map<ze_device_handle_t, ur_zes_device_handle_data_t>
+      ZedeviceToZesDeviceMap;
+
   // Given a multi driver scenario, the driver handle must be translated to the
   // internal driver handle to allow calls to driver experimental apis.
   ze_driver_handle_t ZeDriverHandleExpTranslated;
diff --git a/test/conformance/device/device_adapter_level_zero.match b/test/conformance/device/device_adapter_level_zero.match
index 162c342477..9989fbd774 100644
--- a/test/conformance/device/device_adapter_level_zero.match
+++ b/test/conformance/device/device_adapter_level_zero.match
@@ -1,3 +1,2 @@
 urDeviceCreateWithNativeHandleTest.SuccessWithUnOwnedNativeHandle
 {{OPT}}urDeviceGetGlobalTimestampTest.SuccessSynchronizedTime
-{{OPT}}urDeviceGetInfoTest.Success/UR_DEVICE_INFO_GLOBAL_MEM_FREE
diff --git a/test/conformance/memory-migrate/urMemBufferMigrateAcrossDevices.cpp b/test/conformance/memory-migrate/urMemBufferMigrateAcrossDevices.cpp
index 2e8856ac97..f7617a2940 100644
--- a/test/conformance/memory-migrate/urMemBufferMigrateAcrossDevices.cpp
+++ b/test/conformance/memory-migrate/urMemBufferMigrateAcrossDevices.cpp
@@ -165,6 +165,9 @@ TEST_F(urMultiDeviceContextMemBufferTest, WriteRead) {
   ASSERT_SUCCESS(urEnqueueMemBufferRead(queues[1], buffer, false, 0,
                                         buffer_size_bytes, out_vec.data(), 1,
                                         &e1, nullptr));
+
+  ASSERT_SUCCESS(urQueueFinish(queues[1]));
+
   for (auto &a : out_vec) {
     ASSERT_EQ(a, fill_val);
   }
@@ -186,6 +189,9 @@ TEST_F(urMultiDeviceContextMemBufferTest, FillRead) {
   ASSERT_SUCCESS(urEnqueueMemBufferRead(queues[1], buffer, false, 0,
                                         buffer_size_bytes, out_vec.data(), 1,
                                         &e1, nullptr));
+
+  ASSERT_SUCCESS(urQueueFinish(queues[1]));
+
   for (auto &a : out_vec) {
     ASSERT_EQ(a, fill_val);
   }
@@ -219,6 +225,9 @@ TEST_F(urMultiDeviceContextMemBufferTest, WriteKernelRead) {
   ASSERT_SUCCESS(urEnqueueMemBufferRead(queues[0], buffer, false, 0,
                                         buffer_size_bytes, out_vec.data(), 1,
                                         &e2, nullptr));
+
+  ASSERT_SUCCESS(urQueueFinish(queues[0]));
+
   for (auto &a : out_vec) {
     ASSERT_EQ(a, fill_val + 1);
   }
@@ -257,6 +266,9 @@ TEST_F(urMultiDeviceContextMemBufferTest, WriteKernelKernelRead) {
   ASSERT_SUCCESS(urEnqueueMemBufferRead(queues[1], buffer, false, 0,
                                         buffer_size_bytes, out_vec.data(), 1,
                                         &e3, nullptr));
+
+  ASSERT_SUCCESS(urQueueFinish(queues[1]));
+
   for (auto &a : out_vec) {
     ASSERT_EQ(a, fill_val + 2);
   }
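The conformance-test edits above all apply one rule: urEnqueueMemBufferRead with blocking=false gives no guarantee that out_vec is populated until the queue is drained, so validation must be preceded by urQueueFinish. The recurring pattern, condensed from the hunks above:

```cpp
ASSERT_SUCCESS(urEnqueueMemBufferRead(queues[1], buffer, /*blocking=*/false,
                                      0, buffer_size_bytes, out_vec.data(), 1,
                                      &e1, nullptr));
ASSERT_SUCCESS(urQueueFinish(queues[1])); // drain before checking results
for (auto &a : out_vec) {
  ASSERT_EQ(a, fill_val);
}
```

With the Level Zero buffer-migration copies now ordered against their dependency events, these tests exercise exactly the cross-device write-then-read paths that the memory.cpp change fixes.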