diff --git a/include/ur_api.h b/include/ur_api.h index fc36731851..a7c7fefae6 100644 --- a/include/ur_api.h +++ b/include/ur_api.h @@ -5776,10 +5776,10 @@ typedef enum ur_command_t { UR_COMMAND_EXTERNAL_SEMAPHORE_SIGNAL_EXP = 0x2001, ///< Event created by ::urBindlessImagesSignalExternalSemaphoreExp UR_COMMAND_TIMESTAMP_RECORDING_EXP = 0x2002, ///< Event created by ::urEnqueueTimestampRecordingExp UR_COMMAND_ENQUEUE_NATIVE_EXP = 0x2004, ///< Event created by ::urEnqueueNativeCommandExp - UR_COMMAND_ENQUEUE_USM_DEVICE_ALLOC_EXP = 0x2008, ///< Event created by ::urEnqueueDeviceAllocExp - UR_COMMAND_ENQUEUE_USM_SHARED_ALLOC_EXP = 0x2010, ///< Event created by ::urEnqueueSharedAllocExp - UR_COMMAND_ENQUEUE_USM_HOST_ALLOC_EXP = 0x2011, ///< Event created by ::urEnqueueHostAllocExp - UR_COMMAND_ENQUEUE_USM_FREE_EXP = 0x2012, ///< Event created by ::urEnqueueFreeExp + UR_COMMAND_ENQUEUE_USM_DEVICE_ALLOC_EXP = 0x2008, ///< Event created by ::urEnqueueUSMDeviceAllocExp + UR_COMMAND_ENQUEUE_USM_SHARED_ALLOC_EXP = 0x2010, ///< Event created by ::urEnqueueUSMSharedAllocExp + UR_COMMAND_ENQUEUE_USM_HOST_ALLOC_EXP = 0x2011, ///< Event created by ::urEnqueueUSMHostAllocExp + UR_COMMAND_ENQUEUE_USM_FREE_EXP = 0x2012, ///< Event created by ::urEnqueueUSMFreeExp /// @cond UR_COMMAND_FORCE_UINT32 = 0x7fffffff /// @endcond diff --git a/scripts/core/exp-async-alloc.yml b/scripts/core/exp-async-alloc.yml index 0ce5ab813e..7e23dde60e 100644 --- a/scripts/core/exp-async-alloc.yml +++ b/scripts/core/exp-async-alloc.yml @@ -41,16 +41,16 @@ name: $x_command_t etors: - name: ENQUEUE_USM_DEVICE_ALLOC_EXP value: "0x2008" - desc: Event created by $xEnqueueDeviceAllocExp + desc: Event created by $xEnqueueUSMDeviceAllocExp - name: ENQUEUE_USM_SHARED_ALLOC_EXP value: "0x2010" - desc: Event created by $xEnqueueSharedAllocExp + desc: Event created by $xEnqueueUSMSharedAllocExp - name: ENQUEUE_USM_HOST_ALLOC_EXP value: "0x2011" - desc: Event created by $xEnqueueHostAllocExp + desc: Event created by 
$xEnqueueUSMHostAllocExp - name: ENQUEUE_USM_FREE_EXP value: "0x2012" - desc: Event created by $xEnqueueFreeExp + desc: Event created by $xEnqueueUSMFreeExp --- #-------------------------------------------------------------------------- type: enum diff --git a/source/adapters/level_zero/queue.cpp b/source/adapters/level_zero/queue.cpp index b498992bdd..d16fc330e8 100644 --- a/source/adapters/level_zero/queue.cpp +++ b/source/adapters/level_zero/queue.cpp @@ -1327,7 +1327,7 @@ ur_queue_handle_t_::executeCommandList(ur_command_list_ptr_t CommandList, Device->Platform->ContextsMutex, std::defer_lock); if (IndirectAccessTrackingEnabled) { - // We are going to submit kernels for execution. If indirect access flag is + // We are going to submit kernels for execution. If indirect access flag is // set for a kernel then we need to make a snapshot of existing memory // allocations in all contexts in the platform. We need to lock the mutex // guarding the list of contexts in the platform to prevent creation of new diff --git a/source/adapters/level_zero/usm.cpp b/source/adapters/level_zero/usm.cpp index 28bdf233e8..c395815ed7 100644 --- a/source/adapters/level_zero/usm.cpp +++ b/source/adapters/level_zero/usm.cpp @@ -787,6 +787,255 @@ ur_result_t urUSMReleaseExp(ur_context_handle_t Context, void *HostPtr) { Context->getPlatform()->ZeDriverHandleExpTranslated, HostPtr); return UR_RESULT_SUCCESS; } + +enum class USMAllocType { Host = 0, Device = 1, Shared = 2 }; + +static ur_result_t USMAllocHelper(ur_context_handle_t Context, + ur_device_handle_t Device, size_t Size, + void **RetMem, USMAllocType Type) { + auto &Platform = Device->Platform; + + // TODO: Should alignment be passed in 'ur_exp_async_usm_alloc_properties_t'? 
+ uint32_t Alignment = 0; + + std::shared_lock ContextLock(Context->Mutex, + std::defer_lock); + std::unique_lock IndirectAccessTrackingLock( + Platform->ContextsMutex, std::defer_lock); + if (IndirectAccessTrackingEnabled) { + IndirectAccessTrackingLock.lock(); + UR_CALL(ur::level_zero::urContextRetain(Context)); + } else { + ContextLock.lock(); + } + + umf_memory_pool_handle_t hPoolInternal = nullptr; + switch (Type) { + case USMAllocType::Host: + hPoolInternal = Context->HostMemPool.get(); + break; + case USMAllocType::Device: { + auto It = Context->DeviceMemPools.find(Device->ZeDevice); + if (It == Context->DeviceMemPools.end()) { + return UR_RESULT_ERROR_INVALID_VALUE; + } + hPoolInternal = It->second.get(); + } break; + case USMAllocType::Shared: { + auto It = Context->SharedMemPools.find(Device->ZeDevice); + if (It == Context->SharedMemPools.end()) { + return UR_RESULT_ERROR_INVALID_VALUE; + } + hPoolInternal = It->second.get(); + } break; + }; + + *RetMem = umfPoolAlignedMalloc(hPoolInternal, Size, Alignment); + if (*RetMem == nullptr) { + auto umfRet = umfPoolGetLastAllocationError(hPoolInternal); + return umf2urResult(umfRet); + } + + if (IndirectAccessTrackingEnabled) { + // Keep track of all memory allocations in the context + Context->MemAllocs.emplace(std::piecewise_construct, + std::forward_as_tuple(*RetMem), + std::forward_as_tuple(Context)); + } + + return UR_RESULT_SUCCESS; +} + +static ur_result_t enqueueUSMAllocHelper( + ur_queue_handle_t Queue, ur_usm_pool_handle_t Pool, const size_t Size, + const ur_exp_async_usm_alloc_properties_t *Properties, + uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, + void **RetMem, ur_event_handle_t *OutEvent, USMAllocType Type) { + std::ignore = Pool; + std::ignore = Properties; + + std::scoped_lock lock(Queue->Mutex); + + bool UseCopyEngine = false; + _ur_ze_event_list_t TmpWaitList; + UR_CALL(TmpWaitList.createAndRetainUrZeEventList( + NumEventsInWaitList, EventWaitList, Queue, 
UseCopyEngine)); + + // Get a new command list to be used on this call + ur_command_list_ptr_t CommandList{}; + UR_CALL(Queue->Context->getAvailableCommandList( + Queue, CommandList, UseCopyEngine, NumEventsInWaitList, EventWaitList)); + + ze_event_handle_t ZeEvent = nullptr; + ur_event_handle_t InternalEvent{}; + bool IsInternal = OutEvent == nullptr; + ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent; + + ur_command_t CommandType = UR_COMMAND_FORCE_UINT32; + switch (Type) { + case USMAllocType::Host: + CommandType = UR_COMMAND_ENQUEUE_USM_HOST_ALLOC_EXP; + break; + case USMAllocType::Device: + CommandType = UR_COMMAND_ENQUEUE_USM_DEVICE_ALLOC_EXP; + break; + case USMAllocType::Shared: + CommandType = UR_COMMAND_ENQUEUE_USM_SHARED_ALLOC_EXP; + break; + } + UR_CALL(createEventAndAssociateQueue(Queue, Event, CommandType, CommandList, + IsInternal, false)); + ZeEvent = (*Event)->ZeEvent; + (*Event)->WaitList = TmpWaitList; + + // Allocate USM memory + auto Ret = USMAllocHelper(Queue->Context, Queue->Device, Size, RetMem, Type); + if (Ret) { + return Ret; + } + + // Signal that USM allocation event was finished + ZE2UR_CALL(zeCommandListAppendSignalEvent, (CommandList->first, ZeEvent)); + + UR_CALL(Queue->executeCommandList(CommandList, false)); + + return UR_RESULT_SUCCESS; +} + +ur_result_t urEnqueueUSMDeviceAllocExp( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + ur_usm_pool_handle_t Pool, ///< [in][optional] USM pool descriptor + const size_t Size, ///< [in] minimum size in bytes of the USM memory object + ///< to be allocated + const ur_exp_async_usm_alloc_properties_t + *Properties, ///< [in][optional] pointer to the enqueue async alloc + ///< properties + uint32_t NumEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t + *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before the kernel execution. 
If nullptr, the + ///< numEventsInWaitList must be 0, indicating no wait + ///< events. + void **Mem, ///< [out] pointer to USM memory object + ur_event_handle_t *OutEvent ///< [out][optional] return an event object that + ///< identifies the async alloc +) { + return enqueueUSMAllocHelper(Queue, Pool, Size, Properties, + NumEventsInWaitList, EventWaitList, Mem, + OutEvent, USMAllocType::Device); +} + +ur_result_t urEnqueueUSMSharedAllocExp( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + ur_usm_pool_handle_t Pool, ///< [in][optional] USM pool descriptor + const size_t Size, ///< [in] minimum size in bytes of the USM memory object + ///< to be allocated + const ur_exp_async_usm_alloc_properties_t + *Properties, ///< [in][optional] pointer to the enqueue async alloc + ///< properties + uint32_t NumEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t + *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before the kernel execution. If nullptr, the + ///< numEventsInWaitList must be 0, indicating no wait + ///< events. 
+ void **Mem, ///< [out] pointer to USM memory object + ur_event_handle_t *OutEvent ///< [out][optional] return an event object that + ///< identifies the async alloc +) { + return enqueueUSMAllocHelper(Queue, Pool, Size, Properties, + NumEventsInWaitList, EventWaitList, Mem, + OutEvent, USMAllocType::Shared); +} + +ur_result_t urEnqueueUSMHostAllocExp( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + ur_usm_pool_handle_t Pool, ///< [in][optional] USM pool descriptor + const size_t Size, ///< [in] minimum size in bytes of the USM memory object + ///< to be allocated + const ur_exp_async_usm_alloc_properties_t + *Properties, ///< [in][optional] pointer to the enqueue async alloc + ///< properties + uint32_t NumEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t + *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before the kernel execution. If nullptr, the + ///< numEventsInWaitList must be 0, indicating no wait + ///< events. + void **Mem, ///< [out] pointer to USM memory object + ur_event_handle_t *OutEvent ///< [out][optional] return an event object that + ///< identifies the async alloc +) { + return enqueueUSMAllocHelper(Queue, Pool, Size, Properties, + NumEventsInWaitList, EventWaitList, Mem, + OutEvent, USMAllocType::Host); +} + +ur_result_t urEnqueueUSMFreeExp( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + ur_usm_pool_handle_t Pool, ///< [in][optional] USM pool descriptor + void *Mem, ///< [in] pointer to USM memory object + uint32_t NumEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t + *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before the kernel execution. If nullptr, the + ///< numEventsInWaitList must be 0, indicating no wait + ///< events. 
+ ur_event_handle_t *OutEvent ///< [out][optional] return an event object that + ///< identifies the async alloc +) { + std::ignore = Pool; + + std::scoped_lock lock(Queue->Mutex); + + bool UseCopyEngine = false; + _ur_ze_event_list_t TmpWaitList; + UR_CALL(TmpWaitList.createAndRetainUrZeEventList( + NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)); + + // Get a new command list to be used on this call + ur_command_list_ptr_t CommandList{}; + UR_CALL(Queue->Context->getAvailableCommandList( + Queue, CommandList, UseCopyEngine, NumEventsInWaitList, EventWaitList)); + + ze_event_handle_t ZeEvent = nullptr; + ur_event_handle_t InternalEvent{}; + bool IsInternal = OutEvent == nullptr; + ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent; + + UR_CALL(createEventAndAssociateQueue(Queue, Event, + UR_COMMAND_ENQUEUE_USM_FREE_EXP, + CommandList, IsInternal, false)); + ZeEvent = (*Event)->ZeEvent; + (*Event)->WaitList = TmpWaitList; + + const auto &ZeCommandList = CommandList->first; + const auto &WaitList = (*Event)->WaitList; + if (WaitList.Length) { + ZE2UR_CALL(zeCommandListAppendWaitOnEvents, + (ZeCommandList, WaitList.Length, WaitList.ZeEventList)); + } + + // Wait for commands execution until USM can be freed + UR_CALL(Queue->executeCommandList(CommandList, true)); // Blocking + + // Free USM memory + auto Ret = USMFreeHelper(Queue->Context, Mem); + if (Ret) { + return Ret; + } + + // Signal that USM free event was finished + ZE2UR_CALL(zeCommandListAppendSignalEvent, (ZeCommandList, ZeEvent)); + + UR_CALL(Queue->executeCommandList(CommandList, false)); + + return UR_RESULT_SUCCESS; +} } // namespace ur::level_zero static ur_result_t USMFreeImpl(ur_context_handle_t Context, void *Ptr) { diff --git a/test/adapters/level_zero/CMakeLists.txt b/test/adapters/level_zero/CMakeLists.txt index bfb02d37c2..b0df0ec602 100644 --- a/test/adapters/level_zero/CMakeLists.txt +++ b/test/adapters/level_zero/CMakeLists.txt @@ -84,6 +84,14 @@ 
if(UR_BUILD_ADAPTER_L0) target_link_libraries(test-adapter-level_zero_ipc PRIVATE ur_umf ) + + add_adapter_test(level_zero_enqueue_alloc + FIXTURE DEVICES + SOURCES + enqueueAlloc.cpp + ENVIRONMENT + "UR_ADAPTERS_FORCE_LOAD=\"$<TARGET_FILE:ur_adapter_level_zero>\"" + ) endif() if(UR_BUILD_ADAPTER_L0_V2) diff --git a/test/adapters/level_zero/enqueueAlloc.cpp b/test/adapters/level_zero/enqueueAlloc.cpp new file mode 100644 index 0000000000..fdf05a07d7 --- /dev/null +++ b/test/adapters/level_zero/enqueueAlloc.cpp @@ -0,0 +1,34 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include <uur/fixtures.h> + +using urL0EnqueueAllocTest = uur::urQueueTest; +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urL0EnqueueAllocTest); + +TEST_P(urL0EnqueueAllocTest, SuccessHostAlloc) { + ur_device_usm_access_capability_flags_t hostUSMSupport = 0; + ASSERT_SUCCESS(uur::GetDeviceUSMHostSupport(device, hostUSMSupport)); + if (!hostUSMSupport) { + GTEST_SKIP() << "Host USM is not supported."; + } + + void *Ptr = nullptr; + size_t allocSize = sizeof(int); + ur_event_handle_t AllocEvent = nullptr; + ASSERT_SUCCESS(urEnqueueUSMHostAllocExp(queue, nullptr, allocSize, nullptr, + 0, nullptr, &Ptr, &AllocEvent)); + ASSERT_SUCCESS(urQueueFinish(queue)); + ASSERT_NE(Ptr, nullptr); + ASSERT_NE(AllocEvent, nullptr); + + *(int *)Ptr = 0xC0FFEE; + + ur_event_handle_t FreeEvent = nullptr; + ASSERT_SUCCESS( + urEnqueueUSMFreeExp(queue, nullptr, Ptr, 0, nullptr, &FreeEvent)); + ASSERT_SUCCESS(urQueueFinish(queue)); + ASSERT_NE(FreeEvent, nullptr); +}